陳鍾誠的程式作品 -- 機器翻譯範例集

程式作品

C 語言

Java

C#

JavaScript

常用函數

文字處理

遊戲程式

衛星定位

系統程式

資料結構

網路程式

自然語言

人工智慧

機率統計

資訊安全

等待完成

訊息

相關網站

參考文獻

最新修改

簡體版

English

檔案:MT1.java

import java.util.*;
 
/**
 * Smallest machine-translation example: replaces each word of a sentence by
 * its dictionary entry, one word at a time.
 */
public class MT1 {
    /**
     * Demo: builds a dictionary through {@code STR.text2map} and prints the
     * translation of a sample sentence.
     */
    public static void main(String[] args) {
        TreeMap map = STR.text2map("book=書\nis=是\nthis=這\na=一\n");
        System.out.println(MT1.translate("this is a excellent book", map));
    }

    /**
     * Translates {@code pText} word by word.
     *
     * The text is split on runs of non-word characters ({@code \W+}), so
     * punctuation and spacing of the input are not preserved. Every word is
     * looked up in {@code map} exactly as written (the lookup is case
     * sensitive); a word without an entry is copied unchanged.
     *
     * @param pText text to translate
     * @param map   dictionary; values are cast to {@code String}
     * @return the translated words, each one followed by a single space
     */
    public static String translate(String pText, Map map) {
        StringBuffer rzText = new StringBuffer();
        String[] words = pText.split("\\W+");
        for (int i = 0; i < words.length; i++) {
            String word = words[i];
            // split() produces an empty first element when the text starts with
            // a non-word character (or is empty); skip it so that no stray
            // blank is written to the output.
            if (word.length() == 0) {
                continue;
            }
            String toWord = (String) map.get(word);
            if (toWord == null) {
                toWord = word; // unknown word: keep the original
            }
            rzText.append(toWord).append(' ');
        }
        return rzText.toString();
    }
}

檔案:MT2.java (註:此檔中的公開類別名稱為 MT,若要編譯,檔名須為 MT.java)

package ccc;
import java.util.*;
import java.util.regex.*;
 
/**
 * Dictionary-based translator that keeps the punctuation and layout of the
 * input, strips a few English suffixes before the dictionary lookup, and can
 * translate the text portions of an HTML/XML document.
 */
public class MT {
    /** One run of word characters; compiled once instead of on every call. */
    private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

    /**
     * Demo: loads the dictionary MT\MT.dic, prints the line-by-line translation
     * of MT\MT.txt, then translates MT\MT.htm, prints the result and writes it
     * to MT\MT_C.htm.
     */
    public static void main(String[] args) throws Exception {
        String dictionary = STR.file2text("MT\\MT.dic");
        TreeMap map  = STR.text2map(dictionary);
        String  text = STR.file2text("MT\\MT.txt");
        String  tText= MT.translateText(text, map);
        System.out.println(tText);
        String html  = STR.file2text("MT\\MT.htm");
        String tHtml = MT.translateXml(html, map);
        System.out.println(tHtml);
        STR.text2file(tHtml, "MT\\MT_C.htm");
    }

    /**
     * Produces a bilingual listing: every input line is written unchanged,
     * followed by a line holding "->" and its translation.
     *
     * @param pText text to translate; lines are separated by '\n'
     * @param pMap  dictionary passed on to {@link #translate}
     * @return the interleaved original and translated lines
     */
    public static String translateText(String pText, Map pMap) {
        StringBuffer rzText = new StringBuffer();
        String[] lines = pText.split("\n");
        for (int i = 0; i < lines.length; i++) {
            rzText.append(lines[i]).append("\n");
            rzText.append("->").append(translate(lines[i], pMap)).append("\n");
        }
        return rzText.toString();
    }

    /**
     * Translates every word ({@code \w+}) of {@code pText} and copies all
     * characters between the words unchanged.
     *
     * The text is first passed through {@link #normalizeText}. A word for
     * which {@link #translateWord} returns null is kept as it is.
     *
     * @param pText text to translate
     * @param pMap  dictionary used by {@link #translateWord}
     * @return the translated text
     */
    public static String translate(String pText, Map pMap) {
        pText = normalizeText(pText);
        StringBuffer rzText = new StringBuffer();
        Matcher m = WORD_PATTERN.matcher(pText);
        int lastIdx = 0; // end of the previous word
        while (m.find()) {
            String word = m.group();
            String toWord = translateWord(word, pMap);
            if (toWord == null) toWord = word;
            // text between the previous word and this one is copied verbatim
            rzText.append(pText.substring(lastIdx, m.start())).append(toWord);
            lastIdx = m.end();
        }
        rzText.append(pText.substring(lastIdx));
        return rzText.toString();
    }

    /**
     * Applies the rewrite rules below through {@code STR.expand}.
     *
     * The rules expand contractions and join some multi-word terms with '_'.
     * NOTE(review): the rule string looks like '|'-separated "from=to" pairs;
     * the exact semantics are defined by STR.expand, which is not visible here.
     */
    public static String normalizeText(String pText) {
        String rules = "n't = not |'re = are |'m = am |'ve = have |home page=home_page|web browser=browser|source code=source_code|on-line=on_line|world wide web=world_wide_web|much more=much_more";
        return STR.expand(pText,rules);
    }

    /**
     * Looks a single word up in the dictionary, trying the word itself first
     * and then the forms obtained by rewriting one of several suffixes.
     *
     * @param pWord word to translate
     * @param pMap  dictionary; values are cast to {@code String}
     * @return the entry of the first form found in {@code pMap}; {@code pWord}
     *         itself when that entry is blank; null when no form is found
     */
    public static String translateWord(String pWord, Map pMap) {
        // A trailing blank marks the end of the word, so that a macro such as
        // "s =" can only apply to the end of the word.
        String tWord = pWord.toLowerCase()+" ";
        String[] macros = {" = ", "s =", "es =", "ing =", "ed =", "d =", "ly =", "ies =y", "al ="};
        for (int i = 0; i < macros.length; i++) {
            String suffix = STR.head(macros[i], "=");
            String toSuffix= STR.tail(macros[i], "=");
            String word = STR.replace(tWord, suffix, toSuffix).trim();
            String toWord = (String) pMap.get(word);
            if (toWord == null)
                continue; // this form is not in the dictionary, try the next macro
            // A blank entry means the word is listed but must stay untranslated.
            if (toWord.trim().length()==0) return pWord;
            return toWord;
        }
        return null; // no form of the word is in the dictionary
    }

    /**
     * Translates the text of a mark-up document while copying the tags.
     *
     * Every stretch that follows a '>' up to the next '<' (or the end of the
     * document) is sent through {@link #translate}; all other characters,
     * including anything before the first '>', are copied unchanged.
     *
     * @param pXml mark-up text
     * @param pMap dictionary passed on to {@link #translate}
     * @return the document with its text portions translated
     */
    public static String translateXml(String pXml, Map pMap) {
        StringBuffer rzXml = new StringBuffer();
        int len = pXml.length();
        for (int i = 0; i < len;) {
            if (pXml.charAt(i) == '>') {
                // Search from i directly instead of copying the rest of the
                // document with substring(i) at every tag.
                int next = pXml.indexOf('<', i);
                if (next < 0) next = len; // no further tag: text runs to the end
                rzXml.append('>').append(translate(pXml.substring(i+1, next), pMap));
                i = next;
            } else
                rzXml.append(pXml.charAt(i++));
        }
        return rzXml.toString();
    }
}

檔案:MT3.java

package ccc;
import java.util.*;
 
/**
 * Pattern-driven translator for HTML and plain text: the input is consumed
 * token by token, mark-up and URLs are copied through, and phrases of three,
 * two and one word are looked up in a dictionary.
 */
public class MT3 {
    /**
     * Translates each file listed below from MT\Test, writing the result next
     * to it with "_E" in the file name replaced by "_C".
     */
    public static void main(String[] args) throws Exception {
        NET.setProxy("proxy.internal", "3128");
        String dictionary = STR.file2text("MT\\MT.dic").toLowerCase();
        TreeMap map  = UTIL.text2map(dictionary);
        String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm", "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
        for (int i = 0; i < files.length; i++) {
            String html  = STR.file2text("MT\\Test\\"+files[i]);            // translate html file
            String tHtml = MT3.translate(html, map);
            STR.text2file(tHtml, "MT\\Test\\"+STR.replace(files[i], "_E", "_C"));
        }
    }

    /**
     * Token patterns, tried in this order at every position of the text.
     * Indexes PHRASE..WORD are translated; all others are copied unchanged.
     */
    static final String[] patterns={ /*0. comment*/ "<!--.*?-->", /*1.script*/ "<script.+?</script>", /*2.style*/ "<style.+?</style>", /*3.mark-up*/ "<.+?>", /*4.url*/"http://.{10,30}/?", /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]", /*6.phrase=three words*/"\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+", /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+", /*8.word*/"\\p{Alpha}+", /*9.others*/"." };
    /** First and last index in {@link #patterns} of the translatable tokens. */
    static final int PHRASE = 6, WORD=8;
    /**
     * Rewrite rules handed to STR.expand before tokenizing.
     * Fixed here: "Dec." was mapped to 十月 (October) and "'d " lost the blank
     * in front of "would", gluing it to the preceding word.
     * NOTE(review): looks like '|'-separated "from=to" pairs; confirm against STR.expand.
     */
    static final String normalizeMacros=" \n= |-\n=|n't = not |'re = are |'m = am |'ve = have |'d = would |can't =can not |Intern'l =International |U.S.=United States |U.K.=United Kingdom |Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

    /**
     * Translates {@code pText} token by token.
     *
     * At each position the patterns are tried in order. A token matched by a
     * non-translatable pattern is copied. For phrase and word patterns the
     * token (with '-' replaced by a blank) is looked up; when a phrase is not
     * in the dictionary the next shorter pattern is tried, and a single word
     * that is not found is copied.
     *
     * @param pText text or HTML to translate
     * @param pMap  dictionary used by {@link #translateWord}
     * @return the translated text
     */
    public static String translate(String pText, Map pMap) {
        pText = STR.expand(pText, normalizeMacros); // normalize text
        StringBuffer rzText = new StringBuffer();
        for (int i = 0; i < pText.length(); ) {
            String token=null, toToken=null;
            for (int pi = 0; pi < patterns.length; pi++) {
                token = REGEX.matchAt(pText, i, patterns[pi]);
                if (token == null) continue;
                if (pi<PHRASE || pi>WORD) break;
                // the following code for PHRASE and WORD only.
                token = STR.replace(token, "-", " ");
                toToken = translateWord(token, pMap);
                if (toToken != null) break;
                if (pi==WORD) break;
            }
            if (token == null) {
                // Nothing matched here (for instance "." does not match a line
                // terminator unless matchAt enables DOTALL). Copy the character
                // instead of appending "null" and failing on token.length().
                rzText.append(pText.charAt(i));
                i++;
                continue;
            }
            if (toToken == null || toToken.length()==0)
                toToken = token; // untranslated, or blank entry: keep the source token
            rzText.append(toToken);
            i+=token.length();
        }
        return rzText.toString();
    }

    /**
     * Suffix rewrites tried in order; ';' marks the end of the word.
     * Fixed here: "est;=e;" left the ';' on the stem so it could never be
     * found, and "ies;=y " was listed twice.
     */
    static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ", "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

    /**
     * Looks up a word or phrase, trying each suffix rewrite in turn.
     *
     * @param pWord token to translate; may be null
     * @param pMap  dictionary; values are cast to {@code String}
     * @return the first entry found, with '_' turned into blanks and trimmed,
     *         or null when {@code pWord} is null or no form is in the dictionary
     */
    public static String translateWord(String pWord, Map pMap) {
        if (pWord == null) return null;
        String extWord = pWord.toLowerCase()+";";
        for (int i = 0; i < suffixMacros.length; i++) {
            String suffix = STR.head(suffixMacros[i], "=");
            String toSuffix= STR.tail(suffixMacros[i], "=");
            String word = STR.replace(extWord, suffix, toSuffix).trim();
            String toWord = (String) pMap.get(word);
            if (toWord != null)
                return toWord.replace('_', ' ').trim();
        }
        return null;
    }
}

檔案:MT4.java

package ccc;
import java.util.*;
 
/**
 * Variant of the pattern-driven translator whose dictionary is read from a
 * CSV file and whose entries may hold several alternatives separated by ';'.
 */
public class MT4 {
    /**
     * Builds the dictionary from MT\MT.csv, then translates each file listed
     * below from MT\Test, writing the result next to it with "_E" in the file
     * name replaced by "_C".
     */
    public static void main(String[] args) throws Exception {
        NET.setProxy("proxy.internal", "3128");
        String dicText = STR.file2text("MT\\MT.csv").toLowerCase();
        // Rewrites each CSV line to "key=value" before it is parsed into a map.
        // NOTE(review): exact effect depends on REGEX.transform — confirm.
        dicText = REGEX.transform(dicText, "(.*?)=(.*?),.*?\\n", "(?1)=(?2)\n");
        TreeMap map  = UTIL.text2map(dicText);
        String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm", "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
        for (int i = 0; i < files.length; i++) {
            System.out.println("translate : "+files[i]);
            String html  = STR.file2text("MT\\Test\\"+files[i]);            // translate html file
            String tHtml = MT4.translate(html, map);
            STR.text2file(tHtml, "MT\\Test\\"+STR.replace(files[i], "_E", "_C"));
        }
    }

    /**
     * Token patterns, tried in this order at every position of the text.
     * Indexes PHRASE..WORD are translated; all others are copied unchanged.
     */
    static final String[] patterns={ /*0. comment*/ "<!--.*?-->", /*1.script*/ "<script.+?</script>", /*2.style*/ "<style.+?</style>", /*3.mark-up*/ "<.+?>", /*4.url*/"http://.{10,30}/?", /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]", /*6.phrase=three words*/"\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+", /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+", /*8.word*/"\\p{Alpha}+", /*9.others*/"." };
    /** First and last index in {@link #patterns} of the translatable tokens. */
    static final int PHRASE = 6, WORD=8;
    /**
     * Rewrite rules handed to STR.expand before tokenizing.
     * Fixed here: "Dec." was mapped to 十月 (October) and "'d " lost the blank
     * in front of "would", gluing it to the preceding word.
     * NOTE(review): looks like '|'-separated "from=to" pairs; confirm against STR.expand.
     */
    static final String normalizeMacros=" \n= |-\n=|n't = not |'re = are |'m = am |'ve = have |'d = would |can't =can not |Intern'l =International |U.S.=United States |U.K.=United Kingdom |Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

    /**
     * Translates {@code pText} token by token.
     *
     * At each position the patterns are tried in order. A token matched by a
     * non-translatable pattern is copied. For phrase and word patterns the
     * token (with '-' replaced by a blank) is looked up; when a phrase is not
     * in the dictionary the next shorter pattern is tried, and a single word
     * that is not found is copied.
     *
     * @param pText text or HTML to translate
     * @param pMap  dictionary used by {@link #translateWord}
     * @return the translated text
     */
    public static String translate(String pText, Map pMap) {
        pText = STR.expand(pText, normalizeMacros); // normalize text
        StringBuffer rzText = new StringBuffer();
        for (int i = 0; i < pText.length(); ) {
            String token=null, toToken=null;
            for (int pi = 0; pi < patterns.length; pi++) {
                token = REGEX.matchAt(pText, i, patterns[pi]);
                if (token == null) continue;
                if (pi<PHRASE || pi>WORD) break;
                // the following code for PHRASE and WORD only.
                token = STR.replace(token, "-", " ");
                toToken = translateWord(token, pMap);
                if (toToken != null) break;
                if (pi==WORD) break;
            }
            if (token == null) {
                // Nothing matched here (for instance "." does not match a line
                // terminator unless matchAt enables DOTALL). Copy the character
                // instead of appending "null" and failing on token.length().
                rzText.append(pText.charAt(i));
                i++;
                continue;
            }
            if (toToken == null || toToken.length()==0)
                toToken = token; // untranslated, or blank entry: keep the source token
            rzText.append(toToken);
            i+=token.length();
        }
        return rzText.toString();
    }

    /**
     * Suffix rewrites tried in order; ';' marks the end of the word.
     * Fixed here: "est;=e;" left the ';' on the stem so it could never be
     * found, and "ies;=y " was listed twice.
     */
    static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ", "ning;= ", "ming;= ", "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

    /**
     * Looks up a word or phrase, trying each suffix rewrite in turn.
     *
     * @param pWord token to translate; may be null
     * @param pMap  dictionary; values are cast to {@code String}
     * @return the part of the first entry found that precedes ';' (as returned
     *         by STR.head), trimmed; null when {@code pWord} is null or no form
     *         is in the dictionary
     */
    public static String translateWord(String pWord, Map pMap) {
        if (pWord == null) return null;
        String extWord = pWord.toLowerCase()+";";
        for (int i = 0; i < suffixMacros.length; i++) {
            String suffix = STR.head(suffixMacros[i], "=");
            String toSuffix= STR.tail(suffixMacros[i], "=");
            String word = STR.replace(extWord, suffix, toSuffix).trim();
            String toWord = (String) pMap.get(word);
            if (toWord != null)
                return STR.head(toWord, ";").trim();
        }
        return null;
    }
}

檔案:MT5.java

package ccc;
import java.util.*;
import java.io.*;
import java.util.regex.*;
 
/*        Table table = new Table(), eFreqTable=new Table(), cFreqTable=new Table();
        table.load("類,日,英,中,聲", "MT\\J_E_C.txt", "UTF8");
        eFreqTable.load("id,英,頻", "MT\\E_Freq.txt", "ISO8859_1");
        cFreqTable.load("id,英,頻,比", "MT\\C_Freq.txt", "ISO8859_1"); */
 
/**
 * Variant of the pattern-driven translator that looks words up in a
 * multi-column {@code Table} instead of a map.
 */
public class MT5 extends TreeMap {
    /**
     * Translates each file listed below from MT\Test, writing the result next
     * to it with "_E" in the file name replaced by "_C".
     */
    public static void main(String[] args) throws Exception {
        // The original referenced an undeclared variable 'table'. The loader is
        // restored from the commented-out code that precedes this class.
        // NOTE(review): confirm the Table API and the data file before relying on it.
        Table table = new Table();
        table.load("類,日,英,中,聲", "MT\\J_E_C.txt", "UTF8");
        String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm", "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
        for (int i = 0; i < files.length; i++) {
            System.out.println("Translate file :"+files[i]);
            String html  = STR.file2text("MT\\Test\\"+files[i]);            // translate html file
            // NOTE(review): the from/to field names are empty in the original;
            // they presumably should name two columns of the table — confirm.
            String tHtml = MT5.translate(html, table, "", "");
            STR.text2file(tHtml, "MT\\Test\\"+STR.replace(files[i], "_E", "_C"));
        }
    }

    /**
     * Token patterns, tried in this order at every position of the text.
     * Indexes PHRASE..WORD are translated; all others are copied unchanged.
     */
    static final String[] patterns={ /*0. comment*/ "<!--.*?-->", /*1.script*/ "<script.+?</script>", /*2.style*/ "<style.+?</style>", /*3.mark-up*/ "<.+?>", /*4.url*/"http://.{10,30}/?", /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]", /*6.phrase=three words*/"\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+", /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+", /*8.word*/"\\p{Alpha}+", /*9.others*/"." };
    /** First and last index in {@link #patterns} of the translatable tokens. */
    static final int PHRASE = 6, WORD=8;
    /**
     * Rewrite rules handed to STR.expand before tokenizing.
     * Fixed here: "Dec." was mapped to 十月 (October) and "'d " lost the blank
     * in front of "would", gluing it to the preceding word.
     * NOTE(review): looks like '|'-separated "from=to" pairs; confirm against STR.expand.
     */
    static final String normalizeMacros=" \n= |-\n=|n't = not |'re = are |'m = am |'ve = have |'d = would |can't =can not |Intern'l =International |U.S.=United States |U.K.=United Kingdom |Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

    /**
     * Translates {@code pText} token by token.
     *
     * At each position the patterns are tried in order. A token matched by a
     * non-translatable pattern is copied. For phrase and word patterns the
     * token (with '-' replaced by a blank) is looked up; when a phrase is not
     * in the table the next shorter pattern is tried, and a single word that
     * is not found is copied.
     *
     * @param pText      text or HTML to translate
     * @param pTable     table searched by {@link #translateWord}
     * @param pFromField name of the field that is searched
     * @param pToField   name of the field whose value is returned
     * @return the translated text
     * @throws Exception propagated from {@link #translateWord}
     */
    public static String translate(String pText, Table pTable, String pFromField, String pToField) throws Exception {
        pText = STR.expand(pText, normalizeMacros); // normalize text
        StringBuffer rzText = new StringBuffer();
        for (int i = 0; i < pText.length(); ) {
            String token=null, toToken=null;
            for (int pi = 0; pi < patterns.length; pi++) {
                token = STR.matchAt(pText, i, patterns[pi]);
                if (token == null) continue;
                if (pi<PHRASE || pi>WORD) break;
                // the following code for PHRASE and WORD only.
                token = STR.replace(token, "-", " ");
                toToken = translateWord(token, pTable, pFromField, pToField);
                if (toToken != null) break;
                if (pi==WORD) break;
            }
            if (token == null) {
                // Nothing matched here (for instance "." does not match a line
                // terminator unless matchAt enables DOTALL). Copy the character
                // instead of appending "null" and failing on token.length().
                rzText.append(pText.charAt(i));
                i++;
                continue;
            }
            if (toToken == null || toToken.length()==0)
                toToken = token; // untranslated, or blank entry: keep the source token
            rzText.append(toToken);
            i+=token.length();
        }
        return rzText.toString();
    }

    /**
     * Suffix rewrites tried in order; ';' marks the end of the word.
     * Fixed here: "est;=e;" left the ';' on the stem so it could never be
     * found, and "ies;=y " was listed twice.
     */
    static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ", "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

    /**
     * Looks up a word or phrase in the table, trying each suffix rewrite in turn.
     *
     * @param pWord      token to translate; may be null
     * @param pTable     table to search
     * @param pFromField name of the field compared with the word
     * @param pToField   name of the field whose value is returned
     * @return the trimmed {@code pToField} value of the first record found, or
     *         null when {@code pWord} is null or no form is in the table
     * @throws Exception declared because the Table calls may throw
     */
    public static String translateWord(String pWord, Table pTable, String pFromField, String pToField) throws Exception {
        // Resolved before the null check, exactly as in the original.
        int toFieldIdx = pTable.field2idx(pToField);
        if (pWord == null) return null;
        String extWord = pWord.toLowerCase()+";";
        for (int i = 0; i < suffixMacros.length; i++) {
            String suffix = STR.head(suffixMacros[i], "=");
            String toSuffix= STR.tail(suffixMacros[i], "=");
            String word = STR.replace(extWord, suffix, toSuffix).trim();
            Record toRecord = pTable.find(pFromField, word);
            if (toRecord != null)
                return toRecord.get(toFieldIdx).trim();
        }
        return null;
    }
}

Facebook

Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License