陳鍾誠的程式作品 -- 機器翻譯範例集
程式作品 | C 語言 | Java | C# | JavaScript | 常用函數 | 文字處理 | 遊戲程式 | 衛星定位 | 系統程式 | 資料結構 | 網路程式 | 自然語言 | 人工智慧 | 機率統計 | 資訊安全 | 等待完成 | 訊息 | 相關網站 | 參考文獻 | 最新修改 | 簡體版 | English
// ===================================================================
// File: MT1.java
// ===================================================================
import java.util.*;

/**
 * Smallest version of the dictionary translator: splits the input on
 * non-word characters and replaces every word that has a dictionary entry.
 */
public class MT1 {
  public static void main(String[] args) {
    TreeMap map = STR.text2map("book=書\nis=是\nthis=這\na=一\n");
    System.out.println(MT1.translate("this is a excellent book", map));
  }

  /**
   * Translates {@code pText} word by word.
   *
   * @param pText text to translate
   * @param map   dictionary from source word to translated word
   * @return the translated words, each followed by a single space; words
   *         without a dictionary entry are copied unchanged
   */
  public static String translate(String pText, Map map) {
    StringBuffer rzText = new StringBuffer();
    String[] words = pText.split("\\W+");
    for (int i = 0; i < words.length; i++) {
      String toWord = (String) map.get(words[i]);
      if (toWord == null) {
        toWord = words[i]; // no entry: keep the source word
      }
      rzText.append(toWord).append(" ");
    }
    return rzText.toString();
  }
}

// ===================================================================
// File: MT2.java
// NOTE(review): the listing is labelled MT2.java but declares
// "public class MT"; confirm the real file name before compiling.
// ===================================================================
package ccc;

import java.util.*;
import java.util.regex.*;

/**
 * Second version: keeps the text between words intact, strips a few English
 * suffixes before the dictionary lookup, and can translate the text parts of
 * an XML/HTML document.
 */
public class MT {
  /** Matches one run of word characters; compiled once and reused. */
  private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

  public static void main(String[] args) throws Exception {
    String dictionary = STR.file2text("MT\\MT.dic");
    TreeMap map = STR.text2map(dictionary);

    String text = STR.file2text("MT\\MT.txt");
    String tText = MT.translateText(text, map);
    System.out.println(tText);

    String html = STR.file2text("MT\\MT.htm");
    String tHtml = MT.translateXml(html, map);
    System.out.println(tHtml);
    STR.text2file(tHtml, "MT\\MT_C.htm");
  }

  /**
   * Produces a line-by-line listing: every source line is followed by a line
   * starting with {@code "->"} that holds its translation.
   *
   * @param pText source text, lines separated by {@code "\n"}
   * @param pMap  dictionary from source word to translated word
   * @return the interleaved source/translation listing
   */
  public static String translateText(String pText, Map pMap) {
    StringBuffer rzText = new StringBuffer();
    String[] lines = pText.split("\n");
    for (int i = 0; i < lines.length; i++) {
      rzText.append(lines[i]).append("\n");
      rzText.append("->").append(translate(lines[i], pMap)).append("\n");
    }
    return rzText.toString();
  }

  /**
   * Translates every word ({@code \w+}) of {@code pText} after normalizing
   * it with {@link #normalizeText(String)}. Text between words is copied
   * through unchanged.
   *
   * @param pText text to translate
   * @param pMap  dictionary from source word to translated word
   * @return the translated text
   */
  public static String translate(String pText, Map pMap) {
    pText = normalizeText(pText);
    StringBuffer rzText = new StringBuffer();
    Matcher m = WORD_PATTERN.matcher(pText);
    int lastIdx = 0;
    while (m.find()) {
      String prev = pText.substring(lastIdx, m.start()); // separators before this word
      String word = m.group();
      String toWord = translateWord(word, pMap);
      if (toWord == null) {
        toWord = word;
      }
      rzText.append(prev).append(toWord);
      lastIdx = m.end();
    }
    rzText.append(pText.substring(lastIdx)); // trailing separators
    return rzText.toString();
  }

  /**
   * Expands contractions and joins a few multi-word terms with underscores
   * by passing the rule list below to {@code STR.expand}.
   *
   * @param pText text to normalize
   * @return whatever {@code STR.expand} returns for the rule list
   */
  public static String normalizeText(String pText) {
    String rules = "n't = not |'re = are |'m = am |'ve = have |"
        + "home page=home_page|web browser=browser|source code=source_code|"
        + "on-line=on_line|world wide web=world_wide_web|much more=much_more";
    return STR.expand(pText, rules);
  }

  /**
   * Looks a single word up in the dictionary, trying the word itself first
   * and then the word with common English suffixes rewritten.
   *
   * @param pWord word to translate
   * @param pMap  dictionary from lower-case source word to translated word
   * @return the translation; {@code pWord} itself when the dictionary entry
   *         is blank; {@code null} when no variant of the word is found
   */
  public static String translateWord(String pWord, Map pMap) {
    // A trailing space marks the end of the word so the macros only have a
    // suffix form to match against.
    String tWord = pWord.toLowerCase() + " ";
    String[] macros = {" = ", "s =", "es =", "ing =", "ed =", "d =", "ly =", "ies =y", "al ="};
    for (int i = 0; i < macros.length; i++) {
      String suffix = STR.head(macros[i], "=");
      String toSuffix = STR.tail(macros[i], "=");
      String word = STR.replace(tWord, suffix, toSuffix).trim();
      String toWord = (String) pMap.get(word);
      if (toWord == null) {
        continue;
      }
      if (toWord.trim().length() == 0) {
        return pWord; // a=_ means we do not translate this word.
      }
      return toWord;
    }
    return null; // not defined: the caller keeps the source word
  }

  /**
   * Translates the text that follows each {@code '>'} up to the next
   * {@code '<'} (or the end of input); everything else is copied unchanged.
   *
   * @param pXml XML/HTML source
   * @param pMap dictionary from source word to translated word
   * @return the document with its text segments translated
   */
  public static String translateXml(String pXml, Map pMap) {
    StringBuffer rzXml = new StringBuffer();
    int i = 0;
    while (i < pXml.length()) {
      if (pXml.charAt(i) == '>') {
        // Search from i directly instead of building a substring per tag.
        int next = pXml.indexOf('<', i);
        if (next < 0) {
          next = pXml.length();
        }
        String text = pXml.substring(i + 1, next);
        rzXml.append(">").append(translate(text, pMap));
        i = next;
      } else {
        rzXml.append(pXml.charAt(i++));
      }
    }
    return rzXml.toString();
  }
}

// ===================================================================
// File: MT3.java
// ===================================================================
package ccc;

import java.util.*;

/**
 * Third version: tokenizes the input with an ordered list of regular
 * expressions so that markup, scripts and URLs are skipped and two- and
 * three-word phrases are looked up before single words.
 */
public class MT3 {
  public static void main(String[] args) throws Exception {
    NET.setProxy("proxy.internal", "3128");
    String dictionary = STR.file2text("MT\\MT.dic").toLowerCase();
    TreeMap map = UTIL.text2map(dictionary);
    String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm",
                      "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
    for (int i = 0; i < files.length; i++) {
      String html = STR.file2text("MT\\Test\\" + files[i]);
      // translate html file
      String tHtml = MT3.translate(html, map);
      STR.text2file(tHtml, "MT\\Test\\" + STR.replace(files[i], "_E", "_C"));
    }
  }

  /** Token patterns, tried in this order at every input position. */
  static final String[] patterns = {
    /*0.comment*/ "<!--.*?-->",
    /*1.script*/ "<script.+?</script>",
    /*2.style*/ "<style.+?</style>",
    /*3.mark-up*/ "<.+?>",
    /*4.url*/ "http://.{10,30}/?",
    /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]",
    /*6.phrase=three words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*8.word*/ "\\p{Alpha}+",
    /*9.others*/ "."
  };

  /** Index range in {@link #patterns} whose tokens go through the dictionary. */
  static final int PHRASE = 6, WORD = 8;

  /**
   * Rewrite rules handed to {@code STR.expand} before tokenizing.
   * "Dec." is mapped to 十二月 (December); the earlier listing had 十月,
   * which is October.
   */
  static final String normalizeMacros = " \n= |-\n=|"
      + "n't = not |'re = are |'m = am |'ve = have |'d =would |can't =can not |"
      + "Intern'l =International |U.S.=United States |U.K.=United Kingdom |"
      + "Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

  /**
   * Translates {@code pText}. At each position the patterns are tried in
   * order: tokens matched by patterns before {@link #PHRASE} or after
   * {@link #WORD} are copied unchanged; phrase and word tokens are looked up
   * with {@link #translateWord(String, Map)}, falling back from three words
   * to two words to one word, and finally to the source token.
   *
   * @param pText text (plain or HTML) to translate
   * @param pMap  dictionary from lower-case source word/phrase to translation
   * @return the translated text
   */
  public static String translate(String pText, Map pMap) {
    pText = STR.expand(pText, normalizeMacros); // normalize text
    StringBuffer rzText = new StringBuffer();
    for (int i = 0; i < pText.length(); ) {
      String token = null, toToken = null;
      for (int pi = 0; pi < patterns.length; pi++) {
        token = REGEX.matchAt(pText, i, patterns[pi]);
        if (token == null) continue;
        if (pi < PHRASE || pi > WORD) break; // the following code for PHRASE and WORD only.
        token = STR.replace(token, "-", " ");
        toToken = translateWord(token, pMap);
        if (toToken != null) break;
        if (pi == WORD) break;
      }
      if (token == null) {
        // No pattern matched at this position. "." does not match a line
        // terminator by default, so this presumably happens on newlines
        // (depends on how REGEX.matchAt compiles the pattern - confirm).
        // Copy the character through instead of failing on token.length().
        rzText.append(pText.charAt(i));
        i++;
        continue;
      }
      if (toToken == null || toToken.length() == 0) toToken = token;
      rzText.append(toToken);
      i += token.length();
    }
    return rzText.toString();
  }

  /**
   * Suffix rewrite rules in the form {@code "<suffix>;=<replacement>"}; the
   * ';' stands for the end-of-word marker added in
   * {@link #translateWord(String, Map)}. The "est" rule used to read
   * "est;=e;", which left the marker in the lookup key, and "ies;=y " was
   * listed twice.
   */
  static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ",
      "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

  /**
   * Looks a word or phrase up in the dictionary, trying each suffix rule in
   * turn until one of the rewritten forms has an entry.
   *
   * @param pWord word or phrase to translate; may be {@code null}
   * @param pMap  dictionary from lower-case source word/phrase to translation
   * @return the entry with '_' replaced by ' ' and trimmed, or {@code null}
   *         when {@code pWord} is {@code null} or nothing matches
   */
  public static String translateWord(String pWord, Map pMap) {
    if (pWord == null) return null;
    String extWord = pWord.toLowerCase() + ";"; // ';' marks the end of the word
    for (int i = 0; i < suffixMacros.length; i++) {
      String suffix = STR.head(suffixMacros[i], "=");
      String toSuffix = STR.tail(suffixMacros[i], "=");
      String word = STR.replace(extWord, suffix, toSuffix).trim();
      String toWord = (String) pMap.get(word);
      if (toWord != null) {
        return toWord.replace('_', ' ').trim();
      }
    }
    return null;
  }
}

// ===================================================================
// File: MT4.java
// ===================================================================
package ccc;

import java.util.*;

/**
 * Fourth version: same tokenizer as MT3, but the dictionary is read from a
 * CSV file and only the part of an entry before the first ';' is used.
 */
public class MT4 {
  public static void main(String[] args) throws Exception {
    NET.setProxy("proxy.internal", "3128");
    String dicText = STR.file2text("MT\\MT.csv").toLowerCase();
    // Reduce each "key=value,..." line to "key=value".
    dicText = REGEX.transform(dicText, "(.*?)=(.*?),.*?\\n", "(?1)=(?2)\n");
    TreeMap map = UTIL.text2map(dicText);
    String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm",
                      "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
    for (int i = 0; i < files.length; i++) {
      System.out.println("translate : " + files[i]);
      String html = STR.file2text("MT\\Test\\" + files[i]);
      // translate html file
      String tHtml = MT4.translate(html, map);
      STR.text2file(tHtml, "MT\\Test\\" + STR.replace(files[i], "_E", "_C"));
    }
  }

  /** Token patterns, tried in this order at every input position. */
  static final String[] patterns = {
    /*0.comment*/ "<!--.*?-->",
    /*1.script*/ "<script.+?</script>",
    /*2.style*/ "<style.+?</style>",
    /*3.mark-up*/ "<.+?>",
    /*4.url*/ "http://.{10,30}/?",
    /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]",
    /*6.phrase=three words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*8.word*/ "\\p{Alpha}+",
    /*9.others*/ "."
  };

  /** Index range in {@link #patterns} whose tokens go through the dictionary. */
  static final int PHRASE = 6, WORD = 8;

  /**
   * Rewrite rules handed to {@code STR.expand} before tokenizing.
   * "Dec." is mapped to 十二月 (December); the earlier listing had 十月,
   * which is October.
   */
  static final String normalizeMacros = " \n= |-\n=|"
      + "n't = not |'re = are |'m = am |'ve = have |'d =would |can't =can not |"
      + "Intern'l =International |U.S.=United States |U.K.=United Kingdom |"
      + "Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

  /**
   * Translates {@code pText}. At each position the patterns are tried in
   * order: tokens matched by patterns before {@link #PHRASE} or after
   * {@link #WORD} are copied unchanged; phrase and word tokens are looked up
   * with {@link #translateWord(String, Map)}, falling back from three words
   * to two words to one word, and finally to the source token.
   *
   * @param pText text (plain or HTML) to translate
   * @param pMap  dictionary from lower-case source word/phrase to translation
   * @return the translated text
   */
  public static String translate(String pText, Map pMap) {
    pText = STR.expand(pText, normalizeMacros); // normalize text
    StringBuffer rzText = new StringBuffer();
    for (int i = 0; i < pText.length(); ) {
      String token = null, toToken = null;
      for (int pi = 0; pi < patterns.length; pi++) {
        token = REGEX.matchAt(pText, i, patterns[pi]);
        if (token == null) continue;
        if (pi < PHRASE || pi > WORD) break; // the following code for PHRASE and WORD only.
        token = STR.replace(token, "-", " ");
        toToken = translateWord(token, pMap);
        if (toToken != null) break;
        if (pi == WORD) break;
      }
      if (token == null) {
        // No pattern matched at this position. "." does not match a line
        // terminator by default, so this presumably happens on newlines
        // (depends on how REGEX.matchAt compiles the pattern - confirm).
        // Copy the character through instead of failing on token.length().
        rzText.append(pText.charAt(i));
        i++;
        continue;
      }
      if (toToken == null || toToken.length() == 0) toToken = token;
      rzText.append(toToken);
      i += token.length();
    }
    return rzText.toString();
  }

  /**
   * Suffix rewrite rules in the form {@code "<suffix>;=<replacement>"}; the
   * ';' stands for the end-of-word marker added in
   * {@link #translateWord(String, Map)}. The "est" rule used to read
   * "est;=e;", which left the marker in the lookup key, and "ies;=y " was
   * listed twice.
   */
  static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ",
      "ning;= ", "ming;= ", "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

  /**
   * Looks a word or phrase up in the dictionary, trying each suffix rule in
   * turn until one of the rewritten forms has an entry.
   *
   * @param pWord word or phrase to translate; may be {@code null}
   * @param pMap  dictionary from lower-case source word/phrase to translation
   * @return the part of the entry before the first ';' (as returned by
   *         {@code STR.head}), trimmed, or {@code null} when {@code pWord}
   *         is {@code null} or nothing matches
   */
  public static String translateWord(String pWord, Map pMap) {
    if (pWord == null) return null;
    String extWord = pWord.toLowerCase() + ";"; // ';' marks the end of the word
    for (int i = 0; i < suffixMacros.length; i++) {
      String suffix = STR.head(suffixMacros[i], "=");
      String toSuffix = STR.tail(suffixMacros[i], "=");
      String word = STR.replace(extWord, suffix, toSuffix).trim();
      String toWord = (String) pMap.get(word);
      if (toWord != null) {
        return STR.head(toWord, ";").trim();
      }
    }
    return null;
  }
}

// ===================================================================
// File: MT5.java
// ===================================================================
package ccc;

import java.util.*;
import java.io.*;
import java.util.regex.*;

/*
 Frequency tables from the original setup notes; nothing in this class
 reads them yet:
   Table eFreqTable=new Table(), cFreqTable=new Table();
   eFreqTable.load("id,英,頻", "MT\\E_Freq.txt", "ISO8859_1");
   cFreqTable.load("id,英,頻,比", "MT\\C_Freq.txt", "ISO8859_1");
*/

/**
 * Fifth version: same tokenizer as MT3/MT4, but looks words up in a
 * multi-column {@code Table} so the source and target languages are chosen
 * by field name.
 * NOTE(review): this version calls STR.matchAt where MT3/MT4 call
 * REGEX.matchAt - confirm which helper exists.
 */
public class MT5 extends TreeMap {
  public static void main(String[] args) throws Exception {
    // The listing used "table" without declaring it; this declaration and
    // load call are taken from the commented-out setup notes.
    Table table = new Table();
    table.load("類,日,英,中,聲", "MT\\J_E_C.txt", "UTF8");

    String[] files = {"MT_E.htm", "Patent1_E.htm", "Patent2_E.htm", "Patent3_E.htm",
                      "Patent4_E.htm", "Yahoo_E.htm", "Tom_E.txt"};
    for (int i = 0; i < files.length; i++) {
      System.out.println("Translate file :" + files[i]);
      String html = STR.file2text("MT\\Test\\" + files[i]);
      // translate html file
      String tHtml = MT5.translate(html, table, "英", "中");
      STR.text2file(tHtml, "MT\\Test\\" + STR.replace(files[i], "_E", "_C"));
    }
  }

  /** Token patterns, tried in this order at every input position. */
  static final String[] patterns = {
    /*0.comment*/ "<!--.*?-->",
    /*1.script*/ "<script.+?</script>",
    /*2.style*/ "<style.+?</style>",
    /*3.mark-up*/ "<.+?>",
    /*4.url*/ "http://.{10,30}/?",
    /*5.-abc-, .abc-*/ "[\\.\\/-]\\w+[\\.\\/-]",
    /*6.phrase=three words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*7.phrase=two words*/ "\\p{Alpha}+[\\s-]\\p{Alpha}+",
    /*8.word*/ "\\p{Alpha}+",
    /*9.others*/ "."
  };

  /** Index range in {@link #patterns} whose tokens go through the dictionary. */
  static final int PHRASE = 6, WORD = 8;

  /**
   * Rewrite rules handed to {@code STR.expand} before tokenizing.
   * "Dec." is mapped to 十二月 (December); the earlier listing had 十月,
   * which is October.
   */
  static final String normalizeMacros = " \n= |-\n=|"
      + "n't = not |'re = are |'m = am |'ve = have |'d =would |can't =can not |"
      + "Intern'l =International |U.S.=United States |U.K.=United Kingdom |"
      + "Appl. No.=申請號 |Mar.=三月|May.=五月 |Dec.=十二月 |Dr.=博士|FIG.=圖示 |FIGS.=圖示 ";

  /**
   * Translates {@code pText}. At each position the patterns are tried in
   * order: tokens matched by patterns before {@link #PHRASE} or after
   * {@link #WORD} are copied unchanged; phrase and word tokens are looked up
   * with {@link #translateWord(String, Table, String, String)}, falling back
   * from three words to two words to one word, and finally to the source
   * token.
   *
   * @param pText      text (plain or HTML) to translate
   * @param pTable     lookup table
   * @param pFromField name of the field holding the source word
   * @param pToField   name of the field holding the translation
   * @return the translated text
   * @throws Exception propagated from the table lookup
   */
  public static String translate(String pText, Table pTable, String pFromField, String pToField)
      throws Exception {
    pText = STR.expand(pText, normalizeMacros); // normalize text
    StringBuffer rzText = new StringBuffer();
    for (int i = 0; i < pText.length(); ) {
      String token = null, toToken = null;
      for (int pi = 0; pi < patterns.length; pi++) {
        token = STR.matchAt(pText, i, patterns[pi]);
        if (token == null) continue;
        if (pi < PHRASE || pi > WORD) break; // the following code for PHRASE and WORD only.
        token = STR.replace(token, "-", " ");
        toToken = translateWord(token, pTable, pFromField, pToField);
        if (toToken != null) break;
        if (pi == WORD) break;
      }
      if (token == null) {
        // No pattern matched at this position. "." does not match a line
        // terminator by default, so this presumably happens on newlines
        // (depends on how STR.matchAt compiles the pattern - confirm).
        // Copy the character through instead of failing on token.length().
        rzText.append(pText.charAt(i));
        i++;
        continue;
      }
      if (toToken == null || toToken.length() == 0) toToken = token;
      rzText.append(toToken);
      i += token.length();
    }
    return rzText.toString();
  }

  /**
   * Suffix rewrite rules in the form {@code "<suffix>;=<replacement>"}; the
   * ';' stands for the end-of-word marker added in
   * {@link #translateWord(String, Table, String, String)}. The "est" rule
   * used to read "est;=e;", which left the marker in the lookup key, and
   * "ies;=y " was listed twice.
   */
  static final String[] suffixMacros = {";= ", "s;= ", "es;= ", "ies;=y ", "ing;= ", "ing;=e ",
      "er;=e ", "est;=e ", "ed;= ", "d;= ", "ly;= ", "al;= ", "able;= "};

  /**
   * Looks a word or phrase up in {@code pTable}, trying each suffix rule in
   * turn until one of the rewritten forms is found in {@code pFromField}.
   *
   * @param pWord      word or phrase to translate; may be {@code null}
   * @param pTable     lookup table
   * @param pFromField name of the field holding the source word
   * @param pToField   name of the field holding the translation
   * @return the trimmed {@code pToField} value of the matching record, or
   *         {@code null} when {@code pWord} is {@code null} or nothing matches
   * @throws Exception propagated from the table lookup
   */
  public static String translateWord(String pWord, Table pTable, String pFromField,
      String pToField) throws Exception {
    int toFieldIdx = pTable.field2idx(pToField);
    if (pWord == null) return null;
    String extWord = pWord.toLowerCase() + ";"; // ';' marks the end of the word
    for (int i = 0; i < suffixMacros.length; i++) {
      String suffix = STR.head(suffixMacros[i], "=");
      String toSuffix = STR.tail(suffixMacros[i], "=");
      String word = STR.replace(extWord, suffix, toSuffix).trim();
      Record toRecord = pTable.find(pFromField, word);
      if (toRecord != null) {
        return toRecord.get(toFieldIdx).trim();
      }
    }
    return null;
  }
}
page revision: 1, last edited: 05 Nov 2010 08:39
Post preview:
Close preview