維基百科資料的處理程式

程式作品

C 語言

Java

C#

JavaScript

常用函數

文字處理

遊戲程式

衛星定位

系統程式

資料結構

網路程式

自然語言

人工智慧

機率統計

資訊安全

等待完成

訊息

相關網站

參考文獻

最新修改

簡體版

English

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;

namespace OpenCL
{
    /// <summary>
    /// Splits a wiki XML dump into one XML file per page and writes a list file that
    /// pairs every exported page with its counterpart in the other language (zh/en).
    /// </summary>
    class WikiProcess
    {
        /// <summary>
        /// Entry point: processes the Chinese and the English dump found under corpus/wikinews.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            wikiXmlToPageFiles("corpus/wikinews", "zh");
            wikiXmlToPageFiles("corpus/wikinews", "en");
            // Other corpora that can be processed the same way:
            //   wikiXmlToPageFiles("corpus/wikipedia", "zh");
            //   wikiXmlToPageFiles("corpus/wikisource", "zh");
        }

        /// <summary>
        /// Maps a page title to the relative path of its page file:
        /// "page/{dir}/{name}.{lang}.xml".
        /// </summary>
        /// <param name="title">Page title; converted with FileSystem.strToFileName (project helper,
        /// not visible here — presumably it makes the title safe for use as a file name).</param>
        /// <param name="lang">Language code inserted before the ".xml" extension.</param>
        /// <returns>
        /// The relative path. {name} is the lower-cased file name without leading/trailing
        /// underscores or whitespace; {dir} is its first two characters when they match
        /// [a-z0-9][a-z0-9_] and more characters follow, otherwise its first character.
        /// </returns>
        public static String titleToFilePath(String title, String lang)
        {
            // Invariant lower-casing keeps the generated paths identical on every machine,
            // independent of the current culture (e.g. the Turkish dotless i).
            String pageFileName = FileSystem.strToFileName(title).ToLowerInvariant();
            Match m = Regex.Match(pageFileName, @"^[_\s]*(([a-z0-9][a-z0-9_]|.)(.+?))[_\s]*$");

            String fileName;
            String dir;
            if (m.Success)
            {
                fileName = m.Groups[1].Value;
                dir = m.Groups[2].Value;
            }
            else
            {
                // The pattern needs at least two characters, so one-character titles do not
                // match. Without this fallback both groups would be empty and every such
                // title would collapse into the same path "page//.{lang}.xml".
                fileName = Regex.Replace(pageFileName, @"^[_\s]+|[_\s]+$", "");
                if (fileName.Length == 0)
                    fileName = "_";
                dir = fileName.Substring(0, 1);
            }
            return "page/" + dir + "/" + fileName + "." + lang + ".xml";
        }

        static readonly String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
        static readonly String listXslHeader = "<?xml-stylesheet type=\"text/xsl\" href=\"../WikiList.xsl\"?>\r\n";
        static readonly String pageXslHeader = "<?xml-stylesheet type=\"text/xsl\" href=\"../../../WikiPage.xsl\"?>\r\n";

        /// <summary>
        /// Reads "{wikiDir}/{lang}.xml", writes every page that carries an interlanguage link
        /// to the other language ("zh" when lang is "en", otherwise "en") into its own file,
        /// and records the pair in "{wikiDir}/{lang}List.xml".
        /// </summary>
        /// <param name="wikiDir">Directory holding the dump; the list file and the page files
        /// are written below it.</param>
        /// <param name="lang">Language code of the dump to process.</param>
        public static void wikiXmlToPageFiles(string wikiDir, string lang)
        {
            String refLang = lang.Equals("en") ? "zh" : "en";

            TextFile listFile = new TextFile(wikiDir + "/" + lang + "List.xml", FileMode.Create);
            try
            {
                listFile.writeln(xmlHeader + listXslHeader + "<list>");
                WikiFile wikiFile = new WikiFile(wikiDir + "/" + lang + ".xml");
                try
                {
                    foreach (string page in wikiFile.pages())
                    {
                        String id = STR.innerText(page, "<id>", "</id>");
                        String title = STR.innerText(page, "<title>", "</title>");

                        // Extract the Chinese or English interlanguage link contained in the page;
                        // pages without one are skipped.
                        String refTitle = Regexp.matchFirst(@"\[\[(" + refLang + @"):(.*?)\]\]", page, 2);
                        if (refTitle == null)
                            continue;

                        // Page files of both languages are stored under the English title, so the
                        // path of the counterpart must be derived from the English title as well.
                        // Deriving it from refTitle would, for lang == "en", point at a path built
                        // from the Chinese title, which is never written.
                        String enTitle = lang.Equals("en") ? title : refTitle;
                        String fileName = titleToFilePath(enTitle, lang);
                        String refFileName = titleToFilePath(enTitle, refLang);
                        String refXml = String.Format("<{0} file=\"{1}\" title=\"{2}\"/>\r\n", refLang, refFileName, refTitle);

                        String filePath = wikiDir + "/" + fileName;
                        // NOTE(review): createDir receives the full file path; presumably it creates
                        // the missing parent directories — confirm against FileSystem.
                        FileSystem.createDir(filePath);
                        TextFile.textToFile(xmlHeader + pageXslHeader + page, filePath);

                        String itemXml = String.Format("<translation>\r\n <{0} file=\"{1}\" title=\"{2}\"/>\r\n" +
                                                       " {3}</translation>\r\n", lang, fileName, title, refXml);
                        Debug.println("id=" + id + " offset=" + wikiFile.stream.Position + " file " + filePath);
                        listFile.write(itemXml);
                    }
                }
                finally
                {
                    // Release the dump even when a page fails to process.
                    wikiFile.close();
                }
                listFile.writeln("</list>\r\n");
            }
            finally
            {
                // Release the list file even when processing is aborted by an exception.
                listFile.close();
            }
        }
    }

    /// <summary>
    /// A TextFile that can be enumerated page by page, where a page is the run of lines
    /// from a "&lt;page&gt;" line up to and including the next "&lt;/page&gt;" line.
    /// </summary>
    class WikiFile : TextFile
    {
        /// <summary>Opens the dump by passing the file name to the TextFile constructor.</summary>
        /// <param name="pWikiFileName">Path of the wiki XML dump.</param>
        public WikiFile(String pWikiFileName) : base(pWikiFileName) { }

        /// <summary>
        /// Lazily yields the text of each page. Lines come from lines(), which is not defined
        /// in this class (presumably inherited from TextFile); each one is re-terminated
        /// with "\r\n".
        /// </summary>
        /// <returns>One string per "&lt;/page&gt;" line encountered.</returns>
        public IEnumerable<String> pages()
        {
            const string lineBreak = "\r\n";
            StringBuilder buffer = new StringBuilder();

            foreach (string row in lines())
            {
                // Tags are recognised regardless of the indentation around them.
                string marker = row.Trim();

                // An opening tag discards whatever was collected before it.
                if (marker == "<page>")
                    buffer.Length = 0;

                // Every line is collected, including the opening and closing tag lines.
                buffer.Append(row).Append(lineBreak);

                // A closing tag completes the page.
                if (marker == "</page>")
                    yield return buffer.ToString();
            }
        }

    }
}

Facebook

Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License