commit f942969bc96eb9355fe40e2227457a9b99f634a5 from: Matthias L. Jugel date: Thu Feb 23 06:34:27 2012 UTC initial commit commit - /dev/null commit + f942969bc96eb9355fe40e2227457a9b99f634a5 blob - /dev/null blob + e9bf6a7875d926c09a8c475fa6a06d40c2ad6abf (mode 644) --- /dev/null +++ README @@ -0,0 +1,3 @@ +This simple project can be used to convert Wikipedia dumps to plain text. + +usage: java -Xmx2G -Dfile.encoding=UTF-8 -jar wiki2text-1.0-jar-with-dependencies.jar nlwiki-20120203-pages-articles.xml.bz2 > nl.txt blob - /dev/null blob + 8556f9bfd15dc1a404dbaad3dedb0a3bce6aa9c8 (mode 644) --- /dev/null +++ pom.xml @@ -0,0 +1,68 @@ + + + + 4.0.0 + twimpact + wiki2text + 1.0 + Wiki2Text Converter + + 2012 + + UTF-8 + + + + + info-bliki-repository + http://gwtwiki.googlecode.com/svn/maven-snapshot-repository/ + + false + + + true + + + + + + + info.bliki.wiki + bliki-core + 3.0.16 + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.2-beta-5 + + + jar-with-dependencies + + + + WikipediaDump + + + + + + package + + single + + + + + + + + blob - /dev/null blob + fa152d0cef2d347e3e3a1e9a6d32213272911eb5 (mode 644) --- /dev/null +++ src/main/java/WikipediaDump.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved. + */ + + +import info.bliki.wiki.dump.IArticleFilter; +import info.bliki.wiki.dump.Siteinfo; +import info.bliki.wiki.dump.WikiArticle; +import info.bliki.wiki.dump.WikiXMLParser; +import info.bliki.wiki.filter.PlainTextConverter; +import info.bliki.wiki.model.WikiModel; +import org.xml.sax.SAXException; + +/** + * Converts a Wikipedia XML dump to plain text written to standard output. + * + * @author Matthias L. Jugel + */ +public class WikipediaDump { + + /** + * Print the plain-text content of the main wiki pages in the dump.
+ */ + static class ArticleFilter implements IArticleFilter { + + private static final int MIN_WORDS = 40; + WikiModel wikiModel = new WikiModel("http://www.mywiki.com/wiki/${image}", "http://www.mywiki.com/wiki/${title}"); + + long max = 0L; + long cnt = 0L; + + public ArticleFilter(long bytes) { + max = bytes; + } + + public void process(WikiArticle page, Siteinfo siteinfo) throws SAXException { + if (page.isMain()) { + String text = wikiModel.render(new PlainTextConverter(true), page.getText()); + for (String l : text.split("\n")) { + l = l.replaceAll("https?://[^/\\p{Space}]+(?:/[/a-zA-Z0-9_=\\-?~%,.!$()+*|\\\\\\\"\\'#]*)?", " "); + l = l.replaceAll("\\{\\{.*?\\}\\}", " "); + if (cnt < max) { + if (l.split(" ").length > MIN_WORDS) { + System.out.println(l); + System.err.print("\r" + cnt + " ("+ (100 * cnt / max)+ "%)"); + cnt += l.length(); + } + } else { + System.err.println(); + System.exit(0); + } + } + } + } + } + + public static void main(String[] args) { + if (args.length != 1) { + System.err.println("Usage: Parser "); + System.exit(-1); + } + // String bz2Filename = + // "c:\\temp\\dewikiversity-20100401-pages-articles.xml.bz2"; + String bz2Filename = args[0]; + try { + IArticleFilter handler = new ArticleFilter(1073741824L); + WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler); + wxp.parse(); + } catch (Exception e) { + e.printStackTrace(); + } + } +}