commit - /dev/null
commit + f942969bc96eb9355fe40e2227457a9b99f634a5
blob - /dev/null
blob + e9bf6a7875d926c09a8c475fa6a06d40c2ad6abf (mode 644)
--- /dev/null
+++ README
+This simple project converts Wikipedia XML dumps to plain text.
+
+usage: java -Xmx2G -Dfile.encoding=UTF-8 -jar wiki2text-1.0-jar-with-dependencies.jar nlwiki-20120203-pages-articles.xml.bz2 > nl.txt
blob - /dev/null
blob + 8556f9bfd15dc1a404dbaad3dedb0a3bce6aa9c8 (mode 644)
--- /dev/null
+++ pom.xml
+<!--
+ ~ Copyright (c) 2011 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <!-- Maven coordinates of this project; the assembled jar name is derived from artifactId and version. -->
+ <groupId>twimpact</groupId>
+ <artifactId>wiki2text</artifactId>
+ <version>1.0</version>
+ <name>Wiki2Text Converter</name>
+
+ <inceptionYear>2012</inceptionYear>
+ <properties>
+ <!-- Source files are read as UTF-8 regardless of the platform default. -->
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <!--
+ ~ Additional repository; only snapshot artifacts are enabled for it.
+ ~ NOTE(review): releases are disabled here while bliki-core below is requested as the
+ ~ non-SNAPSHOT version 3.0.16 - confirm that the dependency still resolves.
+ -->
+ <repositories>
+ <repository>
+ <id>info-bliki-repository</id>
+ <url>http://gwtwiki.googlecode.com/svn/maven-snapshot-repository/</url>
+ <releases>
+ <enabled>false</enabled>
+ </releases>
+ <snapshots>
+ <enabled>true</enabled>
+ </snapshots>
+ </repository>
+ </repositories>
+
+ <dependencies>
+ <!-- Provides the info.bliki.wiki.* classes imported by WikipediaDump. -->
+ <dependency>
+ <groupId>info.bliki.wiki</groupId>
+ <artifactId>bliki-core</artifactId>
+ <version>3.0.16</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <!--
+ ~ Runs the assembly plugin's "single" goal in the package phase with the
+ ~ jar-with-dependencies descriptor and sets WikipediaDump as the manifest main class,
+ ~ so the resulting jar can be started with "java -jar".
+ -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>2.2-beta-5</version>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ <archive>
+ <manifest>
+ <mainClass>WikipediaDump</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
+
blob - /dev/null
blob + fa152d0cef2d347e3e3a1e9a6d32213272911eb5 (mode 644)
--- /dev/null
+++ src/main/java/WikipediaDump.java
+/*
+ * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ */
+
+
+import info.bliki.wiki.dump.IArticleFilter;
+import info.bliki.wiki.dump.Siteinfo;
+import info.bliki.wiki.dump.WikiArticle;
+import info.bliki.wiki.dump.WikiXMLParser;
+import info.bliki.wiki.filter.PlainTextConverter;
+import info.bliki.wiki.model.WikiModel;
+import org.xml.sax.SAXException;
+
+import java.util.regex.Pattern;
+
+/**
+ * Command line tool that converts a Wikipedia XML dump into plain text.
+ * <p>
+ * The dump file given as the only argument is parsed page by page. The text of every page for
+ * which {@code WikiArticle.isMain()} is true is rendered to plain text, stripped of URLs and
+ * <code>{{...}}</code> spans, and written line by line to standard output. Progress is reported
+ * on standard error. The program stops once a fixed character budget has been written.
+ *
+ * @author Matthias L. Jugel
+ */
+public class WikipediaDump {
+
+  /** Number of characters after which the conversion stops (1024 * 1024 * 1024). */
+  private static final long MAX_OUTPUT_CHARS = 1073741824L;
+
+  /**
+   * Article handler that prints the cleaned plain text of the dump's main pages.
+   * <p>
+   * Only lines that split into more than {@link #MIN_WORDS} space-separated tokens are printed.
+   * As soon as the accumulated length of the printed lines reaches the configured limit, the
+   * handler terminates the JVM with exit status 0.
+   */
+  static class ArticleFilter implements IArticleFilter {
+
+    /** A line must have more than this many space-separated tokens to be printed. */
+    private static final int MIN_WORDS = 40;
+
+    /** Matches http/https URLs; compiled once because it is applied to every line. */
+    private static final Pattern URL_PATTERN = Pattern.compile(
+        "https?://[^/\\p{Space}]+(?:/[/a-zA-Z0-9_=\\-?~%,.!$()+*|\\\\\\\"\\'#]*)?");
+
+    /** Matches the shortest <code>{{...}}</code> spans within a line. */
+    private static final Pattern TEMPLATE_PATTERN = Pattern.compile("\\{\\{.*?\\}\\}");
+
+    private final WikiModel wikiModel =
+        new WikiModel("http://www.mywiki.com/wiki/${image}", "http://www.mywiki.com/wiki/${title}");
+
+    /** Output budget; compared against the summed {@code String.length()} of printed lines. */
+    private final long max;
+
+    /** Summed length of all lines printed so far. */
+    private long cnt = 0L;
+
+    /**
+     * Creates a handler with the given output budget.
+     *
+     * @param bytes output limit; note that it is measured in characters
+     *              ({@code String.length()}) of the printed lines, not in encoded bytes
+     */
+    public ArticleFilter(long bytes) {
+      max = bytes;
+    }
+
+    /**
+     * Renders one page to plain text and prints its sufficiently long lines.
+     * <p>
+     * Pages for which {@code page.isMain()} is false are ignored. When the output budget is
+     * exhausted this method does not return: it ends the progress line on standard error and
+     * calls {@code System.exit(0)}.
+     *
+     * @param page     the article read from the dump
+     * @param siteinfo site information from the dump (unused)
+     * @throws SAXException declared by the interface; not thrown directly by this method
+     */
+    @Override
+    public void process(WikiArticle page, Siteinfo siteinfo) throws SAXException {
+      if (!page.isMain()) {
+        return;
+      }
+      String text = wikiModel.render(new PlainTextConverter(true), page.getText());
+      if (text == null) {
+        // Defensive: nothing was rendered for this page, so there is nothing to print.
+        return;
+      }
+      for (String rawLine : text.split("\n")) {
+        if (cnt >= max) {
+          // Budget used up: finish the "\r"-style progress line and stop the whole program.
+          System.err.println();
+          System.exit(0);
+        }
+        String line = clean(rawLine);
+        if (line.split(" ").length > MIN_WORDS) {
+          System.out.println(line);
+          // "\r" rewrites the same console line; max is > 0 here because cnt < max and cnt >= 0.
+          System.err.print("\r" + cnt + " ("+ (100 * cnt / max)+ "%)");
+          cnt += line.length();
+        }
+      }
+    }
+
+    /** Replaces every URL and every <code>{{...}}</code> span in the line with a single space. */
+    private static String clean(String line) {
+      String withoutUrls = URL_PATTERN.matcher(line).replaceAll(" ");
+      return TEMPLATE_PATTERN.matcher(withoutUrls).replaceAll(" ");
+    }
+  }
+
+  /**
+   * Program entry point.
+   * <p>
+   * Exits with status -1 when the argument count is wrong and with status 1 when parsing fails,
+   * so that callers and shell scripts can detect the failure.
+   *
+   * @param args exactly one element: the path of the dump file to convert
+   */
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.err.println("Usage: WikipediaDump <XML-FILE>");
+      System.exit(-1);
+    }
+    String bz2Filename = args[0];
+    try {
+      IArticleFilter handler = new ArticleFilter(MAX_OUTPUT_CHARS);
+      WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler);
+      wxp.parse();
+    } catch (Exception e) {
+      // Top-level boundary: report the problem and signal failure through the exit status.
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+}