Commit Diff


commit - /dev/null
commit + f942969bc96eb9355fe40e2227457a9b99f634a5
blob - /dev/null
blob + e9bf6a7875d926c09a8c475fa6a06d40c2ad6abf (mode 644)
--- /dev/null
+++ README
@@ -0,0 +1,3 @@
+This simple project can be used to convert Wikipedia dumps to plain text.
+
+usage: java -Xmx2G -Dfile.encoding=UTF-8 -jar wiki2text-1.0-jar-with-dependencies.jar nlwiki-20120203-pages-articles.xml.bz2 > nl.txt
blob - /dev/null
blob + 8556f9bfd15dc1a404dbaad3dedb0a3bce6aa9c8 (mode 644)
--- /dev/null
+++ pom.xml
@@ -0,0 +1,68 @@
+<!--
+  ~ Copyright (c) 2011 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>twimpact</groupId>
+  <artifactId>wiki2text</artifactId>
+  <version>1.0</version>
+  <name>Wiki2Text Converter</name>
+
+  <inceptionYear>2012</inceptionYear>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <!-- Extra repository with snapshots enabled and releases disabled; presumably the place
+       bliki-core (below) is meant to be fetched from - confirm that it serves version 3.0.16. -->
+  <repositories>
+    <repository>
+      <id>info-bliki-repository</id>
+      <url>http://gwtwiki.googlecode.com/svn/maven-snapshot-repository/</url>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
+      <snapshots>
+        <enabled>true</enabled>
+      </snapshots>
+    </repository>
+  </repositories>
+
+  <!-- The only dependency; WikipediaDump imports its dump parser, wiki model and plain-text
+       converter from the info.bliki.wiki.* packages. -->
+  <dependencies>
+    <dependency>
+      <groupId>info.bliki.wiki</groupId>
+      <artifactId>bliki-core</artifactId>
+      <version>3.0.16</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <!-- Runs assembly:single in the package phase with the jar-with-dependencies descriptor and
+           WikipediaDump as the manifest main class; this yields the
+           wiki2text-1.0-jar-with-dependencies.jar that the README starts with "java -jar". -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>2.2-beta-5</version>
+        <configuration>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+          <archive>
+            <manifest>
+              <mainClass>WikipediaDump</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
+
blob - /dev/null
blob + fa152d0cef2d347e3e3a1e9a6d32213272911eb5 (mode 644)
--- /dev/null
+++ src/main/java/WikipediaDump.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ */
+
+
+import info.bliki.wiki.dump.IArticleFilter;
+import info.bliki.wiki.dump.Siteinfo;
+import info.bliki.wiki.dump.WikiArticle;
+import info.bliki.wiki.dump.WikiXMLParser;
+import info.bliki.wiki.filter.PlainTextConverter;
+import info.bliki.wiki.model.WikiModel;
+import java.util.regex.Pattern;
+import org.xml.sax.SAXException;
+
+/**
+ * Command-line tool that converts a Wikipedia XML dump to plain text.
+ *
+ * <p>The dump file named on the command line is read with {@link WikiXMLParser}, which is given an
+ * {@link ArticleFilter} as page handler. The filter writes the extracted text to standard output
+ * and a progress indicator to standard error.
+ *
+ * @author Matthias L. Jugel
+ */
+public class WikipediaDump {
+
+  /** Amount of output after which {@link #main(String[])} stops converting: 2^30 characters. */
+  private static final long MAX_OUTPUT_CHARS = 1073741824L;
+
+  /**
+   * Prints the plain-text content of the wiki pages in the dump.
+   *
+   * <p>Only pages for which {@link WikiArticle#isMain()} is true are rendered. The rendered text is
+   * split into lines; URLs and <code>{{...}}</code> remnants are replaced by a blank, and a line is
+   * printed only if it then still has more than {@code MIN_WORDS} space-separated tokens. Page
+   * titles are not printed. Once the summed length of the printed lines reaches the limit given to
+   * the constructor, the JVM is terminated with exit status 0.
+   */
+  static class ArticleFilter implements IArticleFilter {
+
+    /** A line is printed only if splitting it on single spaces yields more tokens than this. */
+    private static final int MIN_WORDS = 40;
+
+    /** Matches http/https URLs; compiled once instead of once per line. */
+    private static final Pattern URL = Pattern.compile(
+        "https?://[^/\\p{Space}]+(?:/[/a-zA-Z0-9_=\\-?~%,.!$()+*|\\\\\\\"\\'#]*)?");
+
+    /** Matches <code>{{...}}</code> spans (non-greedy, within one line); compiled once. */
+    private static final Pattern TEMPLATE = Pattern.compile("\\{\\{.*?\\}\\}");
+
+    /** Model used for rendering; the two URL templates are placeholders required by its constructor. */
+    private final WikiModel wikiModel =
+        new WikiModel("http://www.mywiki.com/wiki/${image}", "http://www.mywiki.com/wiki/${title}");
+
+    /** Output limit, in characters. */
+    private final long max;
+
+    /** Number of characters printed so far (line breaks not counted). */
+    private long cnt = 0L;
+
+    /**
+     * Creates a filter that ends the program once the output limit has been reached.
+     *
+     * @param bytes output limit; despite the name it is compared against the summed
+     *     {@link String#length()} of the printed lines, not against an encoded byte count
+     */
+    public ArticleFilter(long bytes) {
+      max = bytes;
+    }
+
+    /**
+     * Renders one page to plain text and prints every line that is long enough.
+     *
+     * <p>Does not return once the output limit has been reached: the progress line on standard
+     * error is terminated and the JVM exits with status 0.
+     *
+     * @param page the page to convert; ignored unless {@link WikiArticle#isMain()} is true
+     * @param siteinfo not used
+     * @throws SAXException not thrown by this implementation
+     */
+    public void process(WikiArticle page, Siteinfo siteinfo) throws SAXException {
+      if (!page.isMain()) {
+        return;
+      }
+      String text = wikiModel.render(new PlainTextConverter(true), page.getText());
+      if (text == null) {
+        // NOTE(review): render() presumably yields null for a page without text - confirm.
+        // Skipping such a page avoids a NullPointerException that would abort the whole run.
+        return;
+      }
+      for (String line : text.split("\n")) {
+        if (cnt >= max) {
+          // Limit reached: finish the "\r" progress line and stop the whole conversion.
+          System.err.println();
+          System.exit(0);
+        }
+        String cleaned = URL.matcher(line).replaceAll(" ");
+        cleaned = TEMPLATE.matcher(cleaned).replaceAll(" ");
+        if (cleaned.split(" ").length > MIN_WORDS) {
+          System.out.println(cleaned);
+          // Progress shows the count before this line is added; cnt < max here, so max > 0.
+          System.err.print("\r" + cnt + " (" + (100 * cnt / max) + "%)");
+          cnt += cleaned.length();
+        }
+      }
+    }
+  }
+
+  /**
+   * Converts the dump file given as the single command-line argument.
+   *
+   * <p>The process ends with {@code System.exit(-1)} when the argument count is wrong, with status
+   * 1 when reading or parsing the dump fails, and with status 0 otherwise.
+   *
+   * @param args exactly one element: the path of the dump file
+   */
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.err.println("Usage: WikipediaDump <XML-FILE>");
+      System.exit(-1);
+    }
+    String bz2Filename = args[0];
+    try {
+      IArticleFilter handler = new ArticleFilter(MAX_OUTPUT_CHARS);
+      WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler);
+      wxp.parse();
+    } catch (Exception e) {
+      // Top-level boundary: report the failure and make it visible in the exit status, so that a
+      // truncated output file is not mistaken for a successful conversion.
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+}