Commit Diff


commit - /dev/null
commit + 56f7be9a447392598b27296e90f5ba238f2a1854
blob - /dev/null
blob + 0f2b988452354af4483aba16eaf0e2772b2fc318 (mode 644)
--- /dev/null
+++ README
@@ -0,0 +1,12 @@
+jsonv
+
+We are receiving data from the Twitter Streaming API and sometimes save them for later analysis.
+To extract readable information from those files I've written this little tool.
+
+* To extract info on the contents (displays json keys):
+
+  java -jar jsonv.jar -i dumpfile.json.gz
+
+* To extract actual values in tab separated form:
+
+  java -jar jsonv.jar created_at,user.screen_name,text dumpfile.json.gz
\ No newline at end of file
blob - /dev/null
blob + 2c0ecb372a38f1b492cf6eb1f4eb571f9783f12f (mode 644)
--- /dev/null
+++ pom.xml
@@ -0,0 +1,123 @@
+<!--
+  ~ Copyright (c) 2011 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>twimpact</groupId>
+  <artifactId>jsonv</artifactId>
+  <version>1.0</version>
+  <name>JSON File Viewer (Twitter Dumps)</name>
+
+  <inceptionYear>2012</inceptionYear>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <scala.version>2.9.0-1</scala.version>
+    <json-smart.version>1.0.9</json-smart.version>
+    <grizzled.version>0.6.6</grizzled.version>
+    <logback.version>0.9.29</logback.version>
+    <slf4j.version>1.6.2</slf4j.version>
+  </properties>
+
+  <repositories>
+    <repository>
+      <id>scala-tools.org</id>
+      <name>Scala-Tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url>
+    </repository>
+  </repositories>
+
+  <pluginRepositories>
+    <pluginRepository>
+      <id>scala-tools.org</id>
+      <name>Scala-Tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <dependencies>
+    <dependency>
+      <groupId>net.minidev</groupId>
+      <artifactId>json-smart</artifactId>
+      <version>${json-smart.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.clapper</groupId>
+      <artifactId>grizzled-slf4j_2.9.0</artifactId>
+      <version>${grizzled.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>${slf4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>ch.qos.logback</groupId>
+      <artifactId>logback-classic</artifactId>
+      <version>${logback.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.3.2</version>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+          <encoding>UTF-8</encoding>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <scalaClassName>scala.tools.nsc.CompileClient</scalaClassName>
+          <scalaVersion>${scala.version}</scalaVersion>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>2.2-beta-5</version>
+        <configuration>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+          <archive>
+            <manifest>
+              <mainClass>twimpact.jsonv.Main</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
+
blob - /dev/null
blob + 461b3e4d977e8162c110de52d91184780c6b056f (mode 644)
--- /dev/null
+++ src/main/resources/logback.xml
@@ -0,0 +1,23 @@
+<!--
+  ~ Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+  -->
+
+<configuration debug="false">
+  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+    <target>System.out</target>
+    <encoder>
+      <pattern>%msg%n</pattern>
+    </encoder>
+  </appender>
+
+  <appender name="STDERR" class="ch.qos.logback.core.ConsoleAppender">
+    <target>System.err</target>
+    <encoder>
+      <pattern>%msg%n</pattern>
+    </encoder>
+  </appender>
+
+  <root level="INFO">
+    <appender-ref ref="STDERR"/>
+  </root>
+</configuration>
\ No newline at end of file
blob - /dev/null
blob + c2cf4c97f8cdd1dce821545b00610cc60446438e (mode 644)
--- /dev/null
+++ src/main/scala/twimpact/jsonv/Main.scala
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ */
+
+package twimpact.jsonv
+
+import collection.JavaConverters._
+import grizzled.slf4j.Logging
+import net.minidev.json.parser.JSONParser
+import net.minidev.json.{JSONArray, JSONObject}
+import java.io._
+import java.util.zip.GZIPInputStream
+
+/**
+ * Command line viewer for line oriented JSON dumps: one JSON document per line,
+ * optionally gzip compressed.
+ *
+ * Two modes are supported:
+ *  - `-i <dump>` prints the top level keys of the first line that can be parsed
+ *  - `[-csv|-tsv] <fields> <dump>` prints the selected fields of every line, separated
+ *    by ";" (-csv) or by a tab (-tsv, which is also the default)
+ *
+ * Diagnostics are written through the logger, extracted data is printed to standard output.
+ *
+ * @author Matthias L. Jugel
+ */
+object Main extends Logging {
+  private val jsonParser = new JSONParser(JSONParser.DEFAULT_PERMISSIVE_MODE)
+
+  /** Key of the column separator entry in the settings map built by `main`. */
+  val SEPARATOR = "sep"
+
+  /**
+   * Logs an optional error message followed by the usage text and terminates the JVM.
+   *
+   * The exit status is 1 when an error message was given, so that calling scripts can
+   * detect the failure, and 0 when only the help text was requested.
+   *
+   * @param message error to report in front of the usage text, if any
+   */
+  private def usage(message: Option[String] = None) {
+    message.foreach(error(_))
+    error("usage: jsonv [-i] <dump>")
+    error("       jsonv [-csv|-tsv] <fields> <dump>")
+    error("")
+    error("Start by looking up the fields info from the dump using -i.")
+    error("Then dump your information in CSV or TSV format by providing the")
+    error("fields (comma separated list with no spaces) and the dump file.")
+    System.exit(if (message.isDefined) 1 else 0)
+  }
+
+  /**
+   * Opens the dump for line-wise reading. Files whose name ends in "gz" are gunzipped
+   * on the fly. The content is decoded as UTF-8 rather than with the platform default
+   * charset, so the output does not depend on the machine the tool runs on.
+   *
+   * @param dump the file to read
+   * @return a reader the caller has to close
+   */
+  private def getFileReader(dump: File): BufferedReader = {
+    val raw = new FileInputStream(dump)
+    try {
+      val in: InputStream = if (dump.getName.endsWith("gz")) new GZIPInputStream(raw) else raw
+      new BufferedReader(new InputStreamReader(in, "UTF-8"))
+    } catch {
+      case e: IOException =>
+        // e.g. not a valid gzip stream: do not leak the underlying file handle
+        raw.close()
+        throw e
+    }
+  }
+
+  /** Parses one line; lines that can't be parsed are reported on debug level and yield None. */
+  private def parse(line: String): Option[Object] = {
+    try {
+      Option(jsonParser.parse(line))
+    } catch {
+      case e: Exception =>
+        debug("line can't be parsed: %s".format(e.getMessage))
+        None
+    }
+  }
+
+  /** Lazily yields the parsed JSON value of every non-blank, parseable line of the reader. */
+  private def documents(reader: BufferedReader): Iterator[Object] =
+    Iterator.continually(reader.readLine)
+        .takeWhile(_ != null)
+        .map(_.trim)
+        .filter(_.length > 0)
+        .flatMap(line => parse(line).iterator)
+
+  /** Prints the top level keys of the first parseable line of the dump (the -i mode). */
+  private def printFieldInfo(dump: File) {
+    info("Dumping field information from %s".format(dump))
+    val reader = getFileReader(dump)
+    try {
+      val docs = documents(reader)
+      // only the first document is inspected, the rest of the file is not read
+      if (docs.hasNext) {
+        docs.next() match {
+          case j: JSONObject =>
+            println(j.keySet.asScala.mkString(","))
+          case j: JSONArray =>
+            println("@array")
+          case j =>
+            info("Only found primitive type in json data: %s".format(j))
+        }
+      }
+    } finally {
+      reader.close()
+    }
+  }
+
+  /**
+   * Prints one separated row per JSON object found in the dump.
+   *
+   * Every requested field produces exactly one column. Fields that are missing, null or
+   * not supported are printed as an empty column so that the columns stay aligned.
+   *
+   * @param dump      the file to read
+   * @param fields    dot separated key paths, or "@array" to print top level arrays
+   * @param separator the column separator
+   */
+  private def dumpFields(dump: File, fields: List[String], separator: String) {
+    info("dumping fields: %s".format(fields))
+    // both are the same for every line, so compute them once
+    val paths = fields.map(_.split("\\.").toSeq)
+    val wantsArrays = fields.contains("@array")
+    val reader = getFileReader(dump)
+    try {
+      documents(reader).foreach {
+        case j: JSONArray if wantsArrays =>
+          println(j.toArray.mkString(separator))
+        case j: JSONObject =>
+          val columns = paths.map(p => value(j, p).getOrElse("").replaceAll("[\\n\\r]+", " "))
+          println(columns.mkString(separator))
+        case j =>
+          info("Only found primitive type in json data: %s".format(j))
+      }
+    } finally {
+      reader.close()
+    }
+  }
+
+  /**
+   * Entry point: parses the command line and runs either the info or the extract mode.
+   *
+   * Arguments other than -i, -csv and -tsv are positional. The last one is the dump file
+   * and, in extract mode, the first one is the comma separated field list. Invalid
+   * invocations end in `usage`, which terminates the JVM.
+   */
+  def main(args: Array[String]) {
+    info("jsonv - (c) 2012 Matthias L. Jugel")
+    if (args.length == 0) usage()
+
+    var printInfo = false
+    var settings: Map[String, String] = Map(SEPARATOR -> "\t")
+    var input: List[String] = Nil
+    args.foreach {
+      case "-i" =>
+        printInfo = true
+      case "-csv" =>
+        settings += (SEPARATOR -> ";")
+      case "-tsv" =>
+        settings += (SEPARATOR -> "\t")
+      case positional =>
+        input = positional :: input
+    }
+    input = input.reverse
+
+    if (input.isEmpty) {
+      usage(Some("error: please provide a dump file name"))
+    }
+    // without -i the first positional argument is the field list, so the dump file name
+    // alone is not enough (it would otherwise be used as the field list)
+    if (!printInfo && input.length < 2) {
+      usage(Some("error: please provide the fields to extract"))
+    }
+    val dump = new File(input.last)
+    if (!dump.exists()) {
+      usage(Some("error: dump file does not exist"))
+    }
+
+    if (printInfo) {
+      printFieldInfo(dump)
+      System.exit(0)
+    } else {
+      dumpFields(dump, input.head.split(",").toList, settings(SEPARATOR))
+    }
+  }
+
+  /**
+   * Looks up the value at the given key path and renders it as a string.
+   *
+   * @param o    the JSON value to start from, null for a missing or null value
+   * @param keys the remaining path, one key per nesting level
+   * @return the string form of the value; None if the value is missing, null or an
+   *         unsupported sub-array selection
+   */
+  private def value(o: Object, keys: Seq[String]): Option[String] = {
+    (o, keys.toList) match {
+      case (null, _) =>
+        // key not present or explicit JSON null somewhere along the path
+        None
+      case (j: JSONObject, key :: Nil) =>
+        Option(j.get(key)).map(_.toString)
+      case (j: JSONObject, key :: rest) =>
+        value(j.get(key), rest)
+      case (j: JSONArray, "@array" :: _) =>
+        error("%s: sub-arrays not supported".format("@array"))
+        None
+      case (j, _) =>
+        // the path continues but the value can't be descended into: print it as it is
+        Some(j.toString)
+    }
+  }
+}