commit - /dev/null
commit + 56f7be9a447392598b27296e90f5ba238f2a1854
blob - /dev/null
blob + 0f2b988452354af4483aba16eaf0e2772b2fc318 (mode 644)
--- /dev/null
+++ README
+jsonv
+
+We receive data from the Twitter Streaming API and sometimes save it for later analysis.
+To extract readable information from those files I've written this little tool.
+
+* To extract info on the contents (displays JSON keys):
+
+ java -jar jsonv.jar -i dumpfile.json.gz
+
+* To extract actual values in tab separated form:
+
+ java -jar jsonv.jar created_at,user.screen_name,text dumpfile.json.gz
\ No newline at end of file
blob - /dev/null
blob + 2c0ecb372a38f1b492cf6eb1f4eb571f9783f12f (mode 644)
--- /dev/null
+++ pom.xml
+<!--
+ ~ Copyright (c) 2011 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ -->
+
+<!-- Maven build for jsonv: compiles the Scala sources and, at package time, assembles a runnable jar that bundles all dependencies. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>twimpact</groupId>
+ <artifactId>jsonv</artifactId>
+ <version>1.0</version>
+ <name>JSON File Viewer (Twitter Dumps)</name>
+
+ <inceptionYear>2012</inceptionYear>
+ <!-- Versions are centralised here and referenced as ${...} by the dependency and plugin sections below. -->
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <scala.version>2.9.0-1</scala.version>
+ <json-smart.version>1.0.9</json-smart.version>
+ <grizzled.version>0.6.6</grizzled.version>
+ <logback.version>0.9.29</logback.version>
+ <slf4j.version>1.6.2</slf4j.version>
+ </properties>
+
+ <!-- Additional repository for dependencies; presumably needed for the Scala artifacts (verify it is still reachable). -->
+ <repositories>
+ <repository>
+ <id>scala-tools.org</id>
+ <name>Scala-Tools Maven2 Repository</name>
+ <url>http://scala-tools.org/repo-releases</url>
+ </repository>
+ </repositories>
+
+ <!-- Same repository for build plugins; presumably where maven-scala-plugin is resolved from. -->
+ <pluginRepositories>
+ <pluginRepository>
+ <id>scala-tools.org</id>
+ <name>Scala-Tools Maven2 Repository</name>
+ <url>http://scala-tools.org/repo-releases</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <dependencies>
+ <!-- JSON parser used by Main (net.minidev.json). -->
+ <dependency>
+ <groupId>net.minidev</groupId>
+ <artifactId>json-smart</artifactId>
+ <version>${json-smart.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.scala-lang</groupId>
+ <artifactId>scala-library</artifactId>
+ <version>${scala.version}</version>
+ </dependency>
+ <!-- Scala logging wrapper (grizzled.slf4j.Logging). NOTE(review): the _2.9.0 artifact suffix is hard-coded; presumably it has to stay in step with scala.version. -->
+ <dependency>
+ <groupId>org.clapper</groupId>
+ <artifactId>grizzled-slf4j_2.9.0</artifactId>
+ <version>${grizzled.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <!-- slf4j backend; configured by src/main/resources/logback.xml. -->
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>${logback.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <!-- Java compiler settings: Java 1.6 source and target level, UTF-8 sources. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ <encoding>UTF-8</encoding>
+ </configuration>
+ </plugin>
+ <!-- Compiles the Scala main and test sources with the Scala version declared in the properties. -->
+ <plugin>
+ <groupId>org.scala-tools</groupId>
+ <artifactId>maven-scala-plugin</artifactId>
+ <version>2.15.2</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>compile</goal>
+ <goal>testCompile</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <!-- NOTE(review): CompileClient looks like the client of the Scala compile server rather than the plain compiler; confirm this is intended. -->
+ <scalaClassName>scala.tools.nsc.CompileClient</scalaClassName>
+ <scalaVersion>${scala.version}</scalaVersion>
+ </configuration>
+ </plugin>
+ <!-- Runs during the package phase and builds a jar-with-dependencies whose manifest main class is twimpact.jsonv.Main, so the tool can be started with java -jar. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>2.2-beta-5</version>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ <archive>
+ <manifest>
+ <mainClass>twimpact.jsonv.Main</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
+
blob - /dev/null
blob + 461b3e4d977e8162c110de52d91184780c6b056f (mode 644)
--- /dev/null
+++ src/main/resources/logback.xml
+<!--
+ ~ Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ -->
+
+<!-- Logback setup for the jsonv command line tool: every log message is printed bare (message text plus newline) on the console. -->
+<configuration debug="false">
+ <!-- Console appender for standard output. It is declared here but not referenced by the root logger below. -->
+ <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+ <target>System.out</target>
+ <encoder>
+ <pattern>%msg%n</pattern>
+ </encoder>
+ </appender>
+
+ <!-- Console appender for standard error; this is the one the root logger uses. -->
+ <appender name="STDERR" class="ch.qos.logback.core.ConsoleAppender">
+ <target>System.err</target>
+ <encoder>
+ <pattern>%msg%n</pattern>
+ </encoder>
+ </appender>
+
+ <!-- Messages at INFO level and above go to STDERR only; presumably this keeps standard output free for the data the tool prints. -->
+ <root level="INFO">
+ <appender-ref ref="STDERR"/>
+ </root>
+</configuration>
\ No newline at end of file
blob - /dev/null
blob + c2cf4c97f8cdd1dce821545b00610cc60446438e (mode 644)
--- /dev/null
+++ src/main/scala/twimpact/jsonv/Main.scala
+/*
+ * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
+ */
+
+package twimpact.jsonv
+
+import collection.JavaConverters._
+import grizzled.slf4j.Logging
+import net.minidev.json.parser.JSONParser
+import net.minidev.json.{JSONArray, JSONObject}
+import java.io._
+import java.util.zip.GZIPInputStream
+
+/**
+ * Command line entry point of jsonv.
+ *
+ * The tool reads a dump file (optionally gzip compressed) line by line and
+ * parses every non-blank line as a JSON document. It has two modes:
+ *
+ *  - `-i`: print the top-level keys of the first line that can be parsed
+ *  - otherwise: print the requested fields of every parsable line, joined
+ *    by the selected separator (tab by default, `;` with `-csv`)
+ *
+ * Extracted data is written to standard output with `println`; all
+ * diagnostics go through the logger.
+ *
+ * @author Matthias L. Jugel
+ */
+
+object Main extends Logging {
+  private val jsonParser = new JSONParser(JSONParser.DEFAULT_PERMISSIVE_MODE)
+
+  /** Key under which the column separator is stored in the settings map. */
+  val SEPARATOR = "sep"
+
+  /** Dump files are decoded as UTF-8 instead of the platform default charset. */
+  private val DumpCharset = "UTF-8"
+
+  /** Command line switches; every other argument counts as positional input. */
+  private val Switches = Set("-i", "-csv", "-tsv")
+
+  /**
+   * Logs the usage text (preceded by `message`, if given) and terminates the JVM.
+   *
+   * The exit status is 1 when an error message was supplied and 0 for a
+   * plain usage request.
+   */
+  private def usage(message: Option[String] = None) {
+    message.foreach(error(_))
+    error("usage: jsonv [-i] <dump>")
+    error(" jsonv [-csv|-tsv] <fields> <dump>")
+    error("")
+    error("Start by looking up the fields info from the dump using -i.")
+    error("Then dump your information in CSV or TSV format by providing the")
+    error("fields (comma separated list with no spaces) and the dump file.")
+    System.exit(if (message.isDefined) 1 else 0)
+  }
+
+  /**
+   * Opens `dump` for line-wise reading. Files whose name ends in "gz" are
+   * decompressed on the fly. The caller is responsible for closing the reader.
+   */
+  private def getFileReader(dump: File): BufferedReader = {
+    val raw = new FileInputStream(dump)
+    try {
+      val in: InputStream = if (dump.getName.endsWith("gz")) new GZIPInputStream(raw) else raw
+      new BufferedReader(new InputStreamReader(in, DumpCharset))
+    } catch {
+      // GZIPInputStream reads the header in its constructor; do not leak the file handle if that fails
+      case e: IOException =>
+        raw.close()
+        throw e
+    }
+  }
+
+  /** Iterates over the trimmed, non-blank lines that are left in `reader`. */
+  private def nonBlankLines(reader: BufferedReader): Iterator[String] =
+    Iterator.continually(reader.readLine).takeWhile(_ != null).map(_.trim).filter(_.length > 0)
+
+  /**
+   * Parses the command line and runs the selected mode.
+   *
+   * The last positional argument is the dump file. Without `-i` the first
+   * positional argument is the comma separated field list. Arguments that
+   * are not one of the known switches are treated as positional input.
+   * When both `-csv` and `-tsv` are given, the last one wins.
+   */
+  def main(args: Array[String]) {
+    info("jsonv - (c) 2012 Matthias L. Jugel")
+    if (args.length == 0) usage()
+
+    val (switches, input) = args.toList.partition(a => Switches.contains(a))
+    val printInfo = switches.contains("-i")
+    val separator = switches.filter(_ != "-i").lastOption match {
+      case Some("-csv") => ";"
+      case _ => "\t"
+    }
+    val settings: Map[String, String] = Map(SEPARATOR -> separator)
+
+    if (input.isEmpty) {
+      usage(Some("error: please provide a dump file name"))
+    }
+    val dump = new File(input.last)
+    if (!dump.exists()) {
+      usage(Some("error: dump file does not exist"))
+    }
+
+    if (printInfo) {
+      printFieldInfo(dump)
+      System.exit(0)
+    } else {
+      // with a single positional argument the dump file name would be taken as the field list
+      if (input.length < 2) {
+        usage(Some("error: please provide the fields to extract"))
+      }
+      dumpFields(dump, input.head.split(",").toList, settings(SEPARATOR))
+    }
+  }
+
+  /** Prints a description of the first parsable line of `dump`; the reader is always closed. */
+  private def printFieldInfo(dump: File) {
+    info("Dumping field information from %s".format(dump))
+    val reader = getFileReader(dump)
+    try {
+      // exists() stops at the first line that could be described
+      nonBlankLines(reader).exists(describe)
+    } finally {
+      reader.close()
+    }
+  }
+
+  /**
+   * Describes one line: the comma separated keys of an object, "@array" for
+   * an array, a log message for anything else.
+   *
+   * @return true if the line could be parsed and described, false otherwise
+   */
+  private def describe(line: String): Boolean =
+    try {
+      jsonParser.parse(line) match {
+        case j: JSONObject =>
+          println(j.keySet.asScala.mkString(","))
+        case j: JSONArray =>
+          println("@array")
+        case j: Object =>
+          info("Only found primitive type in json data: %s".format(j))
+      }
+      true
+    } catch {
+      case e: Exception =>
+        debug("line can't be parsed: %s".format(e.getMessage))
+        false
+    }
+
+  /**
+   * Prints the requested `fields` of every parsable line of `dump`, joined by
+   * `separator`. Line breaks inside values are replaced by a single space.
+   * Lines that cannot be parsed, or that lack one of the fields, are skipped
+   * with a debug message. The reader is always closed.
+   */
+  private def dumpFields(dump: File, fields: List[String], separator: String) {
+    info("dumping fields: %s".format(fields))
+    val wantArrays = fields.contains("@array")
+    // split the dotted paths once instead of once per line
+    val paths = fields.map(f => f.split("\\.").toList)
+    val reader = getFileReader(dump)
+    try {
+      nonBlankLines(reader).foreach { line =>
+        try {
+          jsonParser.parse(line) match {
+            case j: JSONArray if wantArrays =>
+              println(j.toArray.mkString(separator))
+            case j: JSONObject =>
+              println(paths.flatMap(p => value(j, p)).map(_.replaceAll("[\\n\\r]+", " "))
+                .mkString(separator))
+            case j: Object =>
+              info("Only found primitive type in json data: %s".format(j))
+          }
+        } catch {
+          case e: Exception => debug("line can't be parsed: %s".format(e.getMessage))
+        }
+      }
+    } finally {
+      reader.close()
+    }
+  }
+
+  /**
+   * Resolves the path `keys` inside `o`.
+   *
+   * Nested objects are followed key by key. If a non-object is reached
+   * before the path is used up, that value itself is rendered.
+   *
+   * @return the string form of the value, or None for an array addressed
+   *         with "@array" (not supported, logged as an error)
+   * @throws NoSuchElementException if the path is empty, a key is missing,
+   *         or its value is JSON null; the caller skips such records
+   */
+  private def value(o: Object, keys: Seq[String]): Option[String] = o match {
+    case j: JSONObject if keys.length > 1 =>
+      value(j.get(keys.head), keys.drop(1))
+    case _ =>
+      val key = keys.head
+      o match {
+        case null =>
+          throw new NoSuchElementException("no value on the path to field '%s'".format(key))
+        case j: JSONArray if key == "@array" =>
+          error("%s: sub-arrays not supported".format(key))
+          None
+        case j: JSONObject =>
+          val v = j.get(key)
+          if (v == null) throw new NoSuchElementException("no value for field '%s'".format(key))
+          Some(v.toString)
+        case j =>
+          Some(j.toString)
+      }
+  }
+}