commit 56f7be9a447392598b27296e90f5ba238f2a1854 from: Matthias L. Jugel date: Sat Feb 25 22:17:10 2012 UTC initial commit commit - /dev/null commit + 56f7be9a447392598b27296e90f5ba238f2a1854 blob - /dev/null blob + 0f2b988452354af4483aba16eaf0e2772b2fc318 (mode 644) --- /dev/null +++ README @@ -0,0 +1,12 @@ +jsonv + +We are receiving data from the Twitter Streaming API and sometimes save it for later analysis. +To extract readable information from those files I've written this little tool. + +* To extract info on the contents (displays json keys): + + java -jar jsonv.jar -i dumpfile.json.gz + +* To extract actual values in tab separated form: + + java -jar jsonv.jar created_at,user.screen_name,text dumpfile.json.gz \ No newline at end of file blob - /dev/null blob + 2c0ecb372a38f1b492cf6eb1f4eb571f9783f12f (mode 644) --- /dev/null +++ pom.xml @@ -0,0 +1,123 @@ + + + + 4.0.0 + twimpact + jsonv + 1.0 + JSON File Viewer (Twitter Dumps) + + 2012 + + UTF-8 + 2.9.0-1 + 1.0.9 + 0.6.6 + 0.9.29 + 1.6.2 + + + + + scala-tools.org + Scala-Tools Maven2 Repository + http://scala-tools.org/repo-releases + + + + + + scala-tools.org + Scala-Tools Maven2 Repository + http://scala-tools.org/repo-releases + + + + + + net.minidev + json-smart + ${json-smart.version} + + + org.scala-lang + scala-library + ${scala.version} + + + org.clapper + grizzled-slf4j_2.9.0 + ${grizzled.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + + + ch.qos.logback + logback-classic + ${logback.version} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + UTF-8 + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + + + compile + testCompile + + + + + scala.tools.nsc.CompileClient + ${scala.version} + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.2-beta-5 + + + jar-with-dependencies + + + + twimpact.jsonv.Main + + + + + + package + + single + + + + + + + + blob - /dev/null blob + 461b3e4d977e8162c110de52d91184780c6b056f (mode 644) --- /dev/null +++ 
// src/main/resources/logback.xml @@ -0,0 +1,23 @@ + + + + + System.out + + %msg%n + + + + + System.err + + %msg%n + + + + + + + \ No newline at end of file blob - /dev/null blob + c2cf4c97f8cdd1dce821545b00610cc60446438e (mode 644) --- /dev/null +++ src/main/scala/twimpact/jsonv/Main.scala @@ -0,0 +1,148 @@
/*
 * Copyright (c) 2012 TWIMPACT UG (haftungsbeschraenkt). All rights reserved.
 */

package twimpact.jsonv

import collection.JavaConverters._
import grizzled.slf4j.Logging
import net.minidev.json.parser.JSONParser
import net.minidev.json.{JSONArray, JSONObject}
import java.io._
import java.util.zip.GZIPInputStream

/**
 * Command line viewer for line-oriented JSON dump files (one JSON document
 * per line, optionally gzip-compressed).
 *
 * Two modes are supported:
 *  - `-i <dump-file>`: print the top-level keys of the first parseable
 *    document and exit;
 *  - `[-csv|-tsv] <fields> <dump-file>`: print the selected fields of every
 *    document, one record per line, joined by the chosen separator.
 *
 * Fields are given as a comma separated list; nested values are addressed
 * with dots (e.g. `user.screen_name`).
 *
 * @author Matthias L. Jugel
 */

object Main extends Logging {
  private val jsonParser = new JSONParser(JSONParser.DEFAULT_PERMISSIVE_MODE)

  /** Key under which the column separator is stored in the settings map. */
  val SEPARATOR = "sep"

  /**
   * Parsed command line.
   *
   * @param printInfo true when `-i` was given
   * @param settings  output settings, currently only the [[SEPARATOR]] entry
   * @param input     positional arguments in the order they were given
   */
  private case class Options(printInfo: Boolean = false,
                             settings: Map[String, String] = Map(SEPARATOR -> "\t"),
                             input: List[String] = Nil)

  /**
   * Prints the usage text through the error logger and terminates the JVM.
   *
   * @param message optional error message printed before the usage text;
   *                when present the process exits with status 1, otherwise 0
   */
  private def usage(message: Option[String] = None): Unit = {
    message.foreach(error(_))
    error("usage: jsonv -i <dump-file>")
    error("       jsonv [-csv|-tsv] <fields> <dump-file>")
    error("")
    error("Start by looking up the fields info from the dump using -i.")
    error("Then dump your information in CSV or TSV format by providing the")
    error("fields (comma separated list with no spaces) and the dump file.")
    System.exit(if (message.isDefined) 1 else 0)
  }

  /**
   * Opens the dump file for line-wise reading, transparently decompressing
   * files whose name ends in "gz".
   *
   * The content is decoded as UTF-8 rather than with the platform default
   * charset so the result does not depend on the machine's locale.
   *
   * @param dump the file to read
   * @return a buffered reader; the caller is responsible for closing it
   */
  private def getFileReader(dump: File): BufferedReader = {
    val raw = new FileInputStream(dump)
    try {
      val in: InputStream = if (dump.getName.endsWith("gz")) new GZIPInputStream(raw) else raw
      new BufferedReader(new InputStreamReader(in, "UTF-8"))
    } catch {
      case e: Exception =>
        // GZIPInputStream reads the header eagerly; do not leak the file handle if that fails
        raw.close()
        throw e
    }
  }

  /** Folds the raw arguments into an [[Options]] value, keeping positional arguments in order. */
  private def parseArgs(args: Array[String]): Options = {
    val parsed = args.foldLeft(Options()) {
      (opts, arg) => arg match {
        case "-i" => opts.copy(printInfo = true)
        case "-csv" => opts.copy(settings = opts.settings + (SEPARATOR -> ";"))
        case "-tsv" => opts.copy(settings = opts.settings + (SEPARATOR -> "\t"))
        case other => opts.copy(input = other :: opts.input)
      }
    }
    parsed.copy(input = parsed.input.reverse)
  }

  /** Lazily yields the trimmed, non-empty lines of the reader until end of stream. */
  private def nonEmptyLines(reader: BufferedReader): Iterator[String] =
    Iterator.continually(reader.readLine()).takeWhile(_ != null).map(_.trim).filter(_.length > 0)

  /**
   * Tries to parse one line and print a description of its top-level structure.
   *
   * @return true if the line was parsed and described, false if the next line should be tried
   */
  private def describe(line: String): Boolean = {
    try {
      jsonParser.parse(line) match {
        case j: JSONObject =>
          println(j.keySet.asScala.mkString(","))
          true
        case j: JSONArray =>
          println("@array")
          true
        case null =>
          debug("line can't be parsed: null document")
          false
        case other =>
          info("Only found primitive type in json data: %s".format(other))
          true
      }
    } catch {
      case e: Exception =>
        debug("line can't be parsed: %s".format(e.getMessage))
        false
    }
  }

  /** Info mode: describes the first parseable document of the dump. */
  private def printFieldInfo(dump: File): Unit = {
    info("Dumping field information from %s".format(dump))
    val reader = getFileReader(dump)
    try {
      // find stops at the first line that could be described
      nonEmptyLines(reader).find(describe)
    } finally {
      reader.close()
    }
  }

  /**
   * Dump mode: prints the requested fields of every document in the dump.
   *
   * A field that is missing from a document yields an empty column so that
   * columns stay aligned across records.
   */
  private def dumpFields(dump: File, fields: List[String], separator: String): Unit = {
    info("dumping fields: %s".format(fields))
    // split the dotted paths once instead of once per record
    val paths = fields.map(_.split("\\.").toList)
    val wantsArrays = fields.contains("@array")
    val reader = getFileReader(dump)
    try {
      nonEmptyLines(reader).foreach { line =>
        try {
          jsonParser.parse(line) match {
            case j: JSONArray if (wantsArrays) =>
              println(j.toArray.mkString(separator))
            case j: JSONObject =>
              println(paths
                .map(path => value(j, path).getOrElse("").replaceAll("[\\n\\r]+", " "))
                .mkString(separator))
            case null =>
              debug("line can't be parsed: null document")
            case other =>
              info("Only found primitive type in json data: %s".format(other))
          }
        } catch {
          // best effort: a broken record must not abort the whole dump
          case e: Exception => debug("line can't be parsed: %s".format(e.getMessage))
        }
      }
    } finally {
      reader.close()
    }
  }

  /**
   * Entry point; see the object documentation for the supported arguments.
   *
   * @param args command line arguments
   */
  def main(args: Array[String]): Unit = {
    info("jsonv - (c) 2012 Matthias L. Jugel")
    if (args.length == 0) usage()

    val opts = parseArgs(args)
    opts.input.lastOption match {
      case None =>
        usage(Some("error: please provide a dump file name"))
      case Some(name) =>
        val dump = new File(name)
        if (!dump.exists()) {
          usage(Some("error: dump file does not exist"))
        } else if (opts.printInfo) {
          printFieldInfo(dump)
          System.exit(0)
        } else if (opts.input.length < 2) {
          // with a single positional argument the file name would be mistaken for the field list
          usage(Some("error: please provide the fields to dump"))
        } else {
          dumpFields(dump, opts.input.head.split(",").toList, opts.settings(SEPARATOR))
        }
    }
  }

  /**
   * Resolves a dotted key path against a parsed JSON value.
   *
   * @param o    the JSON value to look into (may be null for a missing intermediate value)
   * @param keys the remaining path segments
   * @return the string form of the addressed value, or None when the value is
   *         missing, null, or a sub-array addressed with "@array"
   */
  private def value(o: Object, keys: Seq[String]): Option[String] = o match {
    case null =>
      None
    case _ if keys.isEmpty =>
      None
    case j: JSONObject if keys.length > 1 =>
      value(j.get(keys.head), keys.drop(1))
    case j: JSONArray if (keys.head == "@array") =>
      error("%s: sub-arrays not supported".format(keys.head))
      None
    case j: JSONObject =>
      // get returns null for absent keys; Option turns that into None
      Option(j.get(keys.head)).map(_.toString)
    case other =>
      Some(other.toString)
  }
}