Revision: 16881
http://sourceforge.net/p/gate/code/16881
Author: adamfunk
Date: 2013-09-12 14:31:26 +0000 (Thu, 12 Sep 2013)
Log Message:
-----------
Splitting functionality & modelling up to allow for expansion
Added Paths:
-----------
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/JSONTweetFormat.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
Removed Paths:
-------------
gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
Deleted: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java	2013-09-12 12:50:30 UTC (rev 16880)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java	2013-09-12 14:31:26 UTC (rev 16881)
@@ -1,269 +0,0 @@
-/*
- * JSONTweetFormat.java
- *
- * Copyright (c) 1995-2013, The University of Sheffield. See the file
- * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
- *
- * This file is part of GATE (see http://gate.ac.uk/), and is free
- * software, licenced under the GNU Library General Public License,
- * Version 2, June 1991 (in the distribution as file licence.html,
- * and also available at http://gate.ac.uk/gate/licence.html).
- */
-package gate.corpora;
-
-import gate.AnnotationSet;
-import gate.DocumentContent;
-import gate.Factory;
-import gate.FeatureMap;
-import gate.GateConstants;
-import gate.Resource;
-import gate.creole.ResourceInstantiationException;
-import gate.creole.metadata.AutoInstance;
-import gate.creole.metadata.CreoleResource;
-import gate.util.DocumentFormatException;
-import gate.util.InvalidOffsetException;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.lang.StringEscapeUtils;
-import org.apache.commons.lang.StringUtils;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ArrayNode;
-
-
-// JSON API
-// http://json-lib.sourceforge.net/apidocs/jdk15/index.html
-// Jackson API
-// http://wiki.fasterxml.com/JacksonHome
-
-// Standard: RFC 4627
-// https://tools.ietf.org/html/rfc4627
-
-/** Document format for handling JSON tweets: either one
- * object {...} or a list [{tweet...}, {tweet...}, ...].
- */
-@CreoleResource(name = "GATE JSON Tweet Document Format", isPrivate = true,
- autoinstances = {@AutoInstance(hidden = true)})
-
-public class JSONTweetFormat extends TextualDocumentFormat {
- private static final long serialVersionUID = 6878020036304333918L;
-
- public static final String TEXT_ATTRIBUTE = "text";
-
- /** Default construction */
- public JSONTweetFormat() { super();}
-
- /** Initialise this resource, and return it. */
- public Resource init() throws ResourceInstantiationException{
- // Register ad hoc MIME-type
- // There is an application/json mime type, but I don't think
- // we want everything to be handled this way?
- MimeType mime = new MimeType("text","x-json-twitter");
- // Register the class handler for this MIME-type
- mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), this);
- // Register the mime type with string
- mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
- // Register file suffixes for this mime type
- suffixes2mimeTypeMap.put("json", mime);
- // Register magic numbers for this mime type
- //magic2mimeTypeMap.put("Subject:",mime);
- // Set the mimeType for this language resource
- setMimeType(mime);
- return this;
- }
-
- @Override
- public void cleanup() {
- super.cleanup();
-
- MimeType mime = getMimeType();
-
- mimeString2ClassHandlerMap.remove(mime.getType()+ "/" + mime.getSubtype());
- mimeString2mimeTypeMap.remove(mime.getType() + "/" + mime.getSubtype());
- suffixes2mimeTypeMap.remove("json");
- }
-
- @Override
- public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
- if ( (doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null) ) {
- throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
- }
-
- setNewLineProperty(doc);
- String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
- try {
- // Parse the String
- List<Tweet> tweets = readTweets(jsonString);
-
- // Put them all together to make the unpacked document content
- StringBuilder concatenation = new StringBuilder();
- for (Tweet tweet : tweets) {
- tweet.setStart(concatenation.length());
- concatenation.append(tweet.getString()).append("\n\n");
- }
-
- // Set new document content
- DocumentContent newContent = new DocumentContentImpl(concatenation.toString());
- doc.edit(0L, doc.getContent().size(), newContent);
-
- AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
- // Create Original markups annotations for each tweet
- for (Tweet tweet : tweets) {
- originalMarkups.add(tweet.getStart(), tweet.getEnd(), "Tweet", tweet.getFeatures());
- }
- }
- catch (InvalidOffsetException e) {
- throw new DocumentFormatException(e);
- }
- catch(IOException e) {
- throw new DocumentFormatException(e);
- }
- }
-
-
-
- private List<Tweet> readTweetList(String string) throws IOException {
- ObjectMapper mapper = new ObjectMapper();
- List<Tweet> tweets = new ArrayList<Tweet>();
- ArrayNode jarray = (ArrayNode) mapper.readTree(string);
- for (JsonNode jnode : jarray) {
- tweets.add(new Tweet(jnode));
- }
- return tweets;
- }
-
-
- private List<Tweet> readTweets(String string) throws IOException {
- if (string.startsWith("[")) {
- return readTweetList(string);
- }
-
- // implied else
- return readTweetLines(string);
- }
-
-
- private List<Tweet>readTweetLines(String string) throws IOException {
- ObjectMapper mapper = new ObjectMapper();
-
- List<Tweet> tweets = new ArrayList<Tweet>();
-
- // just not null, so we can use it in the loop
- String[] lines = string.split("[\\n\\r]+");
- for (String line : lines) {
- if (line.length() > 0) {
- JsonNode jnode = mapper.readTree(line);
- tweets.add(new Tweet(jnode));
- }
- }
-
- return tweets;
- }
-
-}
-
-
-class Tweet {
- private String string;
- private FeatureMap features;
- private long start;
-
- public int getLength() {
- return this.string.length();
- }
-
- public String getString() {
- return this.string;
- }
-
- public FeatureMap getFeatures() {
- return this.features;
- }
-
- public void setStart(long start) {
- this.start = start;
- }
-
- public long getStart() {
- return this.start;
- }
-
- public long getEnd() {
- return this.start + this.string.length();
- }
-
-
- public Tweet(JsonNode json) {
- string = "";
- Iterator<String> keys = json.fieldNames();
- features = Factory.newFeatureMap();
-
- while (keys.hasNext()) {
- String key = keys.next();
- if (key.equals("text")) {
- string = StringEscapeUtils.unescapeHtml(json.get(key).asText());
- }
- else {
- features.put(key.toString(), process(json.get(key)));
- }
- }
- }
-
-
- public Tweet() {
- string = "";
- features = Factory.newFeatureMap();
- }
-
-
- private Object process(JsonNode node) {
- /* JSON types: number, string, boolean, array, object (dict/map),
- * null. All map keys are strings.
- */
-
- if (node.isBoolean()) {
- return node.asBoolean();
- }
- if (node.isDouble()) {
- return node.asDouble();
- }
- if (node.isInt()) {
- return node.asInt();
- }
- if (node.isTextual()) {
- return node.asText();
- }
-
- if (node.isNull()) {
- return null;
- }
-
- if (node.isArray()) {
- List<Object> list = new ArrayList<Object>();
- for (JsonNode item : node) {
- list.add(process(item));
- }
- return list;
- }
-
- if (node.isObject()) {
- Map<String, Object> map = new HashMap<String, Object>();
- Iterator<String> keys = node.fieldNames();
- while (keys.hasNext()) {
- String key = keys.next();
- map.put(key, process(node.get(key)));
- }
- return map;
- }
-
- return node.toString();
- }
-
-}
Copied:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/JSONTweetFormat.java (from
rev 16878, gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java)
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/JSONTweetFormat.java	(rev 0)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/JSONTweetFormat.java	2013-09-12 14:31:26 UTC (rev 16881)
@@ -0,0 +1,141 @@
+/*
+ * JSONTweetFormat.java
+ *
+ * Copyright (c) 1995-2013, The University of Sheffield. See the file
+ * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ * This file is part of GATE (see http://gate.ac.uk/), and is free
+ * software, licenced under the GNU Library General Public License,
+ * Version 2, June 1991 (in the distribution as file licence.html,
+ * and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ * $Id$
+ */
+package gate.corpora.twitter;
+
+import gate.*;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleResource;
+import gate.util.DocumentFormatException;
+import gate.util.InvalidOffsetException;
+import gate.corpora.*;
+import java.io.IOException;
+import java.util.*;
+import org.apache.commons.lang.StringUtils;
+
+
+/**
+ * Document format for handling JSON tweets: either JSON objects {...},
+ * one per line, or a single list [{tweet...}, {tweet...}, ...].
+ * <p>
+ * Unpacking replaces the document content with the text of each tweet,
+ * followed by a blank line, and adds one "Tweet" annotation per tweet to
+ * the Original markups annotation set.
+ */
+@CreoleResource(name = "GATE JSON Tweet Document Format", isPrivate = true,
+    autoinstances = {@AutoInstance(hidden = true)})
+public class JSONTweetFormat extends TextualDocumentFormat {
+  private static final long serialVersionUID = 6878020036304333918L;
+
+  /** Name of the JSON field that holds the text of a tweet. */
+  public static final String TEXT_ATTRIBUTE = "text";
+
+  /** Default construction */
+  public JSONTweetFormat() {
+    super();
+  }
+
+  /**
+   * Registers this format as the handler for the ad hoc MIME type
+   * text/x-json-twitter and for the "json" file suffix.
+   *
+   * @return this resource
+   */
+  @Override
+  public Resource init() throws ResourceInstantiationException {
+    // Register ad hoc MIME-type
+    // There is an application/json mime type, but I don't think
+    // we want everything to be handled this way?
+    MimeType mime = new MimeType("text", "x-json-twitter");
+    String mimeString = mime.getType() + "/" + mime.getSubtype();
+    // Register the class handler for this MIME-type
+    mimeString2ClassHandlerMap.put(mimeString, this);
+    // Register the mime type with string
+    mimeString2mimeTypeMap.put(mimeString, mime);
+    // Register file suffixes for this mime type
+    suffixes2mimeTypeMap.put("json", mime);
+    // Register magic numbers for this mime type
+    //magic2mimeTypeMap.put("Subject:",mime);
+    // Set the mimeType for this language resource
+    setMimeType(mime);
+    return this;
+  }
+
+  /** Removes the registrations made by {@link #init()}. */
+  @Override
+  public void cleanup() {
+    super.cleanup();
+
+    MimeType mime = getMimeType();
+    String mimeString = mime.getType() + "/" + mime.getSubtype();
+
+    mimeString2ClassHandlerMap.remove(mimeString);
+    mimeString2mimeTypeMap.remove(mimeString);
+    suffixes2mimeTypeMap.remove("json");
+  }
+
+  /**
+   * Replaces the JSON content of the document with the concatenated tweet
+   * texts and annotates each tweet in the Original markups set.
+   *
+   * @param doc the document whose content is the JSON to unpack
+   * @throws DocumentFormatException if the document or its content is
+   *         null, if the JSON cannot be read, or if an annotation offset
+   *         is rejected
+   */
+  @Override
+  public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
+    // The content is dereferenced below, so it must be present even when
+    // the document has a source URL.
+    if (doc == null || doc.getContent() == null) {
+      throw new DocumentFormatException(
+          "GATE document is null or no content found. Nothing to parse!");
+    }
+
+    setNewLineProperty(doc);
+    String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
+    try {
+      // Parse the String
+      List<Tweet> tweets = TweetUtils.readTweets(jsonString);
+
+      // Put them all together to make the unpacked document content,
+      // recording where each tweet starts in the new content
+      StringBuilder concatenation = new StringBuilder();
+      for (Tweet tweet : tweets) {
+        tweet.setStart(concatenation.length());
+        concatenation.append(tweet.getString()).append("\n\n");
+      }
+
+      // Set new document content
+      DocumentContent newContent =
+          new DocumentContentImpl(concatenation.toString());
+      doc.edit(0L, doc.getContent().size(), newContent);
+
+      AnnotationSet originalMarkups =
+          doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
+      // Create Original markups annotations for each tweet
+      for (Tweet tweet : tweets) {
+        originalMarkups.add(tweet.getStart(), tweet.getEnd(), "Tweet",
+            tweet.getFeatures());
+      }
+    }
+    catch (InvalidOffsetException e) {
+      throw new DocumentFormatException(e);
+    }
+    catch (IOException e) {
+      throw new DocumentFormatException(e);
+    }
+  }
+
+}
Copied: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java (from
rev 16878, gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java)
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java	(rev 0)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java	2013-09-12 14:31:26 UTC (rev 16881)
@@ -0,0 +1,174 @@
+/*
+ * Tweet.java
+ *
+ * Copyright (c) 1995-2013, The University of Sheffield. See the file
+ * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ * This file is part of GATE (see http://gate.ac.uk/), and is free
+ * software, licenced under the GNU Library General Public License,
+ * Version 2, June 1991 (in the distribution as file licence.html,
+ * and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ * $Id$
+ */
+package gate.corpora.twitter;
+
+import gate.*;
+import gate.util.DocumentFormatException;
+import gate.util.InvalidOffsetException;
+import java.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.lang.StringUtils;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+
+
+// JSON API
+// http://json-lib.sourceforge.net/apidocs/jdk15/index.html
+// Jackson API
+// http://wiki.fasterxml.com/JacksonHome
+
+// Standard: RFC 4627
+// https://tools.ietf.org/html/rfc4627
+
+
+/**
+ * One tweet read from JSON: its text, the remaining JSON fields as GATE
+ * features, and the offset at which the text starts in the unpacked
+ * document.
+ */
+public class Tweet {
+  private String string;
+  private FeatureMap features;
+  private long start;
+
+  /** Returns the number of characters in the tweet text. */
+  public int getLength() {
+    return this.string.length();
+  }
+
+  /** Returns the tweet text (empty if the JSON had no text field). */
+  public String getString() {
+    return this.string;
+  }
+
+  /** Returns the features built from the JSON fields other than the text. */
+  public FeatureMap getFeatures() {
+    return this.features;
+  }
+
+  /** Sets the offset of the first character of this tweet. */
+  public void setStart(long start) {
+    this.start = start;
+  }
+
+  /** Returns the offset of the first character of this tweet. */
+  public long getStart() {
+    return this.start;
+  }
+
+  /** Returns the start offset plus the length of the tweet text. */
+  public long getEnd() {
+    return this.start + this.string.length();
+  }
+
+
+  /**
+   * Builds a tweet from a JSON object.  The value of the
+   * {@link JSONTweetFormat#TEXT_ATTRIBUTE} field, with HTML entities
+   * unescaped, becomes the tweet text; every other field becomes a feature.
+   *
+   * @param json the JSON object describing the tweet
+   */
+  public Tweet(JsonNode json) {
+    string = "";
+    features = Factory.newFeatureMap();
+
+    Iterator<String> keys = json.fieldNames();
+    while (keys.hasNext()) {
+      String key = keys.next();
+      if (key.equals(JSONTweetFormat.TEXT_ATTRIBUTE)) {
+        string = StringEscapeUtils.unescapeHtml(json.get(key).asText());
+      }
+      else {
+        features.put(key, process(json.get(key)));
+      }
+    }
+  }
+
+
+  /** Builds an empty tweet: no text and no features. */
+  public Tweet() {
+    string = "";
+    features = Factory.newFeatureMap();
+  }
+
+
+  /**
+   * Converts a JSON value to a plain Java object: Boolean, Integer, Long,
+   * Double or another Number, String, null, List (for arrays) or Map with
+   * String keys (for objects), converting nested values recursively.
+   *
+   * @param node the JSON value to convert
+   * @return the converted value, or the node's string form if it is none
+   *         of the types above
+   */
+  private Object process(JsonNode node) {
+    /* JSON types: number, string, boolean, array, object (dict/map),
+     * null. All map keys are strings.
+     */
+
+    if (node.isBoolean()) {
+      return node.asBoolean();
+    }
+    if (node.isDouble()) {
+      return node.asDouble();
+    }
+    if (node.isInt()) {
+      return node.asInt();
+    }
+    if (node.isLong()) {
+      // Values too large for an int (e.g. large numeric ids) used to
+      // fall through to toString() and were stored as Strings.
+      return node.asLong();
+    }
+    if (node.isNumber()) {
+      // Remaining numeric node types (e.g. big integers and decimals)
+      return node.numberValue();
+    }
+    if (node.isTextual()) {
+      return node.asText();
+    }
+
+    if (node.isNull()) {
+      return null;
+    }
+
+    if (node.isArray()) {
+      List<Object> list = new ArrayList<Object>();
+      for (JsonNode item : node) {
+        list.add(process(item));
+      }
+      return list;
+    }
+
+    if (node.isObject()) {
+      Map<String, Object> map = new HashMap<String, Object>();
+      Iterator<String> keys = node.fieldNames();
+      while (keys.hasNext()) {
+        String key = keys.next();
+        map.put(key, process(node.get(key)));
+      }
+      return map;
+    }
+
+    return node.toString();
+  }
+
+
+  //public DocumentImpl toDocument(List<String> keepFeatures, FeatureMap contentItems) {
+  //}
+
+
+}
Copied: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
(from rev 16878,
gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java)
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java	(rev 0)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java	2013-09-12 14:31:26 UTC (rev 16881)
@@ -0,0 +1,112 @@
+/*
+ * TweetUtils.java
+ *
+ * Copyright (c) 1995-2013, The University of Sheffield. See the file
+ * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ * This file is part of GATE (see http://gate.ac.uk/), and is free
+ * software, licenced under the GNU Library General Public License,
+ * Version 2, June 1991 (in the distribution as file licence.html,
+ * and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ * $Id$
+ */
+package gate.corpora.twitter;
+
+import gate.*;
+import gate.creole.ResourceInstantiationException;
+import gate.util.DocumentFormatException;
+import gate.util.InvalidOffsetException;
+import gate.corpora.*;
+import java.io.IOException;
+import java.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.lang.StringUtils;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+
+
+// JSON API
+// http://json-lib.sourceforge.net/apidocs/jdk15/index.html
+// Jackson API
+// http://wiki.fasterxml.com/JacksonHome
+
+// Standard: RFC 4627
+// https://tools.ietf.org/html/rfc4627
+
+/** Static helpers that parse JSON text into {@link Tweet} objects. */
+public class TweetUtils {
+
+  /** Shared parser; it is never reconfigured after construction. */
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+
+
+  /**
+   * Reads tweets from a single JSON array [{tweet...}, {tweet...}, ...].
+   *
+   * @param string the JSON text of the array
+   * @return one Tweet per array element, in order
+   * @throws IOException if the text cannot be parsed or its root is not
+   *         an array
+   */
+  public static List<Tweet> readTweetList(String string) throws IOException {
+    List<Tweet> tweets = new ArrayList<Tweet>();
+    JsonNode root = MAPPER.readTree(string);
+    if (root == null || root.isMissingNode()) {
+      // No JSON value at all, so there are no tweets
+      return tweets;
+    }
+    if (!root.isArray()) {
+      // Report bad input as an IOException, not a ClassCastException
+      throw new IOException("Expected a JSON array of tweets");
+    }
+    for (JsonNode jnode : root) {
+      tweets.add(new Tweet(jnode));
+    }
+    return tweets;
+  }
+
+
+  /**
+   * Reads tweets from JSON text.  Text starting with "[" is read as a
+   * single array of tweets; anything else is read line by line.
+   *
+   * @param string the JSON text
+   * @return the tweets found, in order
+   * @throws IOException if the text cannot be parsed
+   */
+  public static List<Tweet> readTweets(String string) throws IOException {
+    if (string.startsWith("[")) {
+      return readTweetList(string);
+    }
+
+    // implied else
+    return readTweetLines(string);
+  }
+
+
+  /**
+   * Reads tweets from text holding one JSON object per line.
+   *
+   * @param string the JSON text
+   * @return one Tweet per non-blank line, in order
+   * @throws IOException if a line cannot be parsed
+   */
+  public static List<Tweet> readTweetLines(String string) throws IOException {
+    List<Tweet> tweets = new ArrayList<Tweet>();
+
+    String[] lines = string.split("[\\n\\r]+");
+    for (String line : lines) {
+      // Skip whitespace-only lines: they hold no JSON value to build a
+      // tweet from
+      if (line.trim().length() > 0) {
+        JsonNode jnode = MAPPER.readTree(line);
+        tweets.add(new Tweet(jnode));
+      }
+    }
+
+    return tweets;
+  }
+
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
How ServiceNow helps IT people transform IT departments:
1. Consolidate legacy IT systems to a single system of record for IT
2. Standardize and globalize service processes across IT
3. Implement zero-touch automation to replace manual, redundant tasks
http://pubads.g.doubleclick.net/gampad/clk?id=51271111&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs