Revision: 16883
http://sourceforge.net/p/gate/code/16883
Author: adamfunk
Date: 2013-09-12 15:58:26 +0000 (Thu, 12 Sep 2013)
Log Message:
-----------
WIP on new Twitter population
Modified Paths:
--------------
gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
2013-09-12 14:33:16 UTC (rev 16882)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
2013-09-12 15:58:26 UTC (rev 16883)
@@ -27,6 +27,8 @@
/** Document format for handling JSON tweets: either one
* object {...} or a list [{tweet...}, {tweet...}, ...].
+ *
+ * This format produces one GATE document from one JSON file.
*/
@CreoleResource(name = "GATE JSON Tweet Document Format", isPrivate = true,
autoinstances = {@AutoInstance(hidden = true)})
@@ -35,6 +37,7 @@
private static final long serialVersionUID = 6878020036304333918L;
public static final String TEXT_ATTRIBUTE = "text";
+ public static final String TWEET_ANNOTATION_TYPE = "Tweet";
/** Default construction */
public JSONTweetFormat() { super();}
@@ -95,7 +98,9 @@
AnnotationSet originalMarkups =
doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create Original markups annotations for each tweet
for (Tweet tweet : tweets) {
- originalMarkups.add(tweet.getStart(), tweet.getEnd(), "Tweet",
tweet.getFeatures());
+ FeatureMap features = tweet.getFeatures();
+ features.remove(TEXT_ATTRIBUTE);
+ originalMarkups.add(tweet.getStart(), tweet.getEnd(),
TWEET_ANNOTATION_TYPE, features);
}
}
catch (InvalidOffsetException e) {
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2013-09-12 14:33:16 UTC (rev 16882)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2013-09-12 15:58:26 UTC (rev 16883)
@@ -14,15 +14,12 @@
package gate.corpora.twitter;
import gate.*;
-import gate.util.DocumentFormatException;
-import gate.util.InvalidOffsetException;
+import gate.util.*;
+import gate.corpora.*;
import gate.corpora.JSONTweetFormat;
import java.util.*;
import org.apache.commons.lang.StringEscapeUtils;
-import org.apache.commons.lang.StringUtils;
import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ArrayNode;
// JSON API
@@ -39,6 +36,9 @@
private FeatureMap features;
private long start;
+ public static String PATH_SEPARATOR = ":";
+
+
public int getLength() {
return this.string.length();
}
@@ -51,6 +51,14 @@
return this.features;
}
+  /**
+   * Produces a flattened copy of this tweet's features by delegating to
+   * {@code TweetUtils.flatten}, using {@link #PATH_SEPARATOR} as the
+   * separator between nested key names.
+   *
+   * @return the feature map returned by {@code TweetUtils.flatten}
+   */
+  public FeatureMap getFlattenedFeatures() {
+    final FeatureMap flat = TweetUtils.flatten(this.features, PATH_SEPARATOR);
+    return flat;
+  }
+
+  /**
+   * Flattens this tweet's features and then keeps only the requested keys,
+   * by passing the flattened map to {@code TweetUtils.filterFeatures}.
+   *
+   * @param keepKeys names of the (flattened) features to retain
+   * @return the feature map returned by {@code TweetUtils.filterFeatures}
+   */
+  public FeatureMap getFilteredFeatures(Collection<String> keepKeys) {
+    final FeatureMap flat = this.getFlattenedFeatures();
+    return TweetUtils.filterFeatures(flat, keepKeys);
+  }
+
public void setStart(long start) {
this.start = start;
}
@@ -71,12 +79,11 @@
while (keys.hasNext()) {
String key = keys.next();
+ features.put(key.toString(), process(json.get(key)));
+
if (key.equals(JSONTweetFormat.TEXT_ATTRIBUTE)) {
string = StringEscapeUtils.unescapeHtml(json.get(key).asText());
}
- else {
- features.put(key.toString(), process(json.get(key)));
- }
}
}
@@ -87,7 +94,8 @@
}
- private Object process(JsonNode node) {
+
+ private static Object process(JsonNode node) {
/* JSON types: number, string, boolean, array, object (dict/map),
* null. All map keys are strings.
*/
@@ -131,8 +139,25 @@
}
- //public DocumentImpl toDocument(List<String> keepFeatures, FeatureMap
contentItems) {
- //}
+
+// public Document toDocument(List<String> keepFeatures, FeatureMap
contentItems) throws GateException {
+// FeatureMap parameters = Factory.newFeatureMap();
+// parameters.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, "");
+// Document doc = (Document)
Factory.createResource(DocumentImpl.class.getName(), parameters);
+// //doc.setSourceUrl(sourceUrl);
+//
+// // this is wrong: we need various strings with content annotations over
them
+// DocumentContent newContent= new DocumentContentImpl(this.getString());
+// doc.setContent(newContent);
+// AnnotationSet originalMarkups =
doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
+//
+// originalMarkups.add(0L, newContent.size(),
JSONTweetFormat.TWEET_ANNOTATION_TYPE, Factory.newFeatureMap());
+//
+// // TODO: copy all the keepFeatures
+//
+// return doc;
+// }
+
}
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
2013-09-12 14:33:16 UTC (rev 16882)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
2013-09-12 15:58:26 UTC (rev 16883)
@@ -14,14 +14,8 @@
package gate.corpora.twitter;
import gate.*;
-import gate.creole.ResourceInstantiationException;
-import gate.util.DocumentFormatException;
-import gate.util.InvalidOffsetException;
-import gate.corpora.*;
import java.io.IOException;
import java.util.*;
-import org.apache.commons.lang.StringEscapeUtils;
-import org.apache.commons.lang.StringUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
@@ -77,5 +71,40 @@
}
+
+ public static FeatureMap filterFeatures(FeatureMap source,
Collection<String> keep) {
+ FeatureMap result = Factory.newFeatureMap();
+ for (Object key : source.keySet()) {
+ if (keep.contains(key.toString())) {
+ result.put(key, source.get(key));
+ }
+ }
+ return result;
+ }
+
+
+  /**
+   * Flattens a feature map by delegating to the three-argument
+   * {@code flatten} overload with an empty initial key prefix.
+   *
+   * @param features  the feature map to flatten
+   * @param separator the string placed between nested key names
+   * @return the feature map produced by the recursive overload
+   */
+  public static FeatureMap flatten(FeatureMap features, String separator) {
+    final String noPrefix = "";
+    return flatten(features, noPrefix, separator);
+  }
+
+
+  /**
+   * Recursively flattens {@code map} into a single-level feature map.
+   * Each key is converted with {@code toString()} and prepended with
+   * {@code prefix}. A value that is itself a {@link Map} is flattened in
+   * turn, with the current flat key plus {@code separator} as the new
+   * prefix; any other value is stored under the flat key.
+   *
+   * @param map       the (possibly nested) map to flatten
+   * @param prefix    text prepended to every key at this level
+   * @param separator the string placed between nested key names
+   * @return a new feature map containing the flattened entries
+   */
+  private static FeatureMap flatten(Map<?, ?> map, String prefix, String separator) {
+    FeatureMap flattened = Factory.newFeatureMap();
+    for (Map.Entry<?, ?> entry : map.entrySet()) {
+      String flatKey = prefix + entry.getKey().toString();
+      // Use the entry's value: the previous code read map.keySet() here, so
+      // every key was mapped to the key set and nested maps were never
+      // recursed into.
+      Object value = entry.getValue();
+      if (value instanceof Map) {
+        flattened.putAll(flatten((Map<?, ?>) value, flatKey + separator, separator));
+      }
+      else {
+        flattened.put(flatKey, value);
+      }
+    }
+    return flattened;
+  }
+
+
+
+
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
How ServiceNow helps IT people transform IT departments:
1. Consolidate legacy IT systems to a single system of record for IT
2. Standardize and globalize service processes across IT
3. Implement zero-touch automation to replace manual, redundant tasks
http://pubads.g.doubleclick.net/gampad/clk?id=51271111&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs