Revision: 18420
          http://sourceforge.net/p/gate/code/18420
Author:   ian_roberts
Date:     2014-10-30 19:26:45 +0000 (Thu, 30 Oct 2014)
Log Message:
-----------
Create annotations from the "entities" property of Twitter JSON

In Twitter JSON the property "entities" is a map from entity type to list of
entities, where each entity is an object with an "indices" property giving its
offsets and other properties giving features.  I've added support for turning
this structure into annotations in the Original markups set.

The DocumentFormat will have this behaviour enabled by default; the populator
currently does not (principle of least surprise - I'll enable it once I've
added a checkbox to the GUI to control it).

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 
2014-10-30 18:42:13 UTC (rev 18419)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 
2014-10-30 19:26:45 UTC (rev 18420)
@@ -75,7 +75,8 @@
       input = inputUrl.openStream();
       
       // TODO Detect & handle gzipped input.
-      TweetStreamIterator tweetSource = new TweetStreamIterator(input, 
contentKeys, featureKeys, false);
+      // TODO handling of entities, once there's GUI to control it
+      TweetStreamIterator tweetSource = new TweetStreamIterator(input, 
contentKeys, featureKeys, false, false);
 
       int tweetCounter = 0;
       int tweetDocCounter = 0;

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      
2014-10-30 18:42:13 UTC (rev 18419)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      
2014-10-30 19:26:45 UTC (rev 18420)
@@ -18,9 +18,11 @@
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import org.apache.commons.lang.StringEscapeUtils;
 import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 
 
 public class Tweet {
@@ -51,12 +53,16 @@
   
   
   public static Tweet readTweet(JsonNode json, List<String> contentKeys, 
List<String> featureKeys) {
+    return readTweet(json, contentKeys, featureKeys, true);
+  }
+  
+  public static Tweet readTweet(JsonNode json, List<String> contentKeys, 
List<String> featureKeys, boolean handleEntities) {
     if ( (contentKeys == null) || (featureKeys == null) ) {
-      return new Tweet(json);
+      return new Tweet(json, handleEntities);
     }
 
     // implied else
-    return new Tweet(json, contentKeys, featureKeys);
+    return new Tweet(json, contentKeys, featureKeys, handleEntities);
   }
 
 
@@ -64,7 +70,7 @@
    * Used by the JSONTWeetFormat; the DocumentContent contains only the main 
text;
    * the annotation feature map contains all the other JSON data, recursively.
    */
-  private Tweet(JsonNode json) {
+  private Tweet(JsonNode json, boolean handleEntities) {
     string = "";
     Iterator<String> keys = json.fieldNames();
     FeatureMap features = Factory.newFeatureMap();
@@ -75,6 +81,7 @@
 
       if (key.equals(TweetUtils.DEFAULT_TEXT_ATTRIBUTE)) {
         string = StringEscapeUtils.unescapeHtml(json.get(key).asText());
+        processEntities(json, 0L);
       }
       else {
         features.put(key.toString(), TweetUtils.process(json.get(key)));
@@ -91,7 +98,7 @@
    * @param featureKeys JSON paths whose values should be stored in the main
    * annotation's features
    */
-  private Tweet(JsonNode json, List<String> contentKeys, List<String> 
featureKeys) {
+  private Tweet(JsonNode json, List<String> contentKeys, List<String> 
featureKeys, boolean handleEntities) {
     StringBuilder content = new StringBuilder();
     List<String> keepers = new ArrayList<String>();
     keepers.addAll(contentKeys);
@@ -107,6 +114,10 @@
         // Use GATE's String conversion in case there are maps or lists.
         content.append(Strings.toString(featuresFound.get(cKey)));
         this.annotations.add(new PreAnnotation(start, content.length(), cKey));
+        if(handleEntities && TweetUtils.DEFAULT_TEXT_ATTRIBUTE.equals(cKey)) {
+          // only process entities within "text"
+          processEntities(json, start);
+        }
         content.append('\n');
       }
     }
@@ -124,5 +135,45 @@
     this.string = content.toString();
   }
 
+  /**
+   * Process the "entities" property of this json object into annotations,
+   * shifting their offsets by the specified amount.  Entities lacking a
+   * usable two-number "indices" list are skipped.
+   * @param json the Tweet json object
+   * @param startOffset offset correction if the text is not the first of
+   *         the content keys.
+   */
+  private void processEntities(JsonNode json, long startOffset) {
+    JsonNode entitiesNode = json.get(TweetUtils.ENTITIES_ATTRIBUTE);
+    if(entitiesNode == null || !entitiesNode.isObject()) {
+      // no entities, nothing to do
+      return;
+    }
+    Iterator<String> entityTypes = entitiesNode.fieldNames();
+    while(entityTypes.hasNext()) {
+      String entityType = entityTypes.next();
+      JsonNode entitiesOfType = entitiesNode.get(entityType);
+      if(entitiesOfType != null && entitiesOfType.isArray() && entitiesOfType.size() > 0) {
+        Iterator<JsonNode> it = entitiesOfType.elements();
+        while(it.hasNext()) {
+          JsonNode entity = it.next();
+          if(entity.isObject()) {
+            // process is guaranteed to return a FeatureMap given an object
+            FeatureMap features = (FeatureMap)TweetUtils.process(entity);
+            Object indices = features.get("indices");
+            if(indices instanceof List<?>) {
+              List<?> indicesList = (List<?>)indices;
+              if(indicesList.size() >= 2 && indicesList.get(0) instanceof Number && indicesList.get(1) instanceof Number) {
+                // valid entity: size is checked first so a short or empty "indices" list cannot throw
+                features.remove("indices");
+                annotations.add(new PreAnnotation(startOffset + ((Number)indicesList.get(0)).longValue(),
+                        startOffset + ((Number)indicesList.get(1)).longValue(), entityType, features));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
   
 }

Modified: 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
--- 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    
    2014-10-30 18:42:13 UTC (rev 18419)
+++ 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    
    2014-10-30 19:26:45 UTC (rev 18420)
@@ -42,11 +42,19 @@
   private boolean nested;
   private Iterator<JsonNode> nestedStatuses;
   private JsonNode nextNode;
+  private boolean handleEntities;
   
   public TweetStreamIterator(String json, List<String> contentKeys, 
           List<String> featureKeys) throws JsonParseException, IOException {
+    this(json, contentKeys, featureKeys, true);
+  }
+  
+  
+  public TweetStreamIterator(String json, List<String> contentKeys, 
+          List<String> featureKeys, boolean handleEntities) throws 
JsonParseException, IOException {
     this.contentKeys = contentKeys;
     this.featureKeys = featureKeys;
+    this.handleEntities = handleEntities;
     objectMapper = new ObjectMapper();
     jsonParser = objectMapper.getFactory().createParser(json);
     init();
@@ -54,8 +62,15 @@
   
   public TweetStreamIterator(InputStream input, List<String> contentKeys, 
           List<String> featureKeys, boolean gzip) throws JsonParseException, 
IOException {
+    this(input, contentKeys, featureKeys, gzip, true);
+  }
+  
+  public TweetStreamIterator(InputStream input, List<String> contentKeys, 
+          List<String> featureKeys, boolean gzip, boolean handleEntities)
+                  throws JsonParseException, IOException {
     this.contentKeys = contentKeys;
     this.featureKeys = featureKeys;
+    this.handleEntities = handleEntities;
     InputStream workingInput;
     
     // Following borrowed from gcp JSONStreamingInputHandler
@@ -103,7 +118,7 @@
     Tweet result = null;
     try {
       if (this.nested && this.nestedStatuses.hasNext()) {
-        result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, 
featureKeys);
+        result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, 
featureKeys, handleEntities);
         // Clear the nested flag once the last item in the statuses
         // value's list has been used, so that the next call to next()
         // will drop to the else if clause.
@@ -122,7 +137,7 @@
         else {
           this.nested = false;
           this.nestedStatuses = null;
-          result = Tweet.readTweet(nextNode, contentKeys, featureKeys);
+          result = Tweet.readTweet(nextNode, contentKeys, featureKeys, 
handleEntities);
         }
       }
     }

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 
2014-10-30 18:42:13 UTC (rev 18419)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 
2014-10-30 19:26:45 UTC (rev 18420)
@@ -42,6 +42,11 @@
     "created_at", "user:name"};
   public static final String[] DEFAULT_FEATURE_KEYS = {"user:screen_name", 
     "user:location", "id_str", "source", "truncated", "retweeted_status:id"};
+  
+  /**
+   * The JSON property representing entities (e.g. hashtags).
+   */
+  public static final String ENTITIES_ATTRIBUTE = "entities";
 
   
   public static List<Tweet> readTweets(String string) throws IOException {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to