Revision: 18418
          http://sourceforge.net/p/gate/code/18418
Author:   ian_roberts
Date:     2014-10-30 18:38:33 +0000 (Thu, 30 Oct 2014)
Log Message:
-----------
Made the JSON DocumentFormat use the new TweetStreamIterator approach rather 
than assuming the original JSON is one tweet per line.

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java    2014-10-30 18:37:26 UTC (rev 18417)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java    2014-10-30 18:38:33 UTC (rev 18418)
@@ -17,6 +17,7 @@
 import gate.Resource;
 import gate.corpora.twitter.PreAnnotation;
 import gate.corpora.twitter.Tweet;
+import gate.corpora.twitter.TweetStreamIterator;
 import gate.corpora.twitter.TweetUtils;
 import gate.creole.ResourceInstantiationException;
 import gate.creole.metadata.AutoInstance;
@@ -25,6 +26,8 @@
 import gate.util.InvalidOffsetException;
 import java.io.IOException;
 import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import org.apache.commons.lang.StringUtils;
@@ -87,12 +90,13 @@
     String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
     try {
       // Parse the String
-      List<Tweet> tweets = TweetUtils.readTweets(jsonString);
-      Map<Tweet, Long> tweetStarts = new HashMap<Tweet, Long>();
+      Iterator<Tweet> tweetSource = new TweetStreamIterator(jsonString, null, null);
+      Map<Tweet, Long> tweetStarts = new LinkedHashMap<Tweet, Long>();
       
       // Put them all together to make the unpacked document content
       StringBuilder concatenation = new StringBuilder();
-      for (Tweet tweet : tweets) {
+      while(tweetSource.hasNext()) {
+        Tweet tweet = tweetSource.next();
         tweetStarts.put(tweet, (long) concatenation.length());
         concatenation.append(tweet.getString()).append("\n\n");
       }
@@ -103,7 +107,7 @@
 
       AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
       // Create Original markups annotations for each tweet
-      for (Tweet tweet : tweets) {
+      for (Tweet tweet : tweetStarts.keySet()) {
         for (PreAnnotation preAnn : tweet.getAnnotations()) {
           preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet));
         }

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    2014-10-30 18:37:26 UTC (rev 18417)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    2014-10-30 18:38:33 UTC (rev 18418)
@@ -41,9 +41,17 @@
   private List<String> contentKeys, featureKeys;
   private boolean nested;
   private Iterator<JsonNode> nestedStatuses;
-  private JsonNode nextNode; 
-
-
+  private JsonNode nextNode;
+  
+  public TweetStreamIterator(String json, List<String> contentKeys, 
+          List<String> featureKeys) throws JsonParseException, IOException {
+    this.contentKeys = contentKeys;
+    this.featureKeys = featureKeys;
+    objectMapper = new ObjectMapper();
+    jsonParser = objectMapper.getFactory().createParser(json);
+    init();
+  }
+  
   public TweetStreamIterator(InputStream input, List<String> contentKeys, 
           List<String> featureKeys, boolean gzip) throws JsonParseException, IOException {
     this.contentKeys = contentKeys;
@@ -61,6 +69,10 @@
     }
     
     jsonParser = objectMapper.getFactory().createParser(workingInput).enable(Feature.AUTO_CLOSE_SOURCE);
+    init();
+  }
+
+  private void init() throws JsonParseException, IOException {
     // If the first token in the stream is the start of an array ("[")
     // then assume the stream as a whole is an array of objects
     // To handle this, simply clear the token - The MappingIterator
@@ -72,7 +84,6 @@
     this.nested = false;
     this.nestedStatuses = null;
   }
-
   
   @Override
   public boolean hasNext() {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to