Revision: 18418
          http://sourceforge.net/p/gate/code/18418
Author:   ian_roberts
Date:     2014-10-30 18:38:33 +0000 (Thu, 30 Oct 2014)

Log Message:
-----------
Made the JSON DocumentFormat use the new TweetStreamIterator approach rather than assuming the original JSON is one tweet per line.
Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-30 18:37:26 UTC (rev 18417) +++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-30 18:38:33 UTC (rev 18418) @@ -17,6 +17,7 @@ import gate.Resource; import gate.corpora.twitter.PreAnnotation; import gate.corpora.twitter.Tweet; +import gate.corpora.twitter.TweetStreamIterator; import gate.corpora.twitter.TweetUtils; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.AutoInstance; @@ -25,6 +26,8 @@ import gate.util.InvalidOffsetException; import java.io.IOException; import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; @@ -87,12 +90,13 @@ String jsonString = StringUtils.trimToEmpty(doc.getContent().toString()); try { // Parse the String - List<Tweet> tweets = TweetUtils.readTweets(jsonString); - Map<Tweet, Long> tweetStarts = new HashMap<Tweet, Long>(); + Iterator<Tweet> tweetSource = new TweetStreamIterator(jsonString, null, null); + Map<Tweet, Long> tweetStarts = new LinkedHashMap<Tweet, Long>(); // Put them all together to make the unpacked document content StringBuilder concatenation = new StringBuilder(); - for (Tweet tweet : tweets) { + while(tweetSource.hasNext()) { + Tweet tweet = tweetSource.next(); tweetStarts.put(tweet, (long) concatenation.length()); concatenation.append(tweet.getString()).append("\n\n"); } @@ -103,7 +107,7 @@ AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); // Create Original markups annotations for each tweet - for (Tweet tweet : 
tweets) { + for (Tweet tweet : tweetStarts.keySet()) { for (PreAnnotation preAnn : tweet.getAnnotations()) { preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet)); } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 18:37:26 UTC (rev 18417) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 18:38:33 UTC (rev 18418) @@ -41,9 +41,17 @@ private List<String> contentKeys, featureKeys; private boolean nested; private Iterator<JsonNode> nestedStatuses; - private JsonNode nextNode; - - + private JsonNode nextNode; + + public TweetStreamIterator(String json, List<String> contentKeys, + List<String> featureKeys) throws JsonParseException, IOException { + this.contentKeys = contentKeys; + this.featureKeys = featureKeys; + objectMapper = new ObjectMapper(); + jsonParser = objectMapper.getFactory().createParser(json); + init(); + } + public TweetStreamIterator(InputStream input, List<String> contentKeys, List<String> featureKeys, boolean gzip) throws JsonParseException, IOException { this.contentKeys = contentKeys; @@ -61,6 +69,10 @@ } jsonParser = objectMapper.getFactory().createParser(workingInput).enable(Feature.AUTO_CLOSE_SOURCE); + init(); + } + + private void init() throws JsonParseException, IOException { // If the first token in the stream is the start of an array ("[") // then assume the stream as a whole is an array of objects // To handle this, simply clear the token - The MappingIterator @@ -72,7 +84,6 @@ this.nested = false; this.nestedStatuses = null; } - @Override public boolean hasNext() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs