Revision: 18319 http://sourceforge.net/p/gate/code/18319 Author: adamfunk Date: 2014-09-11 19:51:08 +0000 (Thu, 11 Sep 2014) Log Message: ----------- WIP
Added Paths: ----------- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java Added: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java =================================================================== --- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java (rev 0) +++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java 2014-09-11 19:51:08 UTC (rev 18319) @@ -0,0 +1,35 @@ +/* + * Copyright (c) 1995-2014, The University of Sheffield. See the file + * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt + * + * This file is part of GATE (see http://gate.ac.uk/), and is free + * software, licenced under the GNU Library General Public License, + * Version 2, June 1991 (in the distribution as file licence.html, + * and also available at http://gate.ac.uk/gate/licence.html). + * + * $Id$ + */ +package gate.corpora.twitter; + +import java.io.InputStream; +import java.util.Iterator; + +/** + * Iterable version, just to make loops easier. + * @author adam + * + */ +public class TweetStreamIterable implements Iterable<Tweet> { + + InputStream input; + + public TweetStreamIterable(InputStream input) { + this.input = input; + } + + @Override + public Iterator<Tweet> iterator() { + return new TweetStreamIterator(input); + } + +} Property changes on: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java =================================================================== --- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java (rev 0) +++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-09-11 19:51:08 UTC (rev 18319) @@ -0,0 +1,112 @@ +/* + * Copyright (c) 1995-2014, The University of Sheffield. See the file + * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt + * + * This file is part of GATE (see http://gate.ac.uk/), and is free + * software, licenced under the GNU Library General Public License, + * Version 2, June 1991 (in the distribution as file licence.html, + * and also available at http://gate.ac.uk/gate/licence.html). + * + * $Id$ + */ +package gate.corpora.twitter; + +import gate.Document; +import gate.Factory; +import gate.FeatureMap; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonParser.Feature; +import com.fasterxml.jackson.core.JsonPointer; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.MappingIterator; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class TweetStreamIterator implements Iterator<Tweet> { + + // Borrowed from gcp IOConstants + public static final String ID_POINTER = "/id_str"; + + + private ObjectMapper objectMapper; + private JsonParser jsonParser; + private MappingIterator<JsonNode> iterator; + private boolean gzip; + private List<String> contentKeys, featureKeys; + protected JsonPointer idPointer; + + + public TweetStreamIterator(InputStream input, List<String> contentKeys, + List<String> featureKeys, boolean gzip) throws JsonParseException, IOException { + this.contentKeys = contentKeys; + this.featureKeys = featureKeys; + this.gzip = gzip; + + if (gzip) { + throw new IllegalArgumentException("gzip not yet supported!"); + } + // TODO support compression + + // Following borrowed from gcp JSONStreamingInputHandler + idPointer = JsonPointer.compile(ID_POINTER); + objectMapper = new ObjectMapper(); + jsonParser = objectMapper.getFactory().createParser(input).enable(Feature.AUTO_CLOSE_SOURCE); + // If the first token in the stream is the start of an array ("[") + // then + // assume the stream as a whole is an array of objects, one per + // document. + // To handle this, simply clear the token - The MappingIterator + // returned + // by readValues will cope with the rest in either form. + if(jsonParser.nextToken() == JsonToken.START_ARRAY) { + jsonParser.clearCurrentToken(); + } + iterator = objectMapper.readValues(jsonParser, JsonNode.class); + } + + + @Override + public boolean hasNext() { + return iterator.hasNext(); + // should this be hasNextValue() ? + } + + @Override + public Tweet next() { + try { + // why while not if? + while(iterator.hasNextValue()) { + JsonNode json = iterator.nextValue(); + String id = json.at(idPointer).asText(); + // Is it worth testing IDs here? + return Tweet.readTweet(json, contentKeys, featureKeys); + } + } + catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + @Override + public void remove() { + // TODO Auto-generated method stub + + } + + + public void close() { + // TODO + } + + + + +} Property changes on: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Want excitement? Manually upgrade your production database. When you want reliability, choose Perforce Perforce version control. Predictably reliable. http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs