Revision: 18326
          http://sourceforge.net/p/gate/code/18326
Author:   adamfunk
Date:     2014-09-12 14:37:07 +0000 (Fri, 12 Sep 2014)
Log Message:
-----------
probably ready to test soon
Modified Paths: -------------- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java =================================================================== --- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-09-12 13:45:45 UTC (rev 18325) +++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-09-12 14:37:07 UTC (rev 18326) @@ -34,8 +34,8 @@ import java.util.Map; import javax.swing.AbstractAction; import javax.swing.Action; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; @CreoleResource(name = "Twitter Corpus Populator", tool = true, autoinstances = @AutoInstance, @@ -45,7 +45,11 @@ private static final long serialVersionUID = 1443073039199794668L; + private static final Logger logger = Logger.getLogger(Population.class.getName()); + private static final int COUNTER_DIGITS = 9; + + public static void populateCorpus(final Corpus corpus, URL inputUrl, PopulationConfig config) throws ResourceInstantiationException { populateCorpus(corpus, inputUrl, config.getEncoding(), config.getContentKeys(), @@ -66,25 +70,19 @@ List<String> featureKeys, int tweetsPerDoc) throws ResourceInstantiationException { try { InputStream input = inputUrl.openStream(); - List<String> lines = IOUtils.readLines(input, encoding); - IOUtils.closeQuietly(input); - // TODO: sort this out so it processes one at a time instead of reading the - // whole hog into memory - - // For now, we assume the streaming API format (concatenated maps, not in a list) - List<Tweet> tweets = TweetUtils.readTweetStrings(lines, contentKeys, featureKeys); - 
- int digits = (int) Math.ceil(Math.log10(tweets.size())); + // TODO detect & handle gzipped input + TweetStreamIterable tweetSource = new TweetStreamIterable(input, contentKeys, featureKeys, false); + int tweetCounter = 0; - Document document = newDocument(inputUrl, tweetCounter, digits); + Document document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS); StringBuilder content = new StringBuilder(); Map<PreAnnotation, Integer> annotandaOffsets = new HashMap<PreAnnotation, Integer>(); - for (Tweet tweet : tweets) { + for (Tweet tweet : tweetSource) { if ( (tweetsPerDoc > 0) && (tweetCounter > 0) && ((tweetCounter % tweetsPerDoc) == 0) ) { closeDocument(document, content, annotandaOffsets, corpus); - document = newDocument(inputUrl, tweetCounter, digits); + document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS); content = new StringBuilder(); annotandaOffsets = new HashMap<PreAnnotation, Integer>(); } @@ -127,8 +125,8 @@ document.getFeatures().put("gate.SourceURL", url.toString()); return document; } + - private static void closeDocument(Document document, StringBuilder content, Map<PreAnnotation, Integer> annotandaOffsets, Corpus corpus) throws InvalidOffsetException { DocumentContent contentImpl = new DocumentContentImpl(content.toString()); document.setContent(contentImpl); @@ -177,7 +175,7 @@ } } catch(ResourceInstantiationException e) { - e.printStackTrace(); + logger.warn("Error in Twitter Population", e); } } }; @@ -185,7 +183,7 @@ thread.start(); } catch(MalformedURLException e0) { - e0.printStackTrace(); + logger.warn("Error in Twitter Population", e0); } } }); Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java =================================================================== --- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java 2014-09-12 13:45:45 UTC (rev 18325) +++ 
gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java 2014-09-12 14:37:07 UTC (rev 18326) @@ -11,9 +11,14 @@ */ package gate.corpora.twitter; +import java.io.IOException; import java.io.InputStream; +import java.util.Collections; import java.util.Iterator; +import java.util.List; +import org.apache.log4j.Logger; + /** * Iterable version, just to make loops easier. * @author adam @@ -21,15 +26,47 @@ */ public class TweetStreamIterable implements Iterable<Tweet> { - InputStream input; + private InputStream input; + private List<String> contentKeys, featureKeys; + private boolean gzip; + private TweetStreamIterator iterator; - public TweetStreamIterable(InputStream input) { + private static final Logger logger = Logger.getLogger(TweetStreamIterable.class.getName()); + + public TweetStreamIterable(InputStream input, List<String> contentKeys, + List<String> featureKeys, boolean gzip) { + this.input = input; + this.contentKeys = contentKeys; + this.featureKeys = featureKeys; + this.gzip = gzip; + this.iterator = null; } + @Override public Iterator<Tweet> iterator() { - return new TweetStreamIterator(input); + try { + this.iterator = new TweetStreamIterator(input, contentKeys, featureKeys, gzip); + return this.iterator; + } + catch(IOException e) { + logger.warn("Internal error in TweetStreamIterator", e); + // The Override won't let us throw an exception up. 
+ return Collections.<Tweet>emptyList().iterator(); + } } + + public void close() { + if (this.iterator != null) { + try { + this.iterator.close(); + } + catch(IOException e) { + logger.warn("Internal error in TweetStreamIterator", e); + } + } + } + } Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java =================================================================== --- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-09-12 13:45:45 UTC (rev 18325) +++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-09-12 14:37:07 UTC (rev 18326) @@ -17,6 +17,8 @@ import java.util.List; import java.util.zip.GZIPInputStream; +import org.apache.log4j.Logger; + import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonParser.Feature; @@ -35,6 +37,8 @@ public static final String SEARCH_KEY = "search_metadata"; public static final String STATUS_KEY = "statuses"; + private static final Logger logger = Logger.getLogger(TweetStreamIterator.class.getName()); + private ObjectMapper objectMapper; private JsonParser jsonParser; private MappingIterator<JsonNode> iterator; @@ -119,7 +123,7 @@ } } catch (IOException e) { - e.printStackTrace(); + logger.warn("Internal error in TweetStreamIterator", e); } return result; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Want excitement? Manually upgrade your production database. When you want reliability, choose Perforce Perforce version control. Predictably reliable. 
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs