Revision: 18341 http://sourceforge.net/p/gate/code/18341 Author: adamfunk Date: 2014-09-16 12:19:23 +0000 (Tue, 16 Sep 2014) Log Message: ----------- Changed the iteration through the twitter data to handle edge cases of odd data formatting.
Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java Removed Paths: ------------- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-09-16 01:19:57 UTC (rev 18340) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-09-16 12:19:23 UTC (rev 18341) @@ -75,39 +75,46 @@ input = inputUrl.openStream(); // TODO Detect & handle gzipped input. - TweetStreamIterable tweetSource = new TweetStreamIterable(input, contentKeys, featureKeys, false); + TweetStreamIterator tweetSource = new TweetStreamIterator(input, contentKeys, featureKeys, false); int tweetCounter = 0; + int tweetDocCounter = 0; Document document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS); StringBuilder content = new StringBuilder(); Map<PreAnnotation, Integer> annotandaOffsets = new HashMap<PreAnnotation, Integer>(); - // TODO Suppress empty documents (generated by 0-tweet files). + /* TweetStreamIterator.hasNext() returns true if there might be more + * tweets in the file; a concatenated set of search results might + * have an object with an empty statuses array followed by one + * with some tweet in the array; in that case, we ignore the first null + * and keep looking. 
*/ - for (Tweet tweet : tweetSource) { - if ( (tweetsPerDoc > 0) && (tweetCounter > 0) && ((tweetCounter % tweetsPerDoc) == 0) ) { - closeDocument(document, content, annotandaOffsets, corpus); - document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS); - content = new StringBuilder(); - annotandaOffsets = new HashMap<PreAnnotation, Integer>(); + while (tweetSource.hasNext()) { + Tweet tweet = tweetSource.next(); + // next() == null means there wasn't anything ready in the stream, + // but there might be next time. + if (tweet != null) { + tweetDocCounter++; + if ( (tweetsPerDoc > 0) && (tweetDocCounter >= tweetsPerDoc) ) { + closeDocument(document, content, annotandaOffsets, corpus); + tweetDocCounter = 0; + document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS); + content = new StringBuilder(); + annotandaOffsets = new HashMap<PreAnnotation, Integer>(); + } + + int startOffset = content.length(); + content.append(tweet.getString()); + for (PreAnnotation preAnn : tweet.getAnnotations()) { + annotandaOffsets.put(preAnn, startOffset); + } + + content.append('\n'); + tweetCounter++; } - - int startOffset = content.length(); - content.append(tweet.getString()); - for (PreAnnotation preAnn : tweet.getAnnotations()) { - annotandaOffsets.put(preAnn, startOffset); - } - - content.append('\n'); - tweetCounter++; } // end of Tweet loop - if (content.length() > 0) { - closeDocument(document, content, annotandaOffsets, corpus); - } - else { - Factory.deleteResource(document); - } + closeDocument(document, content, annotandaOffsets, corpus); if(corpus.getDataStore() != null) { corpus.getDataStore().sync(corpus); @@ -145,18 +152,23 @@ private static void closeDocument(Document document, StringBuilder content, Map<PreAnnotation, Integer> annotandaOffsets, Corpus corpus) throws InvalidOffsetException { - DocumentContent contentImpl = new DocumentContentImpl(content.toString()); - document.setContent(contentImpl); - AnnotationSet originalMarkups = 
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME); - for (PreAnnotation preAnn : annotandaOffsets.keySet()) { - preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn)); - } - corpus.add(document); - - if (corpus.getLRPersistenceId() != null) { - corpus.unloadDocument(document); + if (content.length() == 0) { Factory.deleteResource(document); } + else { + DocumentContent contentImpl = new DocumentContentImpl(content.toString()); + document.setContent(contentImpl); + AnnotationSet originalMarkups = document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME); + for (PreAnnotation preAnn : annotandaOffsets.keySet()) { + preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn)); + } + corpus.add(document); + + if (corpus.getLRPersistenceId() != null) { + corpus.unloadDocument(document); + Factory.deleteResource(document); + } + } } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java 2014-09-16 01:19:57 UTC (rev 18340) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java 2014-09-16 12:19:23 UTC (rev 18341) @@ -13,7 +13,6 @@ import gate.Gate; -import gate.gui.MainFrame; import gate.swing.XJFileChooser; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; Deleted: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java 2014-09-16 01:19:57 UTC (rev 18340) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java 2014-09-16 12:19:23 UTC (rev 18341) @@ -1,72 +0,0 @@ -/* - * Copyright (c) 1995-2014, The University of Sheffield. 
See the file - * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt - * - * This file is part of GATE (see http://gate.ac.uk/), and is free - * software, licenced under the GNU Library General Public License, - * Version 2, June 1991 (in the distribution as file licence.html, - * and also available at http://gate.ac.uk/gate/licence.html). - * - * $Id$ - */ -package gate.corpora.twitter; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import org.apache.log4j.Logger; - - -/** - * Iterable version, just to make loops easier. - * @author adam - * - */ -public class TweetStreamIterable implements Iterable<Tweet> { - - private InputStream input; - private List<String> contentKeys, featureKeys; - private boolean gzip; - private TweetStreamIterator iterator; - - private static final Logger logger = Logger.getLogger(TweetStreamIterable.class.getName()); - - public TweetStreamIterable(InputStream input, List<String> contentKeys, - List<String> featureKeys, boolean gzip) { - - this.input = input; - this.contentKeys = contentKeys; - this.featureKeys = featureKeys; - this.gzip = gzip; - this.iterator = null; - } - - - @Override - public Iterator<Tweet> iterator() { - try { - this.iterator = new TweetStreamIterator(input, contentKeys, featureKeys, gzip); - return this.iterator; - } - catch(IOException e) { - logger.warn("Internal error in TweetStreamIterator", e); - // The Override won't let us throw an exception up. 
- return Collections.<Tweet>emptyList().iterator(); - } - } - - - public void close() { - if (this.iterator != null) { - try { - this.iterator.close(); - } - catch(IOException e) { - logger.warn("Internal error in TweetStreamIterator", e); - } - } - } - -} Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-09-16 01:19:57 UTC (rev 18340) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-09-16 12:19:23 UTC (rev 18341) @@ -39,7 +39,7 @@ private JsonParser jsonParser; private MappingIterator<JsonNode> iterator; private List<String> contentKeys, featureKeys; - private boolean nested, hasNextNode; + private boolean nested; private Iterator<JsonNode> nestedStatuses; private JsonNode nextNode; @@ -71,20 +71,17 @@ iterator = objectMapper.readValues(jsonParser, JsonNode.class); this.nested = false; this.nestedStatuses = null; - this.hasNextNode = this.iterator.hasNext(); - if (this.hasNextNode) { - this.nextNode = this.iterator.next(); - } } @Override public boolean hasNext() { - /* Using this.iterator.hasNext() did not work for search result format, because - * it returns true if there is a JSON node with an empty statuses array. So we - * have to read ahead a bit in order to let the loop in Population *not* run in - * that case (so we can suppress the empty document). */ - return (this.hasNextNode && nonEmpty(this.nextNode)) || + /* Suppressing empty documents in Population.populateCorpus is tricky. + * So hasNext() returns true if there *could* be more tweets in the + * file, and next() returns null if there are none in the current + * main JsonNode; populateCorpus has to test for null.
+ */ + return this.iterator.hasNext() || (this.nested && (this.nestedStatuses != null) && this.nestedStatuses.hasNext()); // Belt & braces: this.nested should suffice. } @@ -102,30 +99,21 @@ this.nested = this.nestedStatuses.hasNext(); } - else if (this.hasNext()) { + else if (this.iterator.hasNext()) { + this.nextNode = this.iterator.next(); + if (isSearchResultList(this.nextNode)) { this.nestedStatuses = getStatuses(this.nextNode).iterator(); this.nested = this.nestedStatuses.hasNext(); // Set the nested flag according as there is anything left - // in thee statuses value array (which could be empty). + // in the statuses value array (which could be empty). } - - // Now let's test nested: true IFF we are in a search result thingy AND - // the thingy's statuses array is non-empty. - if (this.nested) { - result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, featureKeys); - // Set the nested flag again for the next call to next() - this.nested = this.nestedStatuses.hasNext(); - } else { - result = Tweet.readTweet(this.nextNode, contentKeys, featureKeys); + this.nested = false; + this.nestedStatuses = null; + result = Tweet.readTweet(nextNode, contentKeys, featureKeys); } } - - if (! this.nested) { - hasNextNode = this.iterator.hasNext(); - nextNode = hasNextNode ? this.iterator.next() : null; - } } catch (IOException e) { logger.warn("Internal error in TweetStreamIterator", e); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Want excitement? Manually upgrade your production database. When you want reliability, choose Perforce. Perforce version control. Predictably reliable. http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs