Revision: 18341
          http://sourceforge.net/p/gate/code/18341
Author:   adamfunk
Date:     2014-09-16 12:19:23 +0000 (Tue, 16 Sep 2014)
Log Message:
-----------
Changed the iteration through the twitter data to handle edge cases of
odd data formatting.

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java

Removed Paths:
-------------
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 
2014-09-16 01:19:57 UTC (rev 18340)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 
2014-09-16 12:19:23 UTC (rev 18341)
@@ -75,39 +75,46 @@
       input = inputUrl.openStream();
       
       // TODO Detect & handle gzipped input.
-      TweetStreamIterable tweetSource = new TweetStreamIterable(input, 
contentKeys, featureKeys, false);
+      TweetStreamIterator tweetSource = new TweetStreamIterator(input, 
contentKeys, featureKeys, false);
 
       int tweetCounter = 0;
+      int tweetDocCounter = 0;
       Document document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
       StringBuilder content = new StringBuilder();
       Map<PreAnnotation, Integer> annotandaOffsets = new 
HashMap<PreAnnotation, Integer>();
       
-      // TODO Suppress empty documents (generated by 0-tweet files).
+      /* TweetStreamIterator.hasNext() returns true if there might be more
+       * tweets in the file; a concatenated set of search results might
+       * have an object with an empty statuses array followed by one 
+       * with some tweet in the array; in that case, we ignore the first null
+       * and keep looking.       */
       
-      for (Tweet tweet : tweetSource) {
-        if ( (tweetsPerDoc > 0) && (tweetCounter > 0) && ((tweetCounter % 
tweetsPerDoc) == 0) ) {
-          closeDocument(document, content, annotandaOffsets, corpus);
-          document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
-          content = new StringBuilder();
-          annotandaOffsets = new HashMap<PreAnnotation, Integer>();
+      while (tweetSource.hasNext()) {
+        Tweet tweet = tweetSource.next();
+        // next() == null means there wasn't anything ready in the stream,
+        // but there might be next time.
+        if (tweet != null) {
+          tweetDocCounter++;
+          if ( (tweetsPerDoc > 0) && (tweetDocCounter >= tweetsPerDoc) ) {
+            closeDocument(document, content, annotandaOffsets, corpus);
+            tweetDocCounter = 0;
+            document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
+            content = new StringBuilder();
+            annotandaOffsets = new HashMap<PreAnnotation, Integer>();
+          }
+          
+          int startOffset = content.length();
+          content.append(tweet.getString());
+          for (PreAnnotation preAnn : tweet.getAnnotations()) {
+            annotandaOffsets.put(preAnn, startOffset);
+          }
+          
+          content.append('\n');
+          tweetCounter++;
         }
-
-        int startOffset = content.length();
-        content.append(tweet.getString());
-        for (PreAnnotation preAnn : tweet.getAnnotations()) {
-          annotandaOffsets.put(preAnn, startOffset);
-        }
-
-        content.append('\n');
-        tweetCounter++;
       } // end of Tweet loop
       
-      if (content.length() > 0) {
-        closeDocument(document, content, annotandaOffsets, corpus);
-      }
-      else {
-        Factory.deleteResource(document);
-      }
+      closeDocument(document, content, annotandaOffsets, corpus);
       
       if(corpus.getDataStore() != null) {
         corpus.getDataStore().sync(corpus);
@@ -145,18 +152,23 @@
 
   
   private static void closeDocument(Document document, StringBuilder content, 
Map<PreAnnotation, Integer> annotandaOffsets, Corpus corpus) throws 
InvalidOffsetException {
-    DocumentContent contentImpl = new DocumentContentImpl(content.toString());
-    document.setContent(contentImpl);
-    AnnotationSet originalMarkups = 
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
-    for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
-      preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
-    }
-    corpus.add(document);
-    
-    if (corpus.getLRPersistenceId() != null) {
-      corpus.unloadDocument(document);
+    if (content.length() == 0) {
       Factory.deleteResource(document);
     }
+    else {    
+      DocumentContent contentImpl = new 
DocumentContentImpl(content.toString());
+      document.setContent(contentImpl);
+      AnnotationSet originalMarkups = 
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
+      for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
+        preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
+      }
+      corpus.add(document);
+      
+      if (corpus.getLRPersistenceId() != null) {
+        corpus.unloadDocument(document);
+        Factory.deleteResource(document);
+      }
+    }
   }
 
   

Modified: 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java   
2014-09-16 01:19:57 UTC (rev 18340)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java   
2014-09-16 12:19:23 UTC (rev 18341)
@@ -13,7 +13,6 @@
 
 
 import gate.Gate;
-import gate.gui.MainFrame;
 import gate.swing.XJFileChooser;
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;

Deleted: 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
===================================================================
--- 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java    
    2014-09-16 01:19:57 UTC (rev 18340)
+++ 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java    
    2014-09-16 12:19:23 UTC (rev 18341)
@@ -1,72 +0,0 @@
-/*
- *  Copyright (c) 1995-2014, The University of Sheffield. See the file
- *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
- *
- *  This file is part of GATE (see http://gate.ac.uk/), and is free
- *  software, licenced under the GNU Library General Public License,
- *  Version 2, June 1991 (in the distribution as file licence.html,
- *  and also available at http://gate.ac.uk/gate/licence.html).
- *  
- *  $Id$
- */
-package gate.corpora.twitter;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import org.apache.log4j.Logger;
-
-
-/**
- * Iterable version, just to make loops easier.
- * @author adam
- *
- */
-public class TweetStreamIterable implements Iterable<Tweet> {
-
-  private InputStream input;
-  private List<String> contentKeys, featureKeys;
-  private boolean gzip;
-  private TweetStreamIterator iterator;
-  
-  private static final Logger logger = 
Logger.getLogger(TweetStreamIterable.class.getName());
-  
-  public TweetStreamIterable(InputStream input, List<String> contentKeys,
-          List<String> featureKeys, boolean gzip) {
-    
-    this.input = input;
-    this.contentKeys = contentKeys;
-    this.featureKeys = featureKeys;
-    this.gzip = gzip;
-    this.iterator = null;
-  }
-
-  
-  @Override
-  public Iterator<Tweet> iterator() {
-    try {
-      this.iterator = new TweetStreamIterator(input, contentKeys, featureKeys, 
gzip);
-      return this.iterator;
-    } 
-    catch(IOException e) {
-      logger.warn("Internal error in TweetStreamIterator", e);
-      // The Override won't let us throw an exception up.
-      return Collections.<Tweet>emptyList().iterator();
-    }
-  }
-
-  
-  public void close() {
-    if (this.iterator != null) {
-      try {
-        this.iterator.close();
-      } 
-      catch(IOException e) {
-        logger.warn("Internal error in TweetStreamIterator", e);
-      }
-    }
-  }
-  
-}

Modified: 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
--- 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    
    2014-09-16 01:19:57 UTC (rev 18340)
+++ 
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java    
    2014-09-16 12:19:23 UTC (rev 18341)
@@ -39,7 +39,7 @@
   private JsonParser jsonParser;
   private MappingIterator<JsonNode> iterator;
   private List<String> contentKeys, featureKeys;
-  private boolean nested, hasNextNode;
+  private boolean nested;
   private Iterator<JsonNode> nestedStatuses;
   private JsonNode nextNode; 
 
@@ -71,20 +71,17 @@
     iterator = objectMapper.readValues(jsonParser, JsonNode.class);
     this.nested = false;
     this.nestedStatuses = null;
-    this.hasNextNode = this.iterator.hasNext();
-    if (this.hasNextNode) {
-      this.nextNode = this.iterator.next();
-    }
   }
 
   
   @Override
   public boolean hasNext() {
-    /* Using this.iterator.hasNext() did not work for search result format, 
because
-     * it returns true if there is a JSON node with an empty statuses array.  
So we 
-     * have to read ahead a bit in order to let the loop in Population *not* 
run in
-     * that case (so we can suppress the empty document).  */
-    return (this.hasNextNode && nonEmpty(this.nextNode)) || 
+    /* Suppressing empty documents in Population.populateCorpus is tricky.
+     * So hasNext() returns true if there *could* be more tweets in the 
+     * file, and next() returns null if there are none in the current 
+     * main JsonNode; populateCorpus has to test for null.
+     */
+    return this.iterator.hasNext()  || 
             (this.nested && (this.nestedStatuses != null) && 
this.nestedStatuses.hasNext());
     // Belt & braces: this.nested should suffice.
   }
@@ -102,30 +99,21 @@
         this.nested = this.nestedStatuses.hasNext();
       }
       
-      else if (this.hasNext()) {
+      else if (this.iterator.hasNext()) {
+        this.nextNode = this.iterator.next();
+
         if (isSearchResultList(this.nextNode)) {
           this.nestedStatuses = getStatuses(this.nextNode).iterator();
           this.nested = this.nestedStatuses.hasNext();
           // Set the nested flag according as there is anything left
-          // in thee statuses value array (which could be empty).
+          // in the statuses value array (which could be empty).
         }
-        
-        // Now let's test nested: true IFF we are in a search result thingy AND
-        // the thingy's statuses array is non-empty.
-        if (this.nested) {
-          result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, 
featureKeys);
-          // Set the nested flag again for the next call to next()
-          this.nested = this.nestedStatuses.hasNext();
-        }
         else {
-          result = Tweet.readTweet(this.nextNode, contentKeys, featureKeys);
+          this.nested = false;
+          this.nestedStatuses = null;
+          result = Tweet.readTweet(nextNode, contentKeys, featureKeys);
         }
       }
-      
-      if (! this.nested) {
-        hasNextNode = this.iterator.hasNext();
-        nextNode = hasNextNode ? this.iterator.next() : null;
-      }
     }
     catch (IOException e) {
       logger.warn("Internal error in TweetStreamIterator", e);

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Want excitement?
Manually upgrade your production database.
When you want reliability, choose Perforce.
Perforce version control. Predictably reliable.
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to