Revision: 18326
          http://sourceforge.net/p/gate/code/18326
Author:   adamfunk
Date:     2014-09-12 14:37:07 +0000 (Fri, 12 Sep 2014)
Log Message:
-----------
probably ready to test soon

Modified Paths:
--------------
    gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java
    gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
    gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java

Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java	2014-09-12 13:45:45 UTC (rev 18325)
+++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/Population.java	2014-09-12 14:37:07 UTC (rev 18326)
@@ -34,8 +34,8 @@
 import java.util.Map;
 import javax.swing.AbstractAction;
 import javax.swing.Action;
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
 
 
 @CreoleResource(name = "Twitter Corpus Populator", tool = true, autoinstances = @AutoInstance,
@@ -45,7 +45,11 @@
 
   private static final long serialVersionUID = 1443073039199794668L;
 
+  private static final Logger logger = Logger.getLogger(Population.class.getName());
   
+  private static final int COUNTER_DIGITS = 9;
+
+  
   public static void populateCorpus(final Corpus corpus, URL inputUrl, PopulationConfig config) 
       throws ResourceInstantiationException {
     populateCorpus(corpus, inputUrl, config.getEncoding(), config.getContentKeys(), 
@@ -66,25 +70,19 @@
       List<String> featureKeys, int tweetsPerDoc) throws ResourceInstantiationException {
     try {
       InputStream input = inputUrl.openStream();
-      List<String> lines = IOUtils.readLines(input, encoding);
-      IOUtils.closeQuietly(input);
       
-      // TODO: sort this out so it processes one at a time instead of reading the
-      // whole hog into memory
-      
-      // For now, we assume the streaming API format (concatenated maps, not in a list)
-      List<Tweet> tweets = TweetUtils.readTweetStrings(lines, contentKeys, featureKeys);
-      
-      int digits = (int) Math.ceil(Math.log10(tweets.size()));
+      // TODO detect & handle gzipped input
+      TweetStreamIterable tweetSource = new TweetStreamIterable(input, contentKeys, featureKeys, false);
+
       int tweetCounter = 0;
-      Document document = newDocument(inputUrl, tweetCounter, digits);
+      Document document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
       StringBuilder content = new StringBuilder();
       Map<PreAnnotation, Integer> annotandaOffsets = new HashMap<PreAnnotation, Integer>();
       
-      for (Tweet tweet : tweets) {
+      for (Tweet tweet : tweetSource) {
         if ( (tweetsPerDoc > 0) && (tweetCounter > 0) && ((tweetCounter % tweetsPerDoc) == 0) ) {
           closeDocument(document, content, annotandaOffsets, corpus);
-          document = newDocument(inputUrl, tweetCounter, digits);
+          document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
           content = new StringBuilder();
           annotandaOffsets = new HashMap<PreAnnotation, Integer>();
         }
@@ -127,8 +125,8 @@
     document.getFeatures().put("gate.SourceURL", url.toString());
     return document;
   }
+
   
-  
   private static void closeDocument(Document document, StringBuilder content, Map<PreAnnotation, Integer> annotandaOffsets, Corpus corpus) throws InvalidOffsetException {
     DocumentContent contentImpl = new DocumentContentImpl(content.toString());
     document.setContent(contentImpl);
@@ -177,7 +175,7 @@
                     } 
                   }
                   catch(ResourceInstantiationException e) {
-                    e.printStackTrace();
+                    logger.warn("Error in Twitter Population", e);
                   }
                 }
               };
@@ -185,7 +183,7 @@
           thread.start();
         }
         catch(MalformedURLException e0) {
-          e0.printStackTrace();
+          logger.warn("Error in Twitter Population", e0);
         }
       }
     });

Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
===================================================================
--- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java	2014-09-12 13:45:45 UTC (rev 18325)
+++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java	2014-09-12 14:37:07 UTC (rev 18326)
@@ -11,9 +11,14 @@
  */
 package gate.corpora.twitter;
 
+import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
+import org.apache.log4j.Logger;
 
+
 /**
  * Iterable version, just to make loops easier.
  * @author adam
@@ -21,15 +26,47 @@
  */
 public class TweetStreamIterable implements Iterable<Tweet> {
 
-  InputStream input;
+  private InputStream input;
+  private List<String> contentKeys, featureKeys;
+  private boolean gzip;
+  private TweetStreamIterator iterator;
   
-  public TweetStreamIterable(InputStream input) {
+  private static final Logger logger = Logger.getLogger(TweetStreamIterable.class.getName());
+  
+  public TweetStreamIterable(InputStream input, List<String> contentKeys,
+          List<String> featureKeys, boolean gzip) {
+    
     this.input = input;
+    this.contentKeys = contentKeys;
+    this.featureKeys = featureKeys;
+    this.gzip = gzip;
+    this.iterator = null;
   }
 
+  
   @Override
   public Iterator<Tweet> iterator() {
-    return new TweetStreamIterator(input);
+    try {
+      this.iterator = new TweetStreamIterator(input, contentKeys, featureKeys, gzip);
+      return this.iterator;
+    } 
+    catch(IOException e) {
+      logger.warn("Internal error in TweetStreamIterator", e);
+      // The Override won't let us throw an exception up.
+      return Collections.<Tweet>emptyList().iterator();
+    }
   }
+
   
+  public void close() {
+    if (this.iterator != null) {
+      try {
+        this.iterator.close();
+      } 
+      catch(IOException e) {
+        logger.warn("Internal error in TweetStreamIterator", e);
+      }
+    }
+  }
+  
 }

Modified: gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
--- gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java	2014-09-12 13:45:45 UTC (rev 18325)
+++ gate/branches/twitter-pop-dev/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java	2014-09-12 14:37:07 UTC (rev 18326)
@@ -17,6 +17,8 @@
 import java.util.List;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.log4j.Logger;
+
 import com.fasterxml.jackson.core.JsonParseException;
 import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonParser.Feature;
@@ -35,6 +37,8 @@
   public static final String SEARCH_KEY = "search_metadata";
   public static final String STATUS_KEY = "statuses";
   
+  private static final Logger logger = Logger.getLogger(TweetStreamIterator.class.getName());
+
   private ObjectMapper objectMapper;
   private JsonParser jsonParser;
   private MappingIterator<JsonNode> iterator;
@@ -119,7 +123,7 @@
       }
     }
     catch (IOException e) {
-      e.printStackTrace();
+      logger.warn("Internal error in TweetStreamIterator", e);
     }
     return result;
   }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Want excitement?
Manually upgrade your production database.
When you want reliability, choose Perforce
Perforce version control. Predictably reliable.
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to