Revision: 18854
          http://sourceforge.net/p/gate/code/18854
Author:   markagreenwood
Date:     2015-08-05 09:53:42 +0000 (Wed, 05 Aug 2015)
Log Message:
-----------
added support for streaming output to a single CSV file and updated the sample 
to show it in use

Modified Paths:
--------------
    gate/trunk/plugins/Format_CSV/gcp/csv4gcp.jar
    gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml

Added Paths:
-----------
    
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java

Modified: gate/trunk/plugins/Format_CSV/gcp/csv4gcp.jar
===================================================================
(Binary files differ)

Modified: gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml  2015-08-05 05:18:08 UTC 
(rev 18853)
+++ gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml  2015-08-05 09:53:42 UTC 
(rev 18854)
@@ -5,6 +5,12 @@
 
        <report file="report.xml" />
 
+       <!--
+               This input handler reads a CSV file in which the first row contains 
labels
+               and builds documents by pulling the text from column 1 (i.e. the
+               second column) and adding the other columns as document 
features using
+               the labels from the first row
+       -->
        <input class="gate.cloud.io.csv.CSVStreamingInputHandler"
                srcFile="input.csv"
                encoding="UTF-8"
@@ -13,9 +19,31 @@
                column="1"
                labelledColumns="true" />
 
+       <!--
+               Stores all the processed docs in a single CSV file with columns 
for
+               the author doc feature (if it starts with a . it's a doc 
feature),
+               the document content under the first Token annotation, and the
+               category feature of the first Token annotation. You could also 
specify
+               an annotationSetName and/or an annotationType. If you specify 
the latter
+               then you get one row per annotation within the document. This 
is very
+               similar to how the Configurable Exporter in the Tools plugin 
works.
+       -->
        <output
                dir="output"
                encoding="UTF-8"
+               separator=","
+               quote='"'
+               columns=".author,Token,Token.category"
+               fileExtension=".csv"
+               class="gate.cloud.io.csv.CSVStreamingOutputHandler" />
+
+       <!--
+               Stores the annotated GATE documents to ease debugging the CSV 
output
+       -->
+       <output
+               dir="output"
+               encoding="UTF-8"
                fileExtension=".GATE.xml"
                class="gate.cloud.io.file.GATEStandOffFileOutputHandler" />
+
 </batch>

Added: 
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
===================================================================
--- 
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
                              (rev 0)
+++ 
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
      2015-08-05 09:53:42 UTC (rev 18854)
@@ -0,0 +1,128 @@
+package gate.cloud.io.csv;
+
+import static gate.cloud.io.IOConstants.PARAM_ENCODING;
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.Document;
+import gate.Utils;
+import gate.cloud.batch.DocumentID;
+import gate.cloud.io.json.JSONStreamingOutputHandler;
+import gate.util.GateException;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+import au.com.bytecode.opencsv.CSVWriter;
+
+public class CSVStreamingOutputHandler extends JSONStreamingOutputHandler {
+  
+  public static final String PARAM_SEPARATOR_CHARACTER = "separator";
+  public static final String PARAM_QUOTE_CHARACTER = "quote";  
+  public static final String PARAM_COLUMNS = "columns";
+  public static final String PARAM_ANNOTATION_SET_NAME = "annotationSetName";
+  public static final String PARAM_ANNOTATION_TYPE = "annotationType";
+  
+  
+  private static final Logger logger = Logger
+      .getLogger(CSVStreamingOutputHandler.class);
+  
+  protected String encoding;
+
+  protected char separatorChar;
+
+  protected char quoteChar;
+  
+  protected String annotationSetName, annotationType;
+  
+  protected String[] columns;
+  
+  @Override
+  protected void configImpl(Map<String, String> configData) throws IOException,
+          GateException {
+
+    super.configImpl(configData);
+    
+    encoding = configData.get(PARAM_ENCODING);
+    separatorChar = configData.get(PARAM_SEPARATOR_CHARACTER).charAt(0);
+    quoteChar = configData.get(PARAM_QUOTE_CHARACTER).charAt(0); 
+    
+    columns = configData.get(PARAM_COLUMNS).split(",");
+    
+    annotationSetName = configData.get(PARAM_ANNOTATION_SET_NAME);
+    annotationType = configData.get(PARAM_ANNOTATION_TYPE);
+  }
+  
+  @Override
+  protected void outputDocumentImpl(Document document, DocumentID documentId)
+    throws IOException, GateException {
+
+    //TODO move to a thread local to save recreating each time?
+    CSVWriter csvOut = new CSVWriter(new 
OutputStreamWriter(getFileOutputStream(documentId),encoding),separatorChar,quoteChar);
+    
+    String[] data = new String[columns.length];
+    
+    if (annotationType == null || annotationType.trim().equals("")) {
+      for (int i = 0 ; i < columns.length ; ++i) {
+        data[i] = (String)getValue(columns[i], document, null);
+      }
+      csvOut.writeNext(data);
+    } else {
+      
+      List<Annotation> sorted = 
Utils.inDocumentOrder(document.getAnnotations(annotationSetName).get(annotationType));
+      for (Annotation annotation : sorted) {
+        for (int i = 0 ; i < columns.length ; ++i) {
+          data[i] = (String)getValue(columns[i], document, annotation);
+        }
+        csvOut.writeNext(data);
+      }
+    }        
+    
+    csvOut.flush();
+    
+    //baos.get().write('\n');
+    byte[] result = baos.get().toByteArray();
+    
+    csvOut.close();
+    
+    baos.get().reset();
+    try {
+      results.put(result);
+    } catch(InterruptedException e) {
+      Thread.currentThread().interrupt();
+    }
+  }
+  
+  private Object getValue(String key, Document document, Annotation within) {
+    
+    String[] parts = key.split("\\.");
+    
+    if (parts.length > 2) {      
+      logger.log(Level.WARN, "Invalid key: "+key);
+      return null;
+    }
+    
+    if (key.startsWith(".")) {
+      return document.getFeatures().get(parts[1]);
+    } else {
+      AnnotationSet annots = 
document.getAnnotations(annotationSetName).get(parts[0]);
+      
+      if (within != null) {
+        annots = Utils.getContainedAnnotations(annots, within);
+      }
+      
+      if (annots.size() == 0) return null;
+      
+      Annotation annotation = Utils.inDocumentOrder(annots).get(0);
+      
+      if (parts.length == 1)
+        return Utils.stringFor(document, annotation);
+      
+      return annotation.getFeatures().get(parts[1]);
+    }
+  }
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to