Revision: 18854
http://sourceforge.net/p/gate/code/18854
Author: markagreenwood
Date: 2015-08-05 09:53:42 +0000 (Wed, 05 Aug 2015)
Log Message:
-----------
added support for streaming output to a single CSV file and updated the sample
to show it in use
Modified Paths:
--------------
gate/trunk/plugins/Format_CSV/gcp/csv4gcp.jar
gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml
Added Paths:
-----------
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
Modified: gate/trunk/plugins/Format_CSV/gcp/csv4gcp.jar
===================================================================
(Binary files differ)
Modified: gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml 2015-08-05 05:18:08 UTC
(rev 18853)
+++ gate/trunk/plugins/Format_CSV/gcp/sample/batch.xml 2015-08-05 09:53:42 UTC
(rev 18854)
@@ -5,6 +5,12 @@
<report file="report.xml" />
+ <!--
+ This input handler reads a CSV file whose first row contains the column
+ labels. It builds documents by pulling the text from column 1 (i.e. the
+ second column) and adding the other columns as document features, using
+ the labels from the first row as the feature names.
+ -->
<input class="gate.cloud.io.csv.CSVStreamingInputHandler"
srcFile="input.csv"
encoding="UTF-8"
@@ -13,9 +19,31 @@
column="1"
labelledColumns="true" />
+ <!--
+ Stores all the processed docs in a single CSV file with columns for
+ the author doc feature (a column name starting with a . refers to a doc
+ feature), the document content under the first Token annotation, and the
+ category feature of the first Token annotation. You could also specify
+ an annotationSetName and/or an annotationType. If you specify the latter
+ then you get one row per annotation within the document. This is very
+ similar to how the Configurable Exporter in the Tools plugin works.
+ -->
<output
dir="output"
encoding="UTF-8"
+ separator=","
+ quote='"'
+ columns=".author,Token,Token.category"
+ fileExtension=".csv"
+ class="gate.cloud.io.csv.CSVStreamingOutputHandler" />
+
+ <!--
+ Stores the annotated GATE documents to ease debugging the CSV
output
+ -->
+ <output
+ dir="output"
+ encoding="UTF-8"
fileExtension=".GATE.xml"
class="gate.cloud.io.file.GATEStandOffFileOutputHandler" />
+
</batch>
Added:
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
===================================================================
---
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
(rev 0)
+++
gate/trunk/plugins/Format_CSV/gcp/src/gate/cloud/io/csv/CSVStreamingOutputHandler.java
2015-08-05 09:53:42 UTC (rev 18854)
@@ -0,0 +1,128 @@
+package gate.cloud.io.csv;
+
+import static gate.cloud.io.IOConstants.PARAM_ENCODING;
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.Document;
+import gate.Utils;
+import gate.cloud.batch.DocumentID;
+import gate.cloud.io.json.JSONStreamingOutputHandler;
+import gate.util.GateException;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+import au.com.bytecode.opencsv.CSVWriter;
+
+/**
+ * Output handler that renders each processed document as one or more CSV rows
+ * and hands the encoded bytes to the streaming machinery inherited from
+ * {@link JSONStreamingOutputHandler}.
+ *
+ * <p>Each entry of the {@code columns} parameter is one of:
+ * <ul>
+ * <li>{@code .feature} - the value of the named document feature;</li>
+ * <li>{@code Type} - the document text under the first annotation of that
+ * type;</li>
+ * <li>{@code Type.feature} - the named feature of the first annotation of
+ * that type.</li>
+ * </ul>
+ *
+ * <p>If {@code annotationType} is not set, one row is written per document.
+ * If it is set, one row is written per annotation of that type (in document
+ * order) and annotation lookups are restricted to annotations contained
+ * within it.
+ */
+public class CSVStreamingOutputHandler extends JSONStreamingOutputHandler {
+
+  public static final String PARAM_SEPARATOR_CHARACTER = "separator";
+  public static final String PARAM_QUOTE_CHARACTER = "quote";
+  public static final String PARAM_COLUMNS = "columns";
+  public static final String PARAM_ANNOTATION_SET_NAME = "annotationSetName";
+  public static final String PARAM_ANNOTATION_TYPE = "annotationType";
+
+  private static final Logger logger = Logger
+          .getLogger(CSVStreamingOutputHandler.class);
+
+  /** Character encoding used when turning CSV text into bytes. */
+  protected String encoding;
+
+  /** Character placed between cells. */
+  protected char separatorChar;
+
+  /** Character used to quote cells. */
+  protected char quoteChar;
+
+  /**
+   * Annotation set to read from, and the optional annotation type that
+   * switches the handler to one-row-per-annotation mode.
+   */
+  protected String annotationSetName, annotationType;
+
+  /** Column specifications, in output order. */
+  protected String[] columns;
+
+  /**
+   * Reads the handler configuration.
+   *
+   * @param configData the attributes of the {@code <output>} element
+   * @throws IOException if the superclass configuration fails
+   * @throws GateException if the superclass configuration fails, or if the
+   *         required {@code encoding} or {@code columns} parameter is
+   *         missing or empty
+   */
+  @Override
+  protected void configImpl(Map<String, String> configData) throws IOException,
+          GateException {
+
+    super.configImpl(configData);
+
+    // fail at configuration time with a clear message rather than with a
+    // NullPointerException while writing the first document
+    encoding = requireParam(configData, PARAM_ENCODING);
+    columns = requireParam(configData, PARAM_COLUMNS).split(",");
+
+    // separator and quote are optional; fall back to the opencsv defaults
+    separatorChar =
+            charParam(configData, PARAM_SEPARATOR_CHARACTER,
+                    CSVWriter.DEFAULT_SEPARATOR);
+    quoteChar =
+            charParam(configData, PARAM_QUOTE_CHARACTER,
+                    CSVWriter.DEFAULT_QUOTE_CHARACTER);
+
+    annotationSetName = configData.get(PARAM_ANNOTATION_SET_NAME);
+    annotationType = configData.get(PARAM_ANNOTATION_TYPE);
+  }
+
+  /**
+   * Writes the CSV row(s) for one document into the per-thread buffer and
+   * queues the resulting bytes for output.
+   *
+   * @param document the processed document
+   * @param documentId the ID of the document within the batch
+   * @throws IOException if the CSV data cannot be written or encoded
+   * @throws GateException declared for compatibility with the superclass
+   */
+  @Override
+  protected void outputDocumentImpl(Document document, DocumentID documentId)
+          throws IOException, GateException {
+
+    //TODO move to a thread local to save recreating each time?
+    CSVWriter csvOut =
+            new CSVWriter(new OutputStreamWriter(
+                    getFileOutputStream(documentId), encoding), separatorChar,
+                    quoteChar);
+
+    byte[] result;
+    try {
+      if(annotationType == null || annotationType.trim().isEmpty()) {
+        // one row for the whole document
+        csvOut.writeNext(buildRow(document, null));
+      } else {
+        // one row per annotation of the configured type, in document order
+        List<Annotation> sorted =
+                Utils.inDocumentOrder(document.getAnnotations(
+                        annotationSetName).get(annotationType));
+        for(Annotation annotation : sorted) {
+          csvOut.writeNext(buildRow(document, annotation));
+        }
+      }
+
+      // push everything through the writer before copying the buffer
+      csvOut.flush();
+      result = baos.get().toByteArray();
+    } finally {
+      // release the writer and clear the buffer even if a row failed, so a
+      // partially written row cannot linger in the buffer
+      try {
+        csvOut.close();
+      } finally {
+        baos.get().reset();
+      }
+    }
+
+    try {
+      results.put(result);
+    } catch(InterruptedException e) {
+      Thread.currentThread().interrupt();
+    }
+  }
+
+  /**
+   * Builds the cells of a single row.
+   *
+   * @param document the document being exported
+   * @param within the annotation the row is built for, or {@code null} for
+   *        a document-level row
+   * @return one cell per configured column; a cell is {@code null} when no
+   *         value was found
+   */
+  private String[] buildRow(Document document, Annotation within) {
+    String[] data = new String[columns.length];
+    for(int i = 0; i < columns.length; ++i) {
+      // feature values are arbitrary objects, so render them with
+      // toString() instead of casting to String
+      Object value = getValue(columns[i], document, within);
+      data[i] = (value == null) ? null : value.toString();
+    }
+    return data;
+  }
+
+  /**
+   * Resolves one column specification against a document.
+   *
+   * @param key the column specification ({@code .feature}, {@code Type} or
+   *        {@code Type.feature})
+   * @param document the document being exported
+   * @param within if not {@code null}, only annotations contained within
+   *        this annotation are considered
+   * @return the value, or {@code null} if the key is invalid or nothing
+   *         matches
+   */
+  private Object getValue(String key, Document document, Annotation within) {
+
+    String[] parts = key.split("\\.");
+
+    if(parts.length > 2) {
+      logger.log(Level.WARN, "Invalid key: " + key);
+      return null;
+    }
+
+    if(key.startsWith(".")) {
+      // String.split drops trailing empty strings, so a key made only of
+      // dots yields no feature name at all
+      if(parts.length < 2) {
+        logger.log(Level.WARN, "Invalid key: " + key);
+        return null;
+      }
+      return document.getFeatures().get(parts[1]);
+    }
+
+    AnnotationSet annots =
+            document.getAnnotations(annotationSetName).get(parts[0]);
+
+    if(within != null) {
+      annots = Utils.getContainedAnnotations(annots, within);
+    }
+
+    if(annots.size() == 0) return null;
+
+    // only the first matching annotation contributes to the cell
+    Annotation annotation = Utils.inDocumentOrder(annots).get(0);
+
+    if(parts.length == 1) return Utils.stringFor(document, annotation);
+
+    return annotation.getFeatures().get(parts[1]);
+  }
+
+  /**
+   * Fetches a mandatory configuration parameter.
+   *
+   * @param configData the handler configuration
+   * @param name the parameter name
+   * @return the non-empty parameter value
+   * @throws GateException if the parameter is missing or empty
+   */
+  private static String requireParam(Map<String, String> configData,
+          String name) throws GateException {
+    String value = configData.get(name);
+    if(value == null || value.isEmpty()) {
+      throw new GateException("CSVStreamingOutputHandler requires a "
+              + "non-empty \"" + name + "\" parameter");
+    }
+    return value;
+  }
+
+  /**
+   * Fetches an optional single-character configuration parameter.
+   *
+   * @param configData the handler configuration
+   * @param name the parameter name
+   * @param defaultValue the character to use when the parameter is missing
+   *        or empty
+   * @return the first character of the parameter value, or the default
+   */
+  private static char charParam(Map<String, String> configData, String name,
+          char defaultValue) {
+    String value = configData.get(name);
+    return (value == null || value.isEmpty()) ? defaultValue : value.charAt(0);
+  }
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs