Revision: 18430
          http://sourceforge.net/p/gate/code/18430
Author:   ian_roberts
Date:     2014-11-05 18:09:14 +0000 (Wed, 05 Nov 2014)
Log Message:
-----------
Improved JSON exporter

- Support for a "document annotation" covering the text to be exported.  By
  default (and in the typical use case) this will be "Tweet" in the Original
  markups annotation set.  When a document contains one or more of these
  annotations then features of the annotation will be added as additional
  properties of the exported JSON object.
- Can now export multiple Tweets from one document or a whole corpus of
  documents, into a single file containing either a JSON array of objects or
  one-object-per-line streaming style.
- Can export annotations from more than one set.  One set must be nominated as
  the primary set, and unprefixed annotation types in the annotationTypes
  parameter list will be taken from that set, but other types can be
  specified as "setName:annType".

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java    2014-11-05 18:03:55 UTC (rev 18429)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java    2014-11-05 18:09:14 UTC (rev 18430)
@@ -15,52 +15,114 @@
 
 import gate.Annotation;
 import gate.AnnotationSet;
+import gate.Corpus;
+import gate.CorpusExporter;
 import gate.Document;
-import gate.DocumentExporter;
+import gate.Factory;
 import gate.FeatureMap;
+import gate.GateConstants;
+import gate.Utils;
 import gate.corpora.DocumentJsonUtils;
 import gate.creole.metadata.AutoInstance;
 import gate.creole.metadata.CreoleParameter;
 import gate.creole.metadata.CreoleResource;
 import gate.creole.metadata.Optional;
 import gate.creole.metadata.RunTime;
+import gate.util.InvalidOffsetException;
+import gate.util.LuckyException;
 
 import java.io.IOException;
 import java.io.OutputStream;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.io.SerializedString;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 @CreoleResource(name = "GATE JSON Exporter", tool = true, autoinstances = 
@AutoInstance, icon = "GATEJSON")
-public class GATEJsonExporter extends DocumentExporter {
-
+public class GATEJsonExporter extends CorpusExporter {
+  
   private static final long serialVersionUID = -8087536348560365618L;
 
-  private String annotationSetName;
+  protected static final ObjectMapper MAPPER = new ObjectMapper();
   
-  private Set<String> annotationTypes;
-
-  public String getAnnotationSetName() {
-    return annotationSetName;
-  }
-
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
+  @Optional
   @RunTime
+  @CreoleParameter(comment = "The annotation set from which " +
+               "otherwise-unspecified entity annotations will be taken")
+  public void setEntitiesAnnotationSetName(String name) {}
+  public String getEntitiesAnnotationSetName() { return null; }
+  
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
+  @RunTime
+  @CreoleParameter(comment = "Annotation types to export.  " +
+               "Plain annotation types will be taken from the set named " +
+               "by the annotationSetName parameter, entries containing " +
+               "a colon are treated as setName:type (with an empty setName " +
+               "denoting the default set).")
+  public void setAnnotationTypes(Set<String> types) {}
+  public Set<String> getAnnotationTypes() { return null; }
+  
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
+  @RunTime
+  @CreoleParameter(defaultValue = "false", comment = "Whether " +
+               "to wrap the output as a JSON array.  When exporting a corpus, 
" +
+               "true will write a JSON array of objects, one per document, " +
+               "whereas false will simply output one object per document " +
+               "separated by newlines.")
+  public void setExportAsArray(Boolean array) {}
+  public Boolean getExportAsArray() { return null; }
+  
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
+  @RunTime
+  @CreoleParameter(defaultValue = "true", comment = "Whether " +
+               "to include the document text as a \"text\" property in " +
+               "the output JSON")
+  public void setIncludeText(Boolean include) {}
+  public Boolean getIncludeText() { return null; }
+  
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
+  @RunTime
   @Optional
-  @CreoleParameter
-  public void setAnnotationSetName(String annotationSetName) {
-    this.annotationSetName = annotationSetName;
-  }
+  @CreoleParameter(defaultValue = 
GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME,
+          comment = "Annotation set in which the \"document " +
+               "annotation\" can be found.  These annotations serve to delimit 
" +
+               "the parts of the document that should be output, and the 
result " +
+               "will contain one JSON object per annotation, with the " +
+               "annotation's features as additional JSON properties.")
+  public void setDocumentAnnotationASName(String asName) {}
+  public String getDocumentAnnotationASName() { return null; }
 
-  public Set<String> getAnnotationTypes() {
-    return annotationTypes;
-  }
-
+  /**
+   * No-op, exists only as a host for the parameter annotations.
+   */
   @RunTime
-  @CreoleParameter
-  public void setAnnotationTypes(Set<String> annotationTypes) {
-    this.annotationTypes = annotationTypes;
-  }
+  @Optional
+  @CreoleParameter(defaultValue = "Tweet", comment = "Annotation type " +
+               "for \"document annotations\".  These annotations serve to 
delimit " +
+      "the parts of the document that should be output, and the result " +
+      "will contain one JSON object per annotation, with the " +
+      "annotation's features as additional JSON properties.  If " +
+      "unspecified, or if a given GATE document contains none of " +
+      "these annotations, then the whole document content will be output.")
+  public void setDocumentAnnotationType(String type) {}
+  public String getDocumentAnnotationType() { return null; }
 
   public GATEJsonExporter() {
     super("GATE JSON", "json","application/json");
@@ -70,18 +132,100 @@
   @Override
   public void export(Document doc, OutputStream out, FeatureMap options)
     throws IOException {
+    try(JsonGenerator generator = openGenerator(out, options)) {
+      export(doc, generator, options);
+    }
+  }
+  
+  public void export(Corpus corpus, OutputStream out, FeatureMap options)
+    throws IOException {
+    try(JsonGenerator generator = openGenerator(out, options)) {
+      Iterator<Document> docIter = corpus.iterator();
+      int currentDocIndex = 0;
+      while(docIter.hasNext()) {
+        boolean docWasLoaded =
+                corpus.isDocumentLoaded(currentDocIndex);
+        Document currentDoc = docIter.next();
+        try {
+          export(currentDoc, generator, options);
+        } finally {
+          // unload if necessary
+          if(!docWasLoaded) {
+            corpus.unloadDocument(currentDoc);
+            Factory.deleteResource(currentDoc);
+          }
+          currentDocIndex++;
+        }
+      }
+    }
+  }
+  
+  /**
+   * Create a JsonGenerator ready to write to the given output stream.
+   * If the specified options indicate that we want to wrap the output
+   * in an array then output the array start event in preparation.
+   */
+  protected JsonGenerator openGenerator(OutputStream out, FeatureMap options)
+    throws IOException {
+    JsonGenerator generator = MAPPER.getFactory().createGenerator(out);
+    generator.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
+    generator.enable(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT);
+    if(options.containsKey("exportAsArray") && 
((Boolean)options.get("exportAsArray")).booleanValue()) {
+      generator.writeStartArray();
+    } else {
+      // writing concatenated, put newlines in between
+      generator.setRootValueSeparator(new SerializedString("\n"));
+    }
+    
+    return generator;
+  }
 
-    AnnotationSet annots =
-      doc.getAnnotations((String)options.get("annotationSetName"));
-    
-    Collection<String> types = 
(Collection<String>)options.get("annotationTypes");
-    
-    Map<String,Collection<Annotation>> annotationsMap = new 
HashMap<String,Collection<Annotation>>();
-    
-    for (String type : types) {
-      annotationsMap.put(type, annots.get(type));
+  public void export(Document doc, JsonGenerator generator, FeatureMap options)
+    throws IOException {
+    try {
+      AnnotationSet defaultEntitiesAS =
+        doc.getAnnotations((String)options.get("entitiesAnnotationSetName"));
+      
+      Collection<String> types = 
(Collection<String>)options.get("annotationTypes");
+      
+      boolean includeText = (options.containsKey("includeText")
+              ? ((Boolean)options.get("includeText")).booleanValue() : true);
+
+      Map<String,Collection<Annotation>> annotationsMap = new HashMap<>();
+      
+      for (String type : types) {
+        String[] setAndType = type.split(":", 2);
+        if(setAndType.length == 1) {
+          annotationsMap.put(type, defaultEntitiesAS.get(type));
+        } else {
+          annotationsMap.put(type, 
doc.getAnnotations(setAndType[0]).get(setAndType[1]));
+        }
+      }
+      
+      // look for document annotations
+      AnnotationSet docAnnots = null;
+      if(options.containsKey("documentAnnotationType")) {
+        docAnnots = 
doc.getAnnotations((String)options.get("documentAnnotationASName"))
+                .get((String)options.get("documentAnnotationType"));
+      }
+      if(docAnnots == null || docAnnots.isEmpty()) {
+        // no document annotations, write everything
+        DocumentJsonUtils.writeDocument(doc, 0L, Utils.end(doc), 
annotationsMap, null, null, includeText, generator);
+      } else {
+        for(Annotation docAnnot : Utils.inDocumentOrder(docAnnots)) {
+          Map<String, Collection<Annotation>> coveredAnnotations = new 
HashMap<>();
+          for(Map.Entry<String, Collection<Annotation>> entry : 
annotationsMap.entrySet()) {
+            coveredAnnotations.put(entry.getKey(),
+                    ((AnnotationSet)entry.getValue()).getContained(
+                            Utils.start(docAnnot), Utils.end(docAnnot)));
+          }
+          DocumentJsonUtils.writeDocument(doc, Utils.start(docAnnot), 
Utils.end(docAnnot),
+                  coveredAnnotations, docAnnot.getFeatures(), null, 
includeText, generator);
+        }
+      }
+    } catch(InvalidOffsetException e) {
+      // should never happen, as all offsets come from the document itself
+      throw new LuckyException("Invalid offset found within document", e);
     }
-
-    DocumentJsonUtils.writeDocument(doc, annotationsMap, out);
   }
 }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to