Revision: 17207
          http://sourceforge.net/p/gate/code/17207
Author:   ian_roberts
Date:     2014-01-03 16:57:06 +0000 (Fri, 03 Jan 2014)
Log Message:
-----------
First cut of a utility class to output GATE document text and annotations in a
JSON format (deliberately) close to that used by Twitter to represent entities
in tweets:

{
  "text":"Text of the document ....",
  "entities":{
    "Person":[
      {"indices":[0,10], "gender":"male"},
      {"indices":[25,32], "gender":"female"}
    ],
    "Location":[
      {"indices":[75,88], "locType":"country"}
    ]
  }
}

Each annotation is represented as an object with "indices":[start,end] and the
annotation's features represented as other properties of the object.

Also includes support for (more or less) preserving the existing JSON structure
if the original document was a Tweet.

Note that this pulls the Jackson dependency up from the Twitter plugin into
gate-core, so I've removed the conflicting libraries from the plugin.

Modified Paths:
--------------
    gate/trunk/ivy.xml
    gate/trunk/plugins/Twitter/creole.xml

Added Paths:
-----------
    gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java

Removed Paths:
-------------
    gate/trunk/plugins/Twitter/lib/jackson-annotations-2.2.3.jar
    gate/trunk/plugins/Twitter/lib/jackson-core-2.2.3.jar
    gate/trunk/plugins/Twitter/lib/jackson-databind-2.2.3.jar

Modified: gate/trunk/ivy.xml
===================================================================
--- gate/trunk/ivy.xml  2013-12-24 16:30:52 UTC (rev 17206)
+++ gate/trunk/ivy.xml  2014-01-03 16:57:06 UTC (rev 17207)
@@ -90,6 +90,9 @@
 
     <!-- required for Spring Framework support in GATE -->
     <dependency org="org.springframework" name="spring-aop" rev="2.0.8" />
+    
+    <!-- required for outputting documents as JSON -->
+    <dependency org="com.fasterxml.jackson.core" name="jackson-databind" 
rev="2.2.3" />
 
     <!-- required for indexing datastores (not used by ANNIC) -->
     <dependency org="org.apache.lucene" name="lucene-core" rev="3.5.0" />

Modified: gate/trunk/plugins/Twitter/creole.xml
===================================================================
--- gate/trunk/plugins/Twitter/creole.xml       2013-12-24 16:30:52 UTC (rev 
17206)
+++ gate/trunk/plugins/Twitter/creole.xml       2014-01-03 16:57:06 UTC (rev 
17207)
@@ -2,8 +2,5 @@
 <CREOLE-DIRECTORY>  
   <JAR SCAN="true">twitter.jar</JAR>
   <JAR>lib/jaspell.jar</JAR>
-  <JAR>lib/jackson-core-2.2.3.jar</JAR>
-  <JAR>lib/jackson-databind-2.2.3.jar</JAR>
-  <JAR>lib/jackson-annotations-2.2.3.jar</JAR>
   <REQUIRES>../Tagger_Stanford</REQUIRES>
 </CREOLE-DIRECTORY> 

Deleted: gate/trunk/plugins/Twitter/lib/jackson-annotations-2.2.3.jar
===================================================================
(Binary files differ)

Deleted: gate/trunk/plugins/Twitter/lib/jackson-core-2.2.3.jar
===================================================================
(Binary files differ)

Deleted: gate/trunk/plugins/Twitter/lib/jackson-databind-2.2.3.jar
===================================================================
(Binary files differ)

Added: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java
===================================================================
--- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java                     
        (rev 0)
+++ gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java     2014-01-03 
16:57:06 UTC (rev 17207)
@@ -0,0 +1,189 @@
+/*
+ *  DocumentJsonUtils.java
+ *
+ *  Copyright (c) 1995-2012, The University of Sheffield. See the file
+ *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ *  This file is part of GATE (see http://gate.ac.uk/), and is free
+ *  software, licenced under the GNU Library General Public License,
+ *  Version 2, June 1991 (in the distribution as file licence.html,
+ *  and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ *  Ian Roberts, 20/Dec/2013
+ *
+ *  $Id$
+ */
+package gate.corpora;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+import com.fasterxml.jackson.core.JsonGenerationException;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectWriter;
+
+import gate.Annotation;
+import gate.Document;
+import gate.util.GateRuntimeException;
+import gate.util.InvalidOffsetException;
+
+/**
+ * This class contains utility methods to output GATE documents in a
+ * JSON format which is (deliberately) close to the format used by
+ * Twitter to represent entities such as user mentions and hashtags in
+ * Tweets.
+ * 
+ * @author ian
+ * 
+ */
+public class DocumentJsonUtils {
+
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+
+  /**
+   * Write a GATE document to the specified JsonGenerator. The document
+   * text will be written as a property named "text" and the specified
+   * annotations will be written as "entities".
+   * 
+   * @param doc the document to write
+   * @param annotationsMap annotations to write.
+   * @param json the {@link JsonGenerator} to write to.
+   * @throws JsonGenerationException if a problem occurs while
+   *           generating the JSON
+   * @throws IOException if an I/O error occurs.
+   */
+  public static void writeDocument(Document doc,
+          Map<String, Collection<Annotation>> annotationsMap, JsonGenerator 
json)
+          throws JsonGenerationException, IOException {
+    try {
+      writeDocument(doc, 0L, doc.getContent().size(), annotationsMap, json);
+    } catch(InvalidOffsetException e) {
+      // shouldn't happen
+      throw new GateRuntimeException(
+              "Got invalid offset exception when passing "
+                      + "offsets that are known to be valid");
+    }
+  }
+
+  /**
+   * Write a substring of a GATE document to the specified
+   * JsonGenerator. The specified window of document text will be
+   * written as a property named "text" and the specified annotations
+   * will be written as "entities", with their offsets adjusted to be
+   * relative to the specified window.
+   * 
+   * @param doc the document to write
+   * @param start the start offset of the segment to write
+   * @param end the end offset of the segment to write
+   * @param annotationsMap annotations to write.
+   * @param json the {@link JsonGenerator} to write to.
+   * @throws JsonGenerationException if a problem occurs while
+   *           generating the JSON
+   * @throws IOException if an I/O error occurs.
+   */
+  public static void writeDocument(Document doc, Long start, Long end,
+          Map<String, Collection<Annotation>> annotationsMap, JsonGenerator 
json)
+          throws JsonGenerationException, IOException, InvalidOffsetException {
+    writeDocument(doc, start, end, annotationsMap, null, json);
+  }
+
+  /**
+   * Write a substring of a GATE document to the specified
+   * JsonGenerator. The specified window of document text will be
+   * written as a property named "text" and the specified annotations
+   * will be written as "entities", with their offsets adjusted to be
+   * relative to the specified window.
+   * 
+   * @param doc the document to write
+   * @param start the start offset of the segment to write
+   * @param end the end offset of the segment to write
+   * @param annotations annotations to write.
+   * @param extraFeatures additional properties to add to the generated
+   *          JSON. If the map includes a "text" key this will be
+   *          ignored, and if it contains a key "entities" whose value
+   *          is a map then these entities will be merged with the
+   *          generated ones derived from the annotationsMap. This would
+   *          typically be used for documents that were originally
+   *          derived from Twitter data, to re-create the original JSON.
+   * @param json the {@link JsonGenerator} to write to.
+   * @throws JsonGenerationException if a problem occurs while
+   *           generating the JSON
+   * @throws IOException if an I/O error occurs.
+   */
+  public static void writeDocument(Document doc, Long start, Long end,
+          Map<String, Collection<Annotation>> annotationsMap,
+          Map<?, ?> extraFeatures, JsonGenerator json)
+          throws JsonGenerationException, IOException, InvalidOffsetException {
+
+    ObjectWriter writer = MAPPER.writer();
+
+    json.writeStartObject();
+    json.writeStringField("text", doc.getContent().getContent(start, end)
+            .toString());
+    json.writeFieldName("entities");
+    json.writeStartObject();
+    // if the extraFeatures already includes entities, merge them with
+    // the new ones we create
+    Object entitiesExtraFeature = extraFeatures.get("entities");
+    Map<?, ?> entitiesMap = null;
+    if(entitiesExtraFeature instanceof Map) {
+      entitiesMap = (Map<?, ?>)entitiesExtraFeature;
+    }
+    for(Map.Entry<String, Collection<Annotation>> annsByType : annotationsMap
+            .entrySet()) {
+      String annotationType = annsByType.getKey();
+      Collection<Annotation> annotations = annsByType.getValue();
+      json.writeFieldName(annotationType);
+      json.writeStartArray();
+      for(Annotation a : annotations) {
+        json.writeStartObject();
+        // indices:[start, end], corrected to match the sub-range of
+        // text we're writing
+        json.writeArrayFieldStart("indices");
+        json.writeNumber(a.getStartNode().getOffset() - start);
+        json.writeNumber(a.getEndNode().getOffset() - start);
+        json.writeEndArray(); // end of indices
+        // other features
+        for(Map.Entry<?, ?> feature : a.getFeatures().entrySet()) {
+          json.writeFieldName(String.valueOf(feature.getKey()));
+          writer.writeValue(json, feature.getValue());
+        }
+        json.writeEndObject(); // end of annotation
+      }
+      // add any entities from the extraFeatures map
+      if(entitiesMap != null
+              && entitiesMap.get(annotationType) instanceof Collection) {
+        for(Object ent : (Collection<?>)entitiesMap.get(annotationType)) {
+          writer.writeValue(json, ent);
+        }
+      }
+      json.writeEndArray();
+    }
+    if(entitiesMap != null) {
+      for(Map.Entry<?, ?> entitiesEntry : entitiesMap.entrySet()) {
+        if(!annotationsMap.containsKey(entitiesEntry.getKey())) {
+          // not an entity type we've already seen
+          json.writeFieldName(String.valueOf(entitiesEntry.getKey()));
+          writer.writeValue(json, entitiesEntry.getValue());
+        }
+      }
+    }
+
+    json.writeEndObject(); // end of entities
+
+    if(extraFeatures != null) {
+      for(Map.Entry<?, ?> feature : extraFeatures.entrySet()) {
+        if("text".equals(feature.getKey())
+                || "entities".equals(feature.getKey())) {
+          // already dealt with text and entities
+          continue;
+        }
+        json.writeFieldName(String.valueOf(feature.getKey()));
+        writer.writeValue(json, feature.getValue());
+      }
+    }
+    json.writeEndObject(); // end of document
+  }
+}


Property changes on: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to