Revision: 18945
          http://sourceforge.net/p/gate/code/18945
Author:   ian_roberts
Date:     2015-10-11 20:40:05 +0000 (Sun, 11 Oct 2015)
Log Message:
-----------
Twitter JSON "entities" count their offsets in terms of Unicode characters 
(code points), but GATE annotations count their offsets in terms of Java char 
values (UTF-16 code units).  Re-implemented the offset adjustment logic to 
account for this, fixing odd off-by-one errors when decoding entities in tweets 
that contain supplementary characters such as emoji.

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
    gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java

Modified: 
gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java    
2015-10-11 01:19:50 UTC (rev 18944)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java    
2015-10-11 20:40:05 UTC (rev 18945)
@@ -33,6 +33,7 @@
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.OutputStreamWriter;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -158,7 +159,7 @@
    */
   protected JsonGenerator openGenerator(OutputStream out, FeatureMap options)
     throws IOException {
-    JsonGenerator generator = MAPPER.getFactory().createGenerator(out);
+    JsonGenerator generator = MAPPER.getFactory().createGenerator(new 
OutputStreamWriter(out, "UTF-8"));
     generator.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
     generator.enable(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT);
     if(options.containsKey("exportAsArray") && 
((Boolean)options.get("exportAsArray")).booleanValue()) {

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      
2015-10-11 01:19:50 UTC (rev 18944)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      
2015-10-11 20:40:05 UTC (rev 18945)
@@ -147,42 +147,67 @@
     this.string = content.toString();
   }
   
-  private static Pattern XML_ENTITY_PATTERN = Pattern.compile("&(amp|lt|gt);");
+  /**
+   * Characters to account for in unescaping - HTML-encoded ampersand and angle
+   * brackets, and supplementary characters (which don't need "unescaping" but 
do
+   * need to be accounted for in the repos info).
+   */
+  private static Pattern UNESCAPE_PATTERN = 
Pattern.compile("&(?:amp|lt|gt);|[\\x{" +
+    Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ "}-\\x{" +
+    Integer.toHexString(Character.MAX_CODE_POINT) + "}]");
   
   /**
    * Un-escape &, > and < in the given string, populating
-   * the supplied {@link RepositioningInfo} to describe the offset changes.
+   * the supplied {@link RepositioningInfo} to describe the offset changes.  
Also
+   * record the position of any Unicode supplementary characters, as Twitter's
+   * entities format counts in characters (so a supplementary is 1) whereas 
GATE
+   * annotations count in Java <code>char</code> values (UTF-16 code units, so
+   * a supplementary counts as two).
    * @param str string, possibly including escaped ampersands or angle brackets
    * @param repos {@link RepositioningInfo} to hold offset changes
    * @return the unescaped string
    */
   private String unescape(String str, RepositioningInfo repos) {
     StringBuffer buf = new StringBuffer();
-    int correction = 0;
-    int lastMatchEnd = 0;
-    Matcher mat = XML_ENTITY_PATTERN.matcher(str);
+    int origOffset = 0;
+    int extractedOffset = 0;
+    Matcher mat = UNESCAPE_PATTERN.matcher(str);
     while(mat.find()) {
-      if(mat.start() != lastMatchEnd) {
+      if(mat.start() != origOffset) {
         // repositioning record for the span from end of previous match to 
start of this one
-        int nonMatchLen = mat.start() - lastMatchEnd;
-        repos.addPositionInfo(lastMatchEnd, nonMatchLen, lastMatchEnd - 
correction, nonMatchLen);
-      }
-      // repositioning record covering this match
-      repos.addPositionInfo(mat.start(), mat.end() - mat.start(), mat.start() 
- correction, 1);
-      correction += mat.end() - mat.start() - 1;
+        int nonMatchLen = mat.start() - origOffset;
+        repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset, 
nonMatchLen);
+        origOffset += nonMatchLen;
+        extractedOffset += nonMatchLen;
+      }      
+      
+      // in most cases the original length is the number of code units the 
pattern matched
+      int origLen = mat.end() - mat.start();
+      // and the extracted result is one code unit
+      int extractedLen = 1;
       String replace = "?";
-      switch(mat.group(1)) {
-        case "amp": replace = "&"; break;
-        case "gt": replace = ">"; break;
-        case "lt": replace = "<"; break;
+      switch(mat.group()) {
+        case "&amp;": replace = "&"; break;
+        case "&gt;": replace = ">"; break;
+        case "&lt;": replace = "<"; break;
+        default:
+          // but in the case of supplementary characters, the original length
+          // (in *characters*) is 1 but the extracted length (in code units) 
is 2
+          replace = mat.group();
+          origLen = 1;
+          extractedLen = 2;
       }
       mat.appendReplacement(buf, replace);
-      lastMatchEnd = mat.end();
+      // repositioning record covering this match
+      repos.addPositionInfo(origOffset, origLen, extractedOffset, 
extractedLen);
+
+      origOffset += origLen;
+      extractedOffset += extractedLen;
     }
-    int tailLen = str.length() - lastMatchEnd;
+    int tailLen = str.length() - origOffset;
     if(tailLen > 0) {
       // repositioning record covering everything after the last match
-      repos.addPositionInfo(lastMatchEnd, tailLen, lastMatchEnd - correction, 
tailLen);
+      repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen 
+ 1);
     }
     mat.appendTail(buf);
     return buf.toString();

Modified: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java
===================================================================
--- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java     2015-10-11 
01:19:50 UTC (rev 18944)
+++ gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java     2015-10-11 
20:40:05 UTC (rev 18945)
@@ -383,40 +383,69 @@
     json.flush();
   }
 
-  private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&]");
+  /**
+   * Characters to account for when escaping - ampersand, angle brackets, and 
supplementaries
+   */
+  private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&\\x{" +
+          Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ 
"}-\\x{" +
+          Integer.toHexString(Character.MAX_CODE_POINT) + "}]");
   
   /**
    * Escape all angle brackets and ampersands in the given string,
    * recording the adjustments to character offsets within the
-   * given {@link RepositioningInfo}.
+   * given {@link RepositioningInfo}.  Also record supplementary
+   * characters (above U+FFFF), which count as two in terms of
+   * GATE annotation offsets (which count in Java chars) but one
+   * in terms of JSON (counting in Unicode characters).
    */
   private static String escape(String str, RepositioningInfo repos) {
     StringBuffer buf = new StringBuffer();
-    int correction = 0;
-    int lastMatchEnd = 0;
+    int origOffset = 0;
+    int extractedOffset = 0;
     Matcher mat = CHARS_TO_ESCAPE.matcher(str);
     while(mat.find()) {
-      if(mat.start() != lastMatchEnd) {
+      if(mat.start() != extractedOffset) {
         // repositioning record for the span from end of previous match to 
start of this one
-        int nonMatchLen = mat.start() - lastMatchEnd;
-        repos.addPositionInfo(lastMatchEnd + correction, nonMatchLen, 
lastMatchEnd, nonMatchLen);
+        int nonMatchLen = mat.start() - extractedOffset;
+        repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset, 
nonMatchLen);
+        origOffset += nonMatchLen;
+        extractedOffset += nonMatchLen;
       }
+
+      // the extracted length is the number of code units matched by the 
pattern
+      int extractedLen = mat.end() - mat.start();
+      int origLen = 0;
       String replace = "?";
       switch(mat.group()) {
-        case "&": replace = "&amp;"; break;
-        case ">": replace = "&gt;"; break;
-        case "<": replace = "&lt;"; break;
+        case "&":
+          replace = "&amp;";
+          origLen = 5;
+          break;
+        case ">":
+          replace = "&gt;";
+          origLen = 4;
+          break;
+        case "<":
+          replace = "&lt;";
+          origLen = 4;
+          break;
+        default:
+          // supplementary character, so no escaping but need to account for
+          // it in repositioning info
+          replace = mat.group();
+          origLen = 1;
       }
       // repositioning record covering this match
-      repos.addPositionInfo(mat.start() + correction, replace.length(), 
mat.start(), 1);
-      correction += replace.length() - 1;
+      repos.addPositionInfo(origOffset, origLen, extractedOffset, 
extractedLen);
       mat.appendReplacement(buf, replace);
-      lastMatchEnd = mat.end();
+      origOffset += origLen;
+      extractedOffset += extractedLen;
+
     }
-    int tailLen = str.length() - lastMatchEnd;
+    int tailLen = str.length() - extractedOffset;
     if(tailLen > 0) {
       // repositioning record covering everything after the last match
-      repos.addPositionInfo(lastMatchEnd + correction, tailLen + 1, 
lastMatchEnd, tailLen + 1);
+      repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen 
+ 1);
     }
     mat.appendTail(buf);
     return buf.toString();

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to