Revision: 18945 http://sourceforge.net/p/gate/code/18945 Author: ian_roberts Date: 2015-10-11 20:40:05 +0000 (Sun, 11 Oct 2015) Log Message: ----------- Twitter JSON "entities" count their offsets in terms of Unicode characters, but GATE annotations count their offsets in terms of Java char values (UTF-16 code units). Re-implemented the offset adjustment logic to account for this, to fix odd off-by-one errors decoding entities in tweets with supplementary characters like emoji.
Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java 2015-10-11 01:19:50 UTC (rev 18944) +++ gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java 2015-10-11 20:40:05 UTC (rev 18945) @@ -33,6 +33,7 @@ import java.io.IOException; import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; @@ -158,7 +159,7 @@ */ protected JsonGenerator openGenerator(OutputStream out, FeatureMap options) throws IOException { - JsonGenerator generator = MAPPER.getFactory().createGenerator(out); + JsonGenerator generator = MAPPER.getFactory().createGenerator(new OutputStreamWriter(out, "UTF-8")); generator.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET); generator.enable(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT); if(options.containsKey("exportAsArray") && ((Boolean)options.get("exportAsArray")).booleanValue()) { Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2015-10-11 01:19:50 UTC (rev 18944) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2015-10-11 20:40:05 UTC (rev 18945) @@ -147,42 +147,67 @@ this.string = content.toString(); } - private static Pattern XML_ENTITY_PATTERN = Pattern.compile("&(amp|lt|gt);"); + /** + * Characters to account for in unescaping - HTML-encoded ampersand and angle + * brackets, and supplementary characters (which don't need "unescaping" but do + * need to be accounted for in the 
repos info). + */ + private static Pattern UNESCAPE_PATTERN = Pattern.compile("&(?:amp|lt|gt);|[\\x{" + + Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ "}-\\x{" + + Integer.toHexString(Character.MAX_CODE_POINT) + "}]"); /** * Un-escape &amp;, &gt; and &lt; in the given string, populating - * the supplied {@link RepositioningInfo} to describe the offset changes. + * the supplied {@link RepositioningInfo} to describe the offset changes. Also + * record the position of any Unicode supplementary characters, as Twitter's + * entities format counts in characters (so a supplementary is 1) whereas GATE + * annotations count in Java <code>char</code> values (UTF-16 code units, so + * a supplementary counts as two). * @param str string, possibly including escaped ampersands or angle brackets * @param repos {@link RepositioningInfo} to hold offset changes * @return the unescaped string */ private String unescape(String str, RepositioningInfo repos) { StringBuffer buf = new StringBuffer(); - int correction = 0; - int lastMatchEnd = 0; - Matcher mat = XML_ENTITY_PATTERN.matcher(str); + int origOffset = 0; + int extractedOffset = 0; + Matcher mat = UNESCAPE_PATTERN.matcher(str); while(mat.find()) { - if(mat.start() != lastMatchEnd) { + if(mat.start() != origOffset) { // repositioning record for the span from end of previous match to start of this one - int nonMatchLen = mat.start() - lastMatchEnd; - repos.addPositionInfo(lastMatchEnd, nonMatchLen, lastMatchEnd - correction, nonMatchLen); - } - // repositioning record covering this match - repos.addPositionInfo(mat.start(), mat.end() - mat.start(), mat.start() - correction, 1); - correction += mat.end() - mat.start() - 1; + int nonMatchLen = mat.start() - origOffset; + repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset, nonMatchLen); + origOffset += nonMatchLen; + extractedOffset += nonMatchLen; + } + + // in most cases the original length is the number of code units the pattern matched + int origLen = 
mat.end() - mat.start(); + // and the extracted result is one code unit + int extractedLen = 1; String replace = "?"; - switch(mat.group(1)) { - case "amp": replace = "&"; break; - case "gt": replace = ">"; break; - case "lt": replace = "<"; break; + switch(mat.group()) { + case "&amp;": replace = "&"; break; + case "&gt;": replace = ">"; break; + case "&lt;": replace = "<"; break; + default: + // but in the case of supplementary characters, the original length + // (in *characters*) is 1 but the extracted length (in code units) is 2 + replace = mat.group(); + origLen = 1; + extractedLen = 2; } mat.appendReplacement(buf, replace); - lastMatchEnd = mat.end(); + // repositioning record covering this match + repos.addPositionInfo(origOffset, origLen, extractedOffset, extractedLen); + + origOffset += origLen; + extractedOffset += extractedLen; } - int tailLen = str.length() - lastMatchEnd; + int tailLen = str.length() - origOffset; if(tailLen > 0) { // repositioning record covering everything after the last match - repos.addPositionInfo(lastMatchEnd, tailLen, lastMatchEnd - correction, tailLen); + repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen + 1); } mat.appendTail(buf); return buf.toString(); Modified: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java =================================================================== --- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2015-10-11 01:19:50 UTC (rev 18944) +++ gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2015-10-11 20:40:05 UTC (rev 18945) @@ -383,40 +383,69 @@ json.flush(); } - private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&]"); + /** + * Characters to account for when escaping - ampersand, angle brackets, and supplementaries + */ + private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&\\x{" + + Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ "}-\\x{" + + Integer.toHexString(Character.MAX_CODE_POINT) + "}]"); /** * Escape all angle 
brackets and ampersands in the given string, * recording the adjustments to character offsets within the - * given {@link RepositioningInfo}. + * given {@link RepositioningInfo}. Also record supplementary + * characters (above U+FFFF), which count as two in terms of + * GATE annotation offsets (which count in Java chars) but one + * in terms of JSON (counting in Unicode characters). */ private static String escape(String str, RepositioningInfo repos) { StringBuffer buf = new StringBuffer(); - int correction = 0; - int lastMatchEnd = 0; + int origOffset = 0; + int extractedOffset = 0; Matcher mat = CHARS_TO_ESCAPE.matcher(str); while(mat.find()) { - if(mat.start() != lastMatchEnd) { + if(mat.start() != extractedOffset) { // repositioning record for the span from end of previous match to start of this one - int nonMatchLen = mat.start() - lastMatchEnd; - repos.addPositionInfo(lastMatchEnd + correction, nonMatchLen, lastMatchEnd, nonMatchLen); + int nonMatchLen = mat.start() - extractedOffset; + repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset, nonMatchLen); + origOffset += nonMatchLen; + extractedOffset += nonMatchLen; } + + // the extracted length is the number of code units matched by the pattern + int extractedLen = mat.end() - mat.start(); + int origLen = 0; String replace = "?"; switch(mat.group()) { - case "&": replace = "&amp;"; break; - case ">": replace = "&gt;"; break; - case "<": replace = "&lt;"; break; + case "&": + replace = "&amp;"; + origLen = 5; + break; + case ">": + replace = "&gt;"; + origLen = 4; + break; + case "<": + replace = "&lt;"; + origLen = 4; + break; + default: + // supplementary character, so no escaping but need to account for + // it in repositioning info + replace = mat.group(); + origLen = 1; } // repositioning record covering this match - repos.addPositionInfo(mat.start() + correction, replace.length(), mat.start(), 1); - correction += replace.length() - 1; + repos.addPositionInfo(origOffset, origLen, extractedOffset, extractedLen); 
mat.appendReplacement(buf, replace); - lastMatchEnd = mat.end(); + origOffset += origLen; + extractedOffset += extractedLen; + } - int tailLen = str.length() - lastMatchEnd; + int tailLen = str.length() - extractedOffset; if(tailLen > 0) { // repositioning record covering everything after the last match - repos.addPositionInfo(lastMatchEnd + correction, tailLen + 1, lastMatchEnd, tailLen + 1); + repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen + 1); } mat.appendTail(buf); return buf.toString(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs