Revision: 18444 http://sourceforge.net/p/gate/code/18444 Author: ian_roberts Date: 2014-11-07 17:39:34 +0000 (Fri, 07 Nov 2014) Log Message: ----------- Escape &, < and > as HTML entities &amp;, &lt; and &gt; when saving, and adjust entity indices to match. This reverses the transformation that the Twitter plugin's document format parser performs, and makes the generated JSON consistent with JSON that comes from Twitter.
Modified Paths: -------------- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java Modified: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java =================================================================== --- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2014-11-07 17:06:12 UTC (rev 18443) +++ gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2014-11-07 17:39:34 UTC (rev 18444) @@ -22,6 +22,8 @@ import java.io.Writer; import java.util.Collection; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import com.fasterxml.jackson.core.JsonEncoding; import com.fasterxml.jackson.core.JsonFactory; @@ -295,8 +297,10 @@ ObjectWriter writer = MAPPER.writer(); json.writeStartObject(); - json.writeStringField("text", doc.getContent().getContent(start, end) - .toString()); + RepositioningInfo repos = new RepositioningInfo(); + String text = escape(doc.getContent().getContent(start, end) + .toString(), repos); + json.writeStringField("text", text); json.writeFieldName("entities"); json.writeStartObject(); // if the extraFeatures already includes entities, merge them with @@ -318,8 +322,8 @@ // indices:[start, end], corrected to match the sub-range of // text we're writing json.writeArrayFieldStart("indices"); - json.writeNumber(a.getStartNode().getOffset() - start); - json.writeNumber(a.getEndNode().getOffset() - start); + json.writeNumber(repos.getOriginalPos(a.getStartNode().getOffset() - start)); + json.writeNumber(repos.getOriginalPos(a.getEndNode().getOffset() - start)); json.writeEndArray(); // end of indices if(annotationTypeProperty != null) { json.writeStringField(annotationTypeProperty, a.getType()); @@ -378,4 +382,43 @@ // as the data never leaves the JsonGenerator json.flush(); } + + private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&]"); + + /** + * Escape all angle brackets and ampersands in the given string, + * recording the adjustments to character offsets within the + * given 
{@link RepositioningInfo}. + */ + private static String escape(String str, RepositioningInfo repos) { + StringBuffer buf = new StringBuffer(); + int correction = 0; + int lastMatchEnd = 0; + Matcher mat = CHARS_TO_ESCAPE.matcher(str); + while(mat.find()) { + if(mat.start() != lastMatchEnd) { + // repositioning record for the span from end of previous match to start of this one + int nonMatchLen = mat.start() - lastMatchEnd; + repos.addPositionInfo(lastMatchEnd + correction, nonMatchLen, lastMatchEnd, nonMatchLen); + } + String replace = "?"; + switch(mat.group()) { + case "&": replace = "&amp;"; break; + case ">": replace = "&gt;"; break; + case "<": replace = "&lt;"; break; + } + // repositioning record covering this match + repos.addPositionInfo(mat.start() + correction, replace.length(), mat.start(), 1); + correction += replace.length() - 1; + mat.appendReplacement(buf, replace); + lastMatchEnd = mat.end(); + } + int tailLen = str.length() - lastMatchEnd; + if(tailLen > 0) { + // repositioning record covering everything after the last match + repos.addPositionInfo(lastMatchEnd + correction, tailLen, lastMatchEnd, tailLen); + } + mat.appendTail(buf); + return buf.toString(); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs