Revision: 18442
          http://sourceforge.net/p/gate/code/18442
Author:   ian_roberts
Date:     2014-11-07 16:48:19 +0000 (Fri, 07 Nov 2014)
Log Message:
-----------
Massage entity offsets correctly in the presence of escaped &, < and > characters.
Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-11-07 14:38:41 UTC (rev 18441) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-11-07 16:48:19 UTC (rev 18442) @@ -13,6 +13,7 @@ import gate.Factory; import gate.FeatureMap; +import gate.corpora.RepositioningInfo; import gate.util.Strings; import java.util.ArrayList; import java.util.HashSet; @@ -20,6 +21,9 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.apache.commons.lang.StringEscapeUtils; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -80,10 +84,12 @@ String key = keys.next(); if (key.equals(TweetUtils.DEFAULT_TEXT_ATTRIBUTE)) { - string = StringEscapeUtils.unescapeHtml(json.get(key).asText()); - processEntities(json, 0L); - } - else { + RepositioningInfo repos = new RepositioningInfo(); + string = unescape(json.get(key).asText(), repos); + if(handleEntities) processEntities(json, 0L, repos); + } else if(key.equals("entities") && handleEntities) { + // do nothing - don't add entities as a feature + } else { features.put(key.toString(), TweetUtils.process(json.get(key))); } } @@ -112,11 +118,17 @@ if (featuresFound.containsKey(cKey)) { int start = content.length(); // Use GATE's String conversion in case there are maps or lists. 
- content.append(Strings.toString(featuresFound.get(cKey))); + String str = Strings.toString(featuresFound.get(cKey)); + RepositioningInfo repos = null; + if(TweetUtils.DEFAULT_TEXT_ATTRIBUTE.equals(cKey)) { + repos = new RepositioningInfo(); + str = unescape(str, repos); + } + content.append(str); this.annotations.add(new PreAnnotation(start, content.length(), cKey)); if(handleEntities && TweetUtils.DEFAULT_TEXT_ATTRIBUTE.equals(cKey)) { // only process entities within "text" - processEntities(json, start); + processEntities(json, start, repos); } content.append('\n'); } @@ -134,6 +146,47 @@ this.annotations.add(new PreAnnotation(0, content.length(), TweetUtils.TWEET_ANNOTATION_TYPE, annoFeatures)); this.string = content.toString(); } + + private static Pattern XML_ENTITY_PATTERN = Pattern.compile("&(amp|lt|gt);"); + + /** + * Un-escape &amp;, &gt; and &lt; in the given string, populating + * the supplied {@link RepositioningInfo} to describe the offset changes. + * @param str string, possibly including escaped ampersands or angle brackets + * @param repos {@link RepositioningInfo} to hold offset changes + * @return the unescaped string + */ + private String unescape(String str, RepositioningInfo repos) { + StringBuffer buf = new StringBuffer(); + int correction = 0; + int lastMatchEnd = 0; + Matcher mat = XML_ENTITY_PATTERN.matcher(str); + while(mat.find()) { + if(mat.start() != lastMatchEnd) { + // repositioning record for the span from end of previous match to start of this one + int nonMatchLen = mat.start() - lastMatchEnd; + repos.addPositionInfo(lastMatchEnd, nonMatchLen, lastMatchEnd - correction, nonMatchLen); + } + // repositioning record covering this match + repos.addPositionInfo(mat.start(), mat.end() - mat.start(), mat.start() - correction, 1); + correction += mat.end() - mat.start() - 1; + String replace = "?"; + switch(mat.group(1)) { + case "amp": replace = "&"; break; + case "gt": replace = ">"; break; + case "lt": replace = "<"; break; + } + 
mat.appendReplacement(buf, replace); + lastMatchEnd = mat.end(); + } + int tailLen = str.length() - lastMatchEnd; + if(tailLen > 0) { + // repositioning record covering everything after the last match + repos.addPositionInfo(lastMatchEnd, tailLen, lastMatchEnd - correction, tailLen); + } + mat.appendTail(buf); + return buf.toString(); + } /** * Process the "entities" property of this json object into annotations, @@ -143,7 +196,7 @@ * @param startOffset offset correction if the text is not the first of * the content keys. */ - private void processEntities(JsonNode json, long startOffset) { + private void processEntities(JsonNode json, long startOffset, RepositioningInfo repos) { JsonNode entitiesNode = json.get(TweetUtils.ENTITIES_ATTRIBUTE); if(entitiesNode == null || !entitiesNode.isObject()) { // no entities, nothing to do @@ -168,8 +221,8 @@ if(indicesList.get(0) instanceof Number && indicesList.get(1) instanceof Number) { // finally we know we have a valid entity features.remove("indices"); - long annStart = startOffset + ((Number)indicesList.get(0)).longValue(); - long annEnd = startOffset + ((Number)indicesList.get(1)).longValue(); + long annStart = repos.getExtractedPos(startOffset + ((Number)indicesList.get(0)).longValue()); + long annEnd = repos.getExtractedPos(startOffset + ((Number)indicesList.get(1)).longValue()); if(setAndType.length == 2) { // explicit annotation set name annotations.add(new PreAnnotation(annStart, annEnd, setAndType[0], setAndType[1], features)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs