Revision: 18420 http://sourceforge.net/p/gate/code/18420 Author: ian_roberts Date: 2014-10-30 19:26:45 +0000 (Thu, 30 Oct 2014) Log Message: ----------- Create annotations from the "entities" property of Twitter JSON
In Twitter JSON the "entities" property is a map from entity type to a list of entities, where each entity is an object with an "indices" property giving its offsets and other properties giving its features. I've added support for turning this structure into annotations in the Original markups set. The DocumentFormat has this behaviour enabled by default; the populator currently does not (principle of least surprise - I'll enable it once I've added a checkbox to the GUI to control it). Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-30 18:42:13 UTC (rev 18419) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-30 19:26:45 UTC (rev 18420) @@ -75,7 +75,8 @@ input = inputUrl.openStream(); // TODO Detect & handle gzipped input. 
- TweetStreamIterator tweetSource = new TweetStreamIterator(input, contentKeys, featureKeys, false); + // TODO handling of entities, once there's GUI to control it + TweetStreamIterator tweetSource = new TweetStreamIterator(input, contentKeys, featureKeys, false, false); int tweetCounter = 0; int tweetDocCounter = 0; Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-10-30 18:42:13 UTC (rev 18419) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-10-30 19:26:45 UTC (rev 18420) @@ -18,9 +18,11 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringEscapeUtils; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; public class Tweet { @@ -51,12 +53,16 @@ public static Tweet readTweet(JsonNode json, List<String> contentKeys, List<String> featureKeys) { + return readTweet(json, contentKeys, featureKeys, true); + } + + public static Tweet readTweet(JsonNode json, List<String> contentKeys, List<String> featureKeys, boolean handleEntities) { if ( (contentKeys == null) || (featureKeys == null) ) { - return new Tweet(json); + return new Tweet(json, handleEntities); } // implied else - return new Tweet(json, contentKeys, featureKeys); + return new Tweet(json, contentKeys, featureKeys, handleEntities); } @@ -64,7 +70,7 @@ * Used by the JSONTWeetFormat; the DocumentContent contains only the main text; * the annotation feature map contains all the other JSON data, recursively. 
*/ - private Tweet(JsonNode json) { + private Tweet(JsonNode json, boolean handleEntities) { string = ""; Iterator<String> keys = json.fieldNames(); FeatureMap features = Factory.newFeatureMap(); @@ -75,6 +81,7 @@ if (key.equals(TweetUtils.DEFAULT_TEXT_ATTRIBUTE)) { string = StringEscapeUtils.unescapeHtml(json.get(key).asText()); + processEntities(json, 0L); } else { features.put(key.toString(), TweetUtils.process(json.get(key))); @@ -91,7 +98,7 @@ * @param featureKeys JSON paths whose values should be stored in the main * annotation's features */ - private Tweet(JsonNode json, List<String> contentKeys, List<String> featureKeys) { + private Tweet(JsonNode json, List<String> contentKeys, List<String> featureKeys, boolean handleEntities) { StringBuilder content = new StringBuilder(); List<String> keepers = new ArrayList<String>(); keepers.addAll(contentKeys); @@ -107,6 +114,10 @@ // Use GATE's String conversion in case there are maps or lists. content.append(Strings.toString(featuresFound.get(cKey))); this.annotations.add(new PreAnnotation(start, content.length(), cKey)); + if(handleEntities && TweetUtils.DEFAULT_TEXT_ATTRIBUTE.equals(cKey)) { + // only process entities within "text" + processEntities(json, start); + } content.append('\n'); } } @@ -124,5 +135,45 @@ this.string = content.toString(); } + /** + * Process the "entities" property of this json object into annotations, + * shifting their offsets by the specified amount. + * + * @param json the Tweet json object + * @param startOffset offset correction if the text is not the first of + * the content keys. 
+ */ + private void processEntities(JsonNode json, long startOffset) { + JsonNode entitiesNode = json.get(TweetUtils.ENTITIES_ATTRIBUTE); + if(entitiesNode == null || !entitiesNode.isObject()) { + // no entities, nothing to do + return; + } + Iterator<String> entityTypes = entitiesNode.fieldNames(); + while(entityTypes.hasNext()) { + String entityType = entityTypes.next(); + JsonNode entitiesOfType = entitiesNode.get(entityType); + if(entitiesOfType != null && entitiesOfType.isArray() && entitiesOfType.size() > 0) { + Iterator<JsonNode> it = entitiesOfType.elements(); + while(it.hasNext()) { + JsonNode entity = it.next(); + if(entity.isObject()) { + // process is guaranteed to return a FeatureMap given an object + FeatureMap features = (FeatureMap)TweetUtils.process(entity); + Object indices = features.get("indices"); + if(indices != null && indices instanceof List<?>) { + List<?> indicesList = (List<?>)indices; + if(indicesList.get(0) instanceof Number && indicesList.get(1) instanceof Number) { + // finally we know we have a valid entity + features.remove("indices"); + annotations.add(new PreAnnotation(startOffset + ((Number)indicesList.get(0)).longValue(), + startOffset + ((Number)indicesList.get(1)).longValue(), entityType, features)); + } + } + } + } + } + } + } } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 18:42:13 UTC (rev 18419) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 19:26:45 UTC (rev 18420) @@ -42,11 +42,19 @@ private boolean nested; private Iterator<JsonNode> nestedStatuses; private JsonNode nextNode; + private boolean handleEntities; public TweetStreamIterator(String json, List<String> contentKeys, List<String> featureKeys) throws JsonParseException, IOException { + this(json, contentKeys, featureKeys, 
true); + } + + + public TweetStreamIterator(String json, List<String> contentKeys, + List<String> featureKeys, boolean handleEntities) throws JsonParseException, IOException { this.contentKeys = contentKeys; this.featureKeys = featureKeys; + this.handleEntities = handleEntities; objectMapper = new ObjectMapper(); jsonParser = objectMapper.getFactory().createParser(json); init(); @@ -54,8 +62,15 @@ public TweetStreamIterator(InputStream input, List<String> contentKeys, List<String> featureKeys, boolean gzip) throws JsonParseException, IOException { + this(input, contentKeys, featureKeys, gzip, true); + } + + public TweetStreamIterator(InputStream input, List<String> contentKeys, + List<String> featureKeys, boolean gzip, boolean handleEntities) + throws JsonParseException, IOException { this.contentKeys = contentKeys; this.featureKeys = featureKeys; + this.handleEntities = handleEntities; InputStream workingInput; // Following borrowed from gcp JSONStreamingInputHandler @@ -103,7 +118,7 @@ Tweet result = null; try { if (this.nested && this.nestedStatuses.hasNext()) { - result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, featureKeys); + result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys, featureKeys, handleEntities); // Clear the nested flag once the last item in the statuses // value's list has been used, so that the next call to next() // will drop to the else if clause. 
@@ -122,7 +137,7 @@ else { this.nested = false; this.nestedStatuses = null; - result = Tweet.readTweet(nextNode, contentKeys, featureKeys); + result = Tweet.readTweet(nextNode, contentKeys, featureKeys, handleEntities); } } } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-10-30 18:42:13 UTC (rev 18419) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-10-30 19:26:45 UTC (rev 18420) @@ -42,6 +42,11 @@ "created_at", "user:name"}; public static final String[] DEFAULT_FEATURE_KEYS = {"user:screen_name", "user:location", "id_str", "source", "truncated", "retweeted_status:id"}; + + /** + * The JSON property representing entities (e.g. hashtags). + */ + public static final String ENTITIES_ATTRIBUTE = "entities"; public static List<Tweet> readTweets(String string) throws IOException { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs