Revision: 18422 http://sourceforge.net/p/gate/code/18422 Author: ian_roberts Date: 2014-10-31 17:22:29 +0000 (Fri, 31 Oct 2014) Log Message: ----------- Added entities support to corpus populator.
Also enhanced handling to allow for the specification of annotation set names in entity types. An entity type that does not contain a colon will go into the Original markups annotation set as normal, but a type containing a colon will be treated as "asName:annType" (with an empty asName denoting the default set): { "text":"This is a test", "entities":{ "Mention":[{"indices":[10,14],"inst":"urn:test:Orig-markups"}], ":Mention":[{"indices":[10,14],"inst":"urn:test:default-set"}], "Key:Mention":[{"indices":[10,14],"inst":"urn:test:Key-set"}] } } Modified Paths: -------------- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -110,11 +110,10 @@ DocumentContent newContent = new DocumentContentImpl(concatenation.toString()); doc.edit(0L, doc.getContent().size(), newContent); - AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); // Create Original markups annotations for each tweet for (Tweet tweet : tweetStarts.keySet()) { for (PreAnnotation preAnn : tweet.getAnnotations()) { - preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet)); + preAnn.toAnnotation(doc, tweetStarts.get(tweet)); } } } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 
=================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -11,12 +11,10 @@ */ package gate.corpora.twitter; -import gate.AnnotationSet; import gate.Corpus; import gate.Document; import gate.DocumentContent; import gate.Factory; -import gate.Gate; import gate.corpora.DocumentContentImpl; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.AutoInstance; @@ -54,7 +52,7 @@ public static void populateCorpus(final Corpus corpus, URL inputUrl, PopulationConfig config) throws ResourceInstantiationException { populateCorpus(corpus, inputUrl, config.getEncoding(), config.getContentKeys(), - config.getFeatureKeys(), config.getTweetsPerDoc()); + config.getFeatureKeys(), config.getTweetsPerDoc(), config.isProcessEntities()); } /** @@ -69,14 +67,19 @@ */ public static void populateCorpus(final Corpus corpus, URL inputUrl, String encoding, List<String> contentKeys, List<String> featureKeys, int tweetsPerDoc) throws ResourceInstantiationException { - + populateCorpus(corpus, inputUrl, encoding, contentKeys, featureKeys, tweetsPerDoc, true); + } + + public static void populateCorpus(final Corpus corpus, URL inputUrl, String encoding, List<String> contentKeys, + List<String> featureKeys, int tweetsPerDoc, boolean processEntities) throws ResourceInstantiationException { + InputStream input = null; try { input = inputUrl.openStream(); // TODO Detect & handle gzipped input. 
// TODO handling of entities, once there's GUI to control it - TweetStreamIterator tweetSource = new TweetStreamIterator(input, contentKeys, featureKeys, false, false); + TweetStreamIterator tweetSource = new TweetStreamIterator(input, contentKeys, featureKeys, false, processEntities); int tweetCounter = 0; int tweetDocCounter = 0; @@ -159,9 +162,8 @@ else { DocumentContent contentImpl = new DocumentContentImpl(content.toString()); document.setContent(contentImpl); - AnnotationSet originalMarkups = document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME); for (PreAnnotation preAnn : annotandaOffsets.keySet()) { - preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn)); + preAnn.toAnnotation(document, annotandaOffsets.get(preAnn)); } corpus.add(document); @@ -200,8 +202,7 @@ public void run() { try { for (URL fileUrl : fileUrls) { - populateCorpus((Corpus) handle.getTarget(), fileUrl, dialog.getEncoding(), - dialog.getContentKeys(), dialog.getFeatureKeys(), dialog.getTweetsPerDoc()); + populateCorpus((Corpus) handle.getTarget(), fileUrl, dialog.getConfig()); } } catch(ResourceInstantiationException e) { Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -31,9 +31,12 @@ public class PopulationConfig { + public static final String RESOURCE_CODE = "twitter.population.config"; + private String encoding; private List<String> featureKeys, contentKeys; private int tweetsPerDoc; + private boolean processEntities = true; public boolean getOneDocCheckbox() { @@ -48,6 +51,14 @@ this.tweetsPerDoc = tpd; } + public boolean isProcessEntities() { + return processEntities; + } + + public void setProcessEntities(boolean entities) { + 
this.processEntities = entities; + } + public String getEncoding() { return this.encoding; } @@ -91,8 +102,9 @@ * @param cks * @param fks */ - public PopulationConfig(int tpd, String encoding, List<String> cks, List<String> fks) { + public PopulationConfig(int tpd, boolean entities, String encoding, List<String> cks, List<String> fks) { this.tweetsPerDoc = tpd; + this.processEntities = entities; this.encoding = encoding; this.contentKeys = cks; this.featureKeys = fks; @@ -102,6 +114,7 @@ public void reload(File file) { PopulationConfig source = PopulationConfig.load(file); this.tweetsPerDoc = source.tweetsPerDoc; + this.processEntities = source.processEntities; this.encoding = source.encoding; this.contentKeys = source.contentKeys; this.featureKeys = source.featureKeys; @@ -110,6 +123,7 @@ public void reload(URL url) { PopulationConfig source = PopulationConfig.load(url); this.tweetsPerDoc = source.tweetsPerDoc; + this.processEntities = source.processEntities; this.encoding = source.encoding; this.contentKeys = source.contentKeys; this.featureKeys = source.featureKeys; @@ -143,8 +157,6 @@ class LoadConfigListener implements ActionListener { - public static final String RESOURCE_CODE = "twitter.population.config"; - PopulationDialogWrapper wrapper; public LoadConfigListener(PopulationDialogWrapper wrapper) { @@ -153,8 +165,8 @@ @Override public void actionPerformed(ActionEvent arg0) { - XJFileChooser chooser = MainFrame.getFileChooser(); - chooser.setResource(RESOURCE_CODE); + XJFileChooser chooser = new XJFileChooser(); + chooser.setResource(PopulationConfig.RESOURCE_CODE); chooser.setDialogTitle("Load XML configuration"); chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY); int chosen = chooser.showOpenDialog(this.wrapper.dialog); @@ -177,6 +189,7 @@ @Override public void actionPerformed(ActionEvent event) { XJFileChooser chooser = new XJFileChooser(); + chooser.setResource(PopulationConfig.RESOURCE_CODE); chooser.setDialogTitle("Save configuration as XML"); 
chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY); int chosen = chooser.showSaveDialog(this.wrapper.dialog); Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -16,6 +16,10 @@ import gate.swing.XJFileChooser; import gate.util.ExtensionFileFilter; import gate.util.Strings; + +import java.awt.GridBagConstraints; +import java.awt.GridBagLayout; +import java.awt.Insets; import java.awt.Window; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; @@ -42,7 +46,8 @@ protected JDialog dialog; protected PopulationConfig config; private JTextField encodingField; - private JCheckBox checkbox; + private JCheckBox oneDocPerTweetCheckbox; + private JCheckBox entitiesCheckbox; private XJFileChooser chooser; private List<URL> fileUrls; private ListEditor featureKeysEditor, contentKeysEditor; @@ -57,39 +62,80 @@ dialog = new JDialog(MainFrame.getInstance(), "Populate from Twitter JSON", true); MainFrame.getGuiRoots().add(dialog); dialog.getContentPane().setLayout(new BoxLayout(dialog.getContentPane(), BoxLayout.Y_AXIS)); - dialog.add(Box.createVerticalStrut(3)); - Box encodingBox = Box.createHorizontalBox(); - JLabel encodingLabel = new JLabel("Encoding:"); + GridBagLayout formLayout = new GridBagLayout(); + JPanel formPanel = new JPanel(formLayout); + GridBagConstraints labelConstraints = new GridBagConstraints(); + labelConstraints.gridx = 0; + labelConstraints.insets = new Insets(3, 3, 0, 3); + labelConstraints.anchor = GridBagConstraints.LINE_END; + + GridBagConstraints componentConstraints = new GridBagConstraints(); + componentConstraints.gridx = 1; + componentConstraints.gridwidth = 
GridBagConstraints.REMAINDER; + componentConstraints.insets = new Insets(3, 3, 0, 3); + componentConstraints.anchor = GridBagConstraints.LINE_START; + componentConstraints.weightx = 1.0; + componentConstraints.fill = GridBagConstraints.HORIZONTAL; + + + JLabel encodingLabel = new JLabel("Encoding"); encodingField = new JTextField(config.getEncoding()); - encodingBox.add(encodingLabel); - encodingBox.add(encodingField); - dialog.add(encodingBox); - dialog.add(Box.createVerticalStrut(4)); + formLayout.setConstraints(encodingLabel, labelConstraints); + formPanel.add(encodingLabel); + formLayout.setConstraints(encodingField, componentConstraints); + formPanel.add(encodingField); - // Default is now 1 tweet per document; changed in PopulationConfig's - // default constructor. - Box checkboxBox = Box.createHorizontalBox(); - checkboxBox.setToolTipText("If unchecked, one document per file"); - JLabel checkboxLabel = new JLabel("One document per tweet"); - checkbox = new JCheckBox(); - checkbox.setSelected(config.getOneDocCheckbox()); - checkboxBox.add(checkboxLabel); - checkboxBox.add(Box.createHorizontalGlue()); - checkboxBox.add(checkbox); - dialog.add(checkboxBox); - dialog.add(Box.createVerticalStrut(4)); + // don't need horizontal fill for checkboxes + componentConstraints.fill = GridBagConstraints.NONE; - contentKeysEditor = new ListEditor("Content keys: ", config.getContentKeys()); + JLabel odptCheckboxLabel = new JLabel("One document per tweet"); + odptCheckboxLabel.setToolTipText("If unchecked, one document per file"); + oneDocPerTweetCheckbox = new JCheckBox(); + oneDocPerTweetCheckbox.setToolTipText("If unchecked, one document per file"); + oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox()); + formLayout.setConstraints(odptCheckboxLabel, labelConstraints); + formPanel.add(odptCheckboxLabel); + + formLayout.setConstraints(oneDocPerTweetCheckbox, componentConstraints); + formPanel.add(oneDocPerTweetCheckbox); + + JLabel entitiesCheckboxLabel = new 
JLabel("Annotations for \"entities\""); + entitiesCheckboxLabel.setToolTipText("Create annotations based on the \"entities\" property of the JSON"); + entitiesCheckbox = new JCheckBox(); + entitiesCheckbox.setToolTipText("Create annotations based on the \"entities\" property of the JSON"); + entitiesCheckbox.setSelected(config.isProcessEntities()); + formLayout.setConstraints(entitiesCheckboxLabel, labelConstraints); + formPanel.add(entitiesCheckboxLabel); + + formLayout.setConstraints(entitiesCheckbox, componentConstraints); + formPanel.add(entitiesCheckbox); + + // restore horizontal fill + componentConstraints.fill = GridBagConstraints.HORIZONTAL; + + JLabel contentKeysLabel = new JLabel("Content keys"); + contentKeysLabel.setToolTipText("JSON key paths to be turned into DocumentContent"); + contentKeysEditor = new ListEditor(config.getContentKeys()); contentKeysEditor.setToolTipText("JSON key paths to be turned into DocumentContent"); - dialog.add(contentKeysEditor); - dialog.add(Box.createVerticalStrut(4)); + formLayout.setConstraints(contentKeysLabel, labelConstraints); + formPanel.add(contentKeysLabel); + formLayout.setConstraints(contentKeysEditor, componentConstraints); + formPanel.add(contentKeysEditor); - featureKeysEditor = new ListEditor("Feature keys: ", config.getFeatureKeys()); + + JLabel featureKeysLabel = new JLabel("Feature keys"); + featureKeysLabel.setToolTipText("JSON key paths to be turned into Tweet annotation features"); + featureKeysEditor = new ListEditor(config.getFeatureKeys()); featureKeysEditor.setToolTipText("JSON key paths to be turned into Tweet annotation features"); - dialog.add(featureKeysEditor); - dialog.add(Box.createVerticalStrut(6)); + formLayout.setConstraints(featureKeysLabel, labelConstraints); + formPanel.add(featureKeysLabel); + formLayout.setConstraints(featureKeysEditor, componentConstraints); + formPanel.add(featureKeysEditor); + dialog.add(formPanel); + dialog.add(Box.createVerticalStrut(4)); + Box 
configPersistenceBox = Box.createHorizontalBox(); configPersistenceBox.add(Box.createHorizontalGlue()); JButton loadConfigButton = new JButton("Load configuration"); @@ -129,34 +175,22 @@ } - public String getEncoding() { - return this.config.getEncoding(); - } - public List<URL> getFileUrls() throws MalformedURLException { return this.fileUrls; } - public int getTweetsPerDoc() { - return this.config.getTweetsPerDoc(); + public PopulationConfig getConfig() { + return this.config; } - public List<String> getContentKeys() { - return this.config.getContentKeys(); - } - - public List<String> getFeatureKeys() { - return this.config.getFeatureKeys(); - } - - protected void setNewConfig(PopulationConfig newConfig) { this.config = newConfig; this.updateGui(); } protected void updateConfig() { - this.config.setTweetsPerDoc(this.checkbox.isSelected() ? 1 : 0); + this.config.setTweetsPerDoc(this.oneDocPerTweetCheckbox.isSelected() ? 1 : 0); + this.config.setProcessEntities(this.entitiesCheckbox.isSelected()); this.config.setContentKeys(this.contentKeysEditor.getValues()); this.config.setFeatureKeys(this.featureKeysEditor.getValues()); this.config.setEncoding(this.encodingField.getText()); @@ -167,7 +201,8 @@ this.encodingField.setText(config.getEncoding()); this.contentKeysEditor.setValues(config.getContentKeys()); this.featureKeysEditor.setValues(config.getFeatureKeys()); - this.checkbox.setSelected(config.getOneDocCheckbox()); + this.oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox()); + this.entitiesCheckbox.setSelected(config.isProcessEntities()); } @@ -224,19 +259,16 @@ private JButton listButton; private ListEditorDialog listEditor; private List<String> values; - private JLabel label; private JTextField field; @Override public void setToolTipText(String text) { super.setToolTipText(text); - label.setToolTipText(text); field.setToolTipText(text); } - public ListEditor(String labelString, List<String> initialValues) { - label = new JLabel(labelString); + public 
ListEditor(List<String> initialValues) { field = new JTextField(); values = initialValues; field.setText(Strings.toString(initialValues)); @@ -260,7 +292,6 @@ }); this.setLayout(new BoxLayout(this, BoxLayout.X_AXIS)); - this.add(label); this.add(field); this.add(listButton); } Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -15,8 +15,10 @@ import gate.Annotation; import gate.AnnotationSet; +import gate.Document; import gate.Factory; import gate.FeatureMap; +import gate.GateConstants; import gate.util.InvalidOffsetException; @@ -28,11 +30,16 @@ */ public class PreAnnotation { private FeatureMap features; + private String asName; private String type; private long start, end; public PreAnnotation(long start, long end, String type, FeatureMap features) { + this(start, end, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME, type, features); + } + + public PreAnnotation(long start, long end, String asName, String type, FeatureMap features) { if (features == null) { this.features = Factory.newFeatureMap(); } @@ -40,6 +47,7 @@ this.features = features; } + this.asName = asName; this.type = type; this.setStart(start); this.setEnd(end); @@ -53,6 +61,9 @@ this.setEnd(end); } + public Annotation toAnnotation(Document doc, long startOffset) throws InvalidOffsetException { + return toAnnotation(doc.getAnnotations(asName), startOffset); + } public Annotation toAnnotation(AnnotationSet outputAS, long startOffset) throws InvalidOffsetException { long outputStart = this.start + startOffset; @@ -77,6 +88,10 @@ public void setFeatures(FeatureMap features) { this.features = features; } + + public String getASName() { + return asName; + } public String getType() { return 
this.type; Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java =================================================================== --- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-10-31 02:20:23 UTC (rev 18421) +++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java 2014-10-31 17:22:29 UTC (rev 18422) @@ -154,6 +154,8 @@ String entityType = entityTypes.next(); JsonNode entitiesOfType = entitiesNode.get(entityType); if(entitiesOfType != null && entitiesOfType.isArray() && entitiesOfType.size() > 0) { + // if the entityType is X:Y then assume X is the AS name and Y is the actual type + String[] setAndType = entityType.split(":", 2); Iterator<JsonNode> it = entitiesOfType.elements(); while(it.hasNext()) { JsonNode entity = it.next(); @@ -166,8 +168,14 @@ if(indicesList.get(0) instanceof Number && indicesList.get(1) instanceof Number) { // finally we know we have a valid entity features.remove("indices"); - annotations.add(new PreAnnotation(startOffset + ((Number)indicesList.get(0)).longValue(), - startOffset + ((Number)indicesList.get(1)).longValue(), entityType, features)); + long annStart = startOffset + ((Number)indicesList.get(0)).longValue(); + long annEnd = startOffset + ((Number)indicesList.get(1)).longValue(); + if(setAndType.length == 2) { + // explicit annotation set name + annotations.add(new PreAnnotation(annStart, annEnd, setAndType[0], setAndType[1], features)); + } else { + annotations.add(new PreAnnotation(annStart, annEnd, entityType, features)); + } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs