Revision: 18422
          http://sourceforge.net/p/gate/code/18422
Author:   ian_roberts
Date:     2014-10-31 17:22:29 +0000 (Fri, 31 Oct 2014)
Log Message:
-----------
Added entities support to corpus populator

Also enhanced entity handling so that an entity type can specify the
annotation set its annotations are created in.  An entity type that does not
contain a colon goes into the "Original markups" annotation set as before, but
a type containing a colon is treated as "asName:annType", where asName is the
annotation set name (an empty asName denotes the default set):

{
  "text":"This is a test",
  "entities":{
    "Mention":[{"indices":[10,14],"inst":"urn:test:Orig-markups"}],
    ":Mention":[{"indices":[10,14],"inst":"urn:test:default-set"}],
    "Key:Mention":[{"indices":[10,14],"inst":"urn:test:Key-set"}]
  }
}

Modified Paths:
--------------
    gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java    2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java    2014-10-31 17:22:29 UTC (rev 18422)
@@ -110,11 +110,10 @@
       DocumentContent newContent = new 
DocumentContentImpl(concatenation.toString());
       doc.edit(0L, doc.getContent().size(), newContent);
 
-      AnnotationSet originalMarkups = 
doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
       // Create Original markups annotations for each tweet
       for (Tweet tweet : tweetStarts.keySet()) {
         for (PreAnnotation preAnn : tweet.getAnnotations()) {
-          preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet));
+          preAnn.toAnnotation(doc, tweetStarts.get(tweet));
         }
       }
     }

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java 2014-10-31 17:22:29 UTC (rev 18422)
@@ -11,12 +11,10 @@
  */
 package gate.corpora.twitter;
 
-import gate.AnnotationSet;
 import gate.Corpus;
 import gate.Document;
 import gate.DocumentContent;
 import gate.Factory;
-import gate.Gate;
 import gate.corpora.DocumentContentImpl;
 import gate.creole.ResourceInstantiationException;
 import gate.creole.metadata.AutoInstance;
@@ -54,7 +52,7 @@
   public static void populateCorpus(final Corpus corpus, URL inputUrl, 
PopulationConfig config) 
       throws ResourceInstantiationException {
     populateCorpus(corpus, inputUrl, config.getEncoding(), 
config.getContentKeys(), 
-        config.getFeatureKeys(), config.getTweetsPerDoc());
+        config.getFeatureKeys(), config.getTweetsPerDoc(), 
config.isProcessEntities());
   }
   
   /**
@@ -69,14 +67,19 @@
    */
   public static void populateCorpus(final Corpus corpus, URL inputUrl, String 
encoding, List<String> contentKeys,
       List<String> featureKeys, int tweetsPerDoc) throws 
ResourceInstantiationException {
-    
+    populateCorpus(corpus, inputUrl, encoding, contentKeys, featureKeys, 
tweetsPerDoc, true);
+  }
+
+  public static void populateCorpus(final Corpus corpus, URL inputUrl, String 
encoding, List<String> contentKeys,
+          List<String> featureKeys, int tweetsPerDoc, boolean processEntities) 
throws ResourceInstantiationException {
+
     InputStream input = null;
     try {
       input = inputUrl.openStream();
       
       // TODO Detect & handle gzipped input.
       // TODO handling of entities, once there's GUI to control it
-      TweetStreamIterator tweetSource = new TweetStreamIterator(input, 
contentKeys, featureKeys, false, false);
+      TweetStreamIterator tweetSource = new TweetStreamIterator(input, 
contentKeys, featureKeys, false, processEntities);
 
       int tweetCounter = 0;
       int tweetDocCounter = 0;
@@ -159,9 +162,8 @@
     else {    
       DocumentContent contentImpl = new 
DocumentContentImpl(content.toString());
       document.setContent(contentImpl);
-      AnnotationSet originalMarkups = 
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
       for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
-        preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
+        preAnn.toAnnotation(document, annotandaOffsets.get(preAnn));
       }
       corpus.add(document);
       
@@ -200,8 +202,7 @@
                 public void run() {
                   try {
                     for (URL fileUrl : fileUrls) {
-                      populateCorpus((Corpus) handle.getTarget(), fileUrl, 
dialog.getEncoding(), 
-                          dialog.getContentKeys(), dialog.getFeatureKeys(), 
dialog.getTweetsPerDoc());
+                      populateCorpus((Corpus) handle.getTarget(), fileUrl, 
dialog.getConfig());
                     } 
                   }
                   catch(ResourceInstantiationException e) {

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java   2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java   2014-10-31 17:22:29 UTC (rev 18422)
@@ -31,9 +31,12 @@
 
 
 public class PopulationConfig   {
+  public static final String RESOURCE_CODE = "twitter.population.config";
+
   private String encoding;
   private List<String> featureKeys, contentKeys;
   private int tweetsPerDoc;
+  private boolean processEntities = true;
   
   
   public boolean getOneDocCheckbox() {
@@ -48,6 +51,14 @@
     this.tweetsPerDoc = tpd;
   }
   
+  public boolean isProcessEntities() {
+    return processEntities;
+  }
+  
+  public void setProcessEntities(boolean entities) {
+    this.processEntities = entities;
+  }
+  
   public String getEncoding() {
     return this.encoding;
   }
@@ -91,8 +102,9 @@
    * @param cks
    * @param fks
    */
-  public PopulationConfig(int tpd, String encoding, List<String> cks, 
List<String> fks) {
+  public PopulationConfig(int tpd, boolean entities, String encoding, 
List<String> cks, List<String> fks) {
     this.tweetsPerDoc = tpd;
+    this.processEntities = entities;
     this.encoding = encoding;
     this.contentKeys = cks;
     this.featureKeys = fks;
@@ -102,6 +114,7 @@
   public void reload(File file) {
     PopulationConfig source = PopulationConfig.load(file);
     this.tweetsPerDoc = source.tweetsPerDoc;
+    this.processEntities = source.processEntities;
     this.encoding = source.encoding;
     this.contentKeys = source.contentKeys;
     this.featureKeys = source.featureKeys;
@@ -110,6 +123,7 @@
   public void reload(URL url) {
     PopulationConfig source = PopulationConfig.load(url);
     this.tweetsPerDoc = source.tweetsPerDoc;
+    this.processEntities = source.processEntities;
     this.encoding = source.encoding;
     this.contentKeys = source.contentKeys;
     this.featureKeys = source.featureKeys;
@@ -143,8 +157,6 @@
 
 
 class LoadConfigListener implements ActionListener {
-  public static final String RESOURCE_CODE = "twitter.population.config";
-
   PopulationDialogWrapper wrapper;
   
   public LoadConfigListener(PopulationDialogWrapper wrapper) {
@@ -153,8 +165,8 @@
 
   @Override
   public void actionPerformed(ActionEvent arg0) {
-    XJFileChooser chooser = MainFrame.getFileChooser();
-    chooser.setResource(RESOURCE_CODE);
+    XJFileChooser chooser = new XJFileChooser();
+    chooser.setResource(PopulationConfig.RESOURCE_CODE);
     chooser.setDialogTitle("Load XML configuration");
     chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY);
     int chosen = chooser.showOpenDialog(this.wrapper.dialog);
@@ -177,6 +189,7 @@
   @Override
   public void actionPerformed(ActionEvent event) {
     XJFileChooser chooser = new XJFileChooser();
+    chooser.setResource(PopulationConfig.RESOURCE_CODE);
     chooser.setDialogTitle("Save configuration as XML");
     chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY);
     int chosen = chooser.showSaveDialog(this.wrapper.dialog);

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java    2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java    2014-10-31 17:22:29 UTC (rev 18422)
@@ -16,6 +16,10 @@
 import gate.swing.XJFileChooser;
 import gate.util.ExtensionFileFilter;
 import gate.util.Strings;
+
+import java.awt.GridBagConstraints;
+import java.awt.GridBagLayout;
+import java.awt.Insets;
 import java.awt.Window;
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
@@ -42,7 +46,8 @@
   protected JDialog dialog;
   protected PopulationConfig config;
   private JTextField encodingField;
-  private JCheckBox checkbox;
+  private JCheckBox oneDocPerTweetCheckbox;
+  private JCheckBox entitiesCheckbox;
   private XJFileChooser chooser;
   private List<URL> fileUrls;
   private ListEditor featureKeysEditor, contentKeysEditor;
@@ -57,39 +62,80 @@
     dialog = new JDialog(MainFrame.getInstance(), "Populate from Twitter 
JSON", true);
     MainFrame.getGuiRoots().add(dialog);
     dialog.getContentPane().setLayout(new BoxLayout(dialog.getContentPane(), 
BoxLayout.Y_AXIS));
-    dialog.add(Box.createVerticalStrut(3));
     
-    Box encodingBox = Box.createHorizontalBox();
-    JLabel encodingLabel = new JLabel("Encoding:");
+    GridBagLayout formLayout = new GridBagLayout();
+    JPanel formPanel = new JPanel(formLayout);
+    GridBagConstraints labelConstraints = new GridBagConstraints();
+    labelConstraints.gridx = 0;
+    labelConstraints.insets = new Insets(3, 3, 0, 3);
+    labelConstraints.anchor = GridBagConstraints.LINE_END;
+    
+    GridBagConstraints componentConstraints = new GridBagConstraints();
+    componentConstraints.gridx = 1;
+    componentConstraints.gridwidth = GridBagConstraints.REMAINDER;
+    componentConstraints.insets = new Insets(3, 3, 0, 3);
+    componentConstraints.anchor = GridBagConstraints.LINE_START;
+    componentConstraints.weightx = 1.0;
+    componentConstraints.fill = GridBagConstraints.HORIZONTAL;
+    
+    
+    JLabel encodingLabel = new JLabel("Encoding");
     encodingField = new JTextField(config.getEncoding());
-    encodingBox.add(encodingLabel);
-    encodingBox.add(encodingField);
-    dialog.add(encodingBox);
-    dialog.add(Box.createVerticalStrut(4));
+    formLayout.setConstraints(encodingLabel, labelConstraints);
+    formPanel.add(encodingLabel);
+    formLayout.setConstraints(encodingField, componentConstraints);
+    formPanel.add(encodingField);
 
-    // Default is now 1 tweet per document; changed in PopulationConfig's
-    // default constructor.
-    Box checkboxBox = Box.createHorizontalBox();
-    checkboxBox.setToolTipText("If unchecked, one document per file");
-    JLabel checkboxLabel = new JLabel("One document per tweet");
-    checkbox = new JCheckBox();
-    checkbox.setSelected(config.getOneDocCheckbox());
-    checkboxBox.add(checkboxLabel);
-    checkboxBox.add(Box.createHorizontalGlue());
-    checkboxBox.add(checkbox);
-    dialog.add(checkboxBox);
-    dialog.add(Box.createVerticalStrut(4));
+    // don't need horizontal fill for checkboxes
+    componentConstraints.fill = GridBagConstraints.NONE;
     
-    contentKeysEditor = new ListEditor("Content keys: ", 
config.getContentKeys());
+    JLabel odptCheckboxLabel = new JLabel("One document per tweet");
+    odptCheckboxLabel.setToolTipText("If unchecked, one document per file");
+    oneDocPerTweetCheckbox = new JCheckBox();
+    oneDocPerTweetCheckbox.setToolTipText("If unchecked, one document per 
file");
+    oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox());
+    formLayout.setConstraints(odptCheckboxLabel, labelConstraints);
+    formPanel.add(odptCheckboxLabel);
+    
+    formLayout.setConstraints(oneDocPerTweetCheckbox, componentConstraints);
+    formPanel.add(oneDocPerTweetCheckbox);
+    
+    JLabel entitiesCheckboxLabel = new JLabel("Annotations for \"entities\"");
+    entitiesCheckboxLabel.setToolTipText("Create annotations based on the 
\"entities\" property of the JSON");
+    entitiesCheckbox = new JCheckBox();
+    entitiesCheckbox.setToolTipText("Create annotations based on the 
\"entities\" property of the JSON");
+    entitiesCheckbox.setSelected(config.isProcessEntities());
+    formLayout.setConstraints(entitiesCheckboxLabel, labelConstraints);
+    formPanel.add(entitiesCheckboxLabel);
+    
+    formLayout.setConstraints(entitiesCheckbox, componentConstraints);
+    formPanel.add(entitiesCheckbox);
+
+    // restore horizontal fill
+    componentConstraints.fill = GridBagConstraints.HORIZONTAL;
+
+    JLabel contentKeysLabel = new JLabel("Content keys");
+    contentKeysLabel.setToolTipText("JSON key paths to be turned into 
DocumentContent");    
+    contentKeysEditor = new ListEditor(config.getContentKeys());
     contentKeysEditor.setToolTipText("JSON key paths to be turned into 
DocumentContent");
-    dialog.add(contentKeysEditor);
-    dialog.add(Box.createVerticalStrut(4));
+    formLayout.setConstraints(contentKeysLabel, labelConstraints);
+    formPanel.add(contentKeysLabel);
+    formLayout.setConstraints(contentKeysEditor, componentConstraints);
+    formPanel.add(contentKeysEditor);
     
-    featureKeysEditor = new ListEditor("Feature keys: ", 
config.getFeatureKeys());
+    
+    JLabel featureKeysLabel = new JLabel("Feature keys");
+    featureKeysLabel.setToolTipText("JSON key paths to be turned into Tweet 
annotation features");    
+    featureKeysEditor = new ListEditor(config.getFeatureKeys());
     featureKeysEditor.setToolTipText("JSON key paths to be turned into Tweet 
annotation features");
-    dialog.add(featureKeysEditor);
-    dialog.add(Box.createVerticalStrut(6));
+    formLayout.setConstraints(featureKeysLabel, labelConstraints);
+    formPanel.add(featureKeysLabel);
+    formLayout.setConstraints(featureKeysEditor, componentConstraints);
+    formPanel.add(featureKeysEditor);
     
+    dialog.add(formPanel);
+    dialog.add(Box.createVerticalStrut(4));
+
     Box configPersistenceBox = Box.createHorizontalBox();
     configPersistenceBox.add(Box.createHorizontalGlue());
     JButton loadConfigButton = new JButton("Load configuration");
@@ -129,34 +175,22 @@
   }
   
   
-  public String getEncoding() {
-    return this.config.getEncoding();
-  }
-  
   public List<URL> getFileUrls() throws MalformedURLException {
     return this.fileUrls;
   }
 
-  public int getTweetsPerDoc() {
-    return this.config.getTweetsPerDoc();
+  public PopulationConfig getConfig() {
+    return this.config;
   }
   
-  public List<String> getContentKeys() {
-    return this.config.getContentKeys();
-  }
-  
-  public List<String> getFeatureKeys() {
-    return this.config.getFeatureKeys();
-  }
-  
-  
   protected void setNewConfig(PopulationConfig newConfig) {
     this.config = newConfig;
     this.updateGui();
   }
   
   protected void updateConfig() {
-    this.config.setTweetsPerDoc(this.checkbox.isSelected() ? 1 : 0);
+    this.config.setTweetsPerDoc(this.oneDocPerTweetCheckbox.isSelected() ? 1 : 
0);
+    this.config.setProcessEntities(this.entitiesCheckbox.isSelected());
     this.config.setContentKeys(this.contentKeysEditor.getValues());
     this.config.setFeatureKeys(this.featureKeysEditor.getValues());
     this.config.setEncoding(this.encodingField.getText());
@@ -167,7 +201,8 @@
     this.encodingField.setText(config.getEncoding());
     this.contentKeysEditor.setValues(config.getContentKeys());
     this.featureKeysEditor.setValues(config.getFeatureKeys());
-    this.checkbox.setSelected(config.getOneDocCheckbox());
+    this.oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox());
+    this.entitiesCheckbox.setSelected(config.isProcessEntities());
   }
   
   
@@ -224,19 +259,16 @@
   private JButton listButton;
   private ListEditorDialog listEditor;
   private List<String> values;
-  private JLabel label;
   private JTextField field;
   
   @Override
   public void setToolTipText(String text) {
     super.setToolTipText(text);
-    label.setToolTipText(text);
     field.setToolTipText(text);
   }
   
   
-  public ListEditor(String labelString, List<String> initialValues) {
-    label = new JLabel(labelString);
+  public ListEditor(List<String> initialValues) {
     field = new JTextField();
     values = initialValues;
     field.setText(Strings.toString(initialValues));
@@ -260,7 +292,6 @@
     });
     
     this.setLayout(new BoxLayout(this, BoxLayout.X_AXIS));
-    this.add(label);
     this.add(field);
     this.add(listButton);
   }

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java      2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java      2014-10-31 17:22:29 UTC (rev 18422)
@@ -15,8 +15,10 @@
 
 import gate.Annotation;
 import gate.AnnotationSet;
+import gate.Document;
 import gate.Factory;
 import gate.FeatureMap;
+import gate.GateConstants;
 import gate.util.InvalidOffsetException;
 
 
@@ -28,11 +30,16 @@
  */
 public class PreAnnotation  {
   private FeatureMap features;
+  private String asName;
   private String type;
   private long start, end;
   
   
   public PreAnnotation(long start, long end, String type, FeatureMap features) 
{
+    this(start, end, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME, type, 
features);
+  }
+  
+  public PreAnnotation(long start, long end, String asName, String type, 
FeatureMap features) {
     if (features == null) {
       this.features = Factory.newFeatureMap();
     }
@@ -40,6 +47,7 @@
       this.features = features;
     }
     
+    this.asName = asName;
     this.type = type;
     this.setStart(start);
     this.setEnd(end);
@@ -53,6 +61,9 @@
     this.setEnd(end);
   }
   
+  public Annotation toAnnotation(Document doc, long startOffset) throws 
InvalidOffsetException {
+    return toAnnotation(doc.getAnnotations(asName), startOffset);
+  }
   
   public Annotation toAnnotation(AnnotationSet outputAS, long startOffset) 
throws InvalidOffsetException {
     long outputStart = this.start + startOffset;
@@ -77,6 +88,10 @@
   public void setFeatures(FeatureMap features) {
     this.features = features;
   }
+  
+  public String getASName() {
+    return asName;
+  }
 
   public String getType() {
     return this.type;

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java      2014-10-31 17:22:29 UTC (rev 18422)
@@ -154,6 +154,8 @@
       String entityType = entityTypes.next();
       JsonNode entitiesOfType = entitiesNode.get(entityType);
       if(entitiesOfType != null && entitiesOfType.isArray() && 
entitiesOfType.size() > 0) {
+        // if the entityType is X:Y then assume X is the AS name and Y is the 
actual type
+        String[] setAndType = entityType.split(":", 2);
         Iterator<JsonNode> it = entitiesOfType.elements();
         while(it.hasNext()) {
           JsonNode entity = it.next();
@@ -166,8 +168,14 @@
               if(indicesList.get(0) instanceof Number && indicesList.get(1) 
instanceof Number) {
                 // finally we know we have a valid entity
                 features.remove("indices");
-                annotations.add(new PreAnnotation(startOffset + 
((Number)indicesList.get(0)).longValue(),
-                        startOffset + 
((Number)indicesList.get(1)).longValue(), entityType, features));
+                long annStart = startOffset + 
((Number)indicesList.get(0)).longValue();
+                long annEnd = startOffset + 
((Number)indicesList.get(1)).longValue();
+                if(setAndType.length == 2) {
+                  // explicit annotation set name
+                  annotations.add(new PreAnnotation(annStart, annEnd, 
setAndType[0], setAndType[1], features));
+                } else {
+                  annotations.add(new PreAnnotation(annStart, annEnd, 
entityType, features));
+                }
               }
             }
           }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to