Revision: 18395
          http://sourceforge.net/p/gate/code/18395
Author:   ian_roberts
Date:     2014-10-20 17:11:31 +0000 (Mon, 20 Oct 2014)
Log Message:
-----------
Added "skipExisting" parameter to skip annotations that already have a unit ID 
feature.  If a job builder fails part way through (e.g. with an HTTP error from 
the API) then this lets you re-run the build without creating duplicate 
annotations.

Modified Paths:
--------------
    
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java
    
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java

Modified: 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java
===================================================================
--- 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java
   2014-10-19 01:19:49 UTC (rev 18394)
+++ 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java
   2014-10-20 17:11:31 UTC (rev 18395)
@@ -36,29 +36,34 @@
 import gate.crowdsource.rest.CrowdFlowerClient;
 import gate.gui.ActionsPublisher;
 
+import static gate.crowdsource.CrowdFlowerConstants.UNIT_ID_FEATURE_NAME;
+
 import org.apache.log4j.Logger;
 
-@CreoleResource(name = "Entity Classification Job Builder",
-   comment = "Build a CrowdFlower job asking users to select the right label 
for entities",
-   helpURL = "http://gate.ac.uk/userguide/sec:crowd:classification")
-public class EntityClassificationJobBuilder extends AbstractLanguageAnalyser 
implements ActionsPublisher {
+@CreoleResource(name = "Entity Classification Job Builder", comment = "Build a 
CrowdFlower job asking users to select the right label for entities", helpURL = 
"http://gate.ac.uk/userguide/sec:crowd:classification")
+public class EntityClassificationJobBuilder extends AbstractLanguageAnalyser
+                                                                            
implements
+                                                                            
ActionsPublisher {
 
-  private static final Logger log = 
Logger.getLogger(EntityClassificationJobBuilder.class);
+  private static final Logger log = Logger
+          .getLogger(EntityClassificationJobBuilder.class);
 
   private static final long serialVersionUID = -1584716901194104888L;
 
   private String apiKey;
-  
+
   private Long jobId;
-  
+
   private String contextAnnotationType;
-  
+
   private String contextASName;
-  
+
   private String entityAnnotationType;
-  
+
   private String entityASName;
-  
+
+  private Boolean skipExisting;
+
   protected CrowdFlowerClient crowdFlowerClient;
 
   public String getApiKey() {
@@ -122,6 +127,18 @@
     this.entityASName = entityASName;
   }
 
+  public Boolean getSkipExisting() {
+    return skipExisting;
+  }
+
+  @Optional
+  @RunTime
+  @CreoleParameter(defaultValue = "true", comment = "Should we skip snippets 
that already "
+          + "have a feature indicating that they have been processed before?")
+  public void setSkipExisting(Boolean skipExisting) {
+    this.skipExisting = skipExisting;
+  }
+
   @Override
   public Resource init() throws ResourceInstantiationException {
     if(apiKey == null || "".equals(apiKey)) {
@@ -139,30 +156,45 @@
       if(jobId == null || jobId.longValue() <= 0) {
         throw new ExecutionException("Job ID must be provided");
       }
-      
+
       AnnotationSet entityAS = getDocument().getAnnotations(entityASName);
-      AnnotationSet contextAnnotations = 
getDocument().getAnnotations(contextASName)
-              .get(contextAnnotationType);
-      
-      List<Annotation> allEntities = 
Utils.inDocumentOrder(entityAS.get(entityAnnotationType));
-      fireStatusChanged("Creating CrowdFlower units for " + allEntities.size() 
+ " "
-              + entityAnnotationType + " annotations for classification task 
");
+      AnnotationSet contextAnnotations =
+              getDocument().getAnnotations(contextASName).get(
+                      contextAnnotationType);
 
+      List<Annotation> allEntities =
+              Utils.inDocumentOrder(entityAS.get(entityAnnotationType));
+      fireStatusChanged("Creating CrowdFlower units for " + allEntities.size()
+              + " " + entityAnnotationType
+              + " annotations for classification task ");
+
       int entityIdx = 0;
       for(Annotation entity : allEntities) {
         fireProgressChanged((100 * entityIdx++) / allEntities.size());
         if(isInterrupted()) throw new ExecutionInterruptedException();
-        AnnotationSet thisEntityContext = 
Utils.getCoveringAnnotations(contextAnnotations, entity);
+        // skip existing units, if so configured
+        if(skipExisting != null && skipExisting.booleanValue()
+                && entity.getFeatures().containsKey(UNIT_ID_FEATURE_NAME)) {
+          continue;
+        }
+
+        AnnotationSet thisEntityContext =
+                Utils.getCoveringAnnotations(contextAnnotations, entity);
         if(thisEntityContext.isEmpty()) {
-          log.warn(entityAnnotationType + " with ID " + entity.getId() +
-              " at offsets (" + Utils.start(entity) + ":" + Utils.end(entity) +
-              ") in document " + getDocument().getName() + 
-              " has no surrounding " + contextAnnotationType + " - ignored");
+          log.warn(entityAnnotationType + " with ID " + entity.getId()
+                  + " at offsets (" + Utils.start(entity) + ":"
+                  + Utils.end(entity) + ") in document "
+                  + getDocument().getName() + " has no surrounding "
+                  + contextAnnotationType + " - ignored");
         } else {
-          // get the "closest" context, i.e. the shortest annotation in the 
covering set.
+          // get the "closest" context, i.e. the shortest annotation in
+          // the covering set.
           // usually we'd expect this set to contain just one annotation
-          Annotation context = Collections.min(thisEntityContext, 
ANNOTATION_LENGTH_COMPARATOR);
-          crowdFlowerClient.createClassificationUnit(jobId, getDocument(), 
entityASName, context, entity);
+          Annotation context =
+                  Collections.min(thisEntityContext,
+                          ANNOTATION_LENGTH_COMPARATOR);
+          crowdFlowerClient.createClassificationUnit(jobId, getDocument(),
+                  entityASName, context, entity);
         }
       }
       fireProcessFinished();
@@ -171,15 +203,16 @@
       interrupted = false;
     }
   }
-  
-  private static final Comparator<Annotation> ANNOTATION_LENGTH_COMPARATOR = 
new Comparator<Annotation>() {
-    public int compare(Annotation a1, Annotation a2) {
-      return Utils.length(a1) - Utils.length(a2);
-    }
-  };
-  
+
+  private static final Comparator<Annotation> ANNOTATION_LENGTH_COMPARATOR =
+          new Comparator<Annotation>() {
+            public int compare(Annotation a1, Annotation a2) {
+              return Utils.length(a1) - Utils.length(a2);
+            }
+          };
+
   private List<Action> actions = null;
-  
+
   public List<Action> getActions() {
     if(actions == null) {
       actions = new ArrayList<Action>();

Modified: 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java
===================================================================
--- 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java
   2014-10-19 01:19:49 UTC (rev 18394)
+++ 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java
   2014-10-20 17:11:31 UTC (rev 18395)
@@ -33,10 +33,8 @@
 
 import javax.swing.Action;
 
-@CreoleResource(name = "Entity Annotation Job Builder",
-    comment = "Build a CrowdFlower job asking users to annotate entities "
-       + "within a snippet of text",
-    helpURL = "http://gate.ac.uk/userguide/sec:crowd:annotation")
+@CreoleResource(name = "Entity Annotation Job Builder", comment = "Build a 
CrowdFlower job asking users to annotate entities "
+        + "within a snippet of text", helpURL = 
"http://gate.ac.uk/userguide/sec:crowd:annotation")
 public class EntityAnnotationJobBuilder extends AbstractLanguageAnalyser
                                                                         
implements
                                                                         
ActionsPublisher {
@@ -67,6 +65,8 @@
 
   private String entityASName;
 
+  private Boolean skipExisting;
+
   protected CrowdFlowerClient crowdFlowerClient;
 
   public String getApiKey() {
@@ -203,6 +203,18 @@
     this.goldReasonFeatureName = goldReasonFeatureName;
   }
 
+  public Boolean getSkipExisting() {
+    return skipExisting;
+  }
+
+  @Optional
+  @RunTime
+  @CreoleParameter(defaultValue = "true", comment = "Should we skip snippets 
that already "
+          + "have a feature indicating that they have been processed before?")
+  public void setSkipExisting(Boolean skipExisting) {
+    this.skipExisting = skipExisting;
+  }
+
   @Override
   public Resource init() throws ResourceInstantiationException {
     if(apiKey == null || "".equals(apiKey)) {
@@ -240,6 +252,13 @@
       for(Annotation snippet : allSnippets) {
         fireProgressChanged((100 * snippetIdx++) / allSnippets.size());
         if(isInterrupted()) throw new ExecutionInterruptedException();
+        // skip existing units, if so configured
+        if(skipExisting != null
+                && skipExisting.booleanValue()
+                && snippet.getFeatures().containsKey(
+                        entityAnnotationType + "_unit_id")) {
+          continue;
+        }
         AnnotationSet snippetTokens =
                 Utils.getContainedAnnotations(tokens, snippet);
         String detail = null;

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Comprehensive Server Monitoring with Site24x7.
Monitor 10 servers for $9/Month.
Get alerted through email, SMS, voice calls or mobile push notifications.
Take corrective actions from your mobile device.
http://p.sf.net/sfu/Zoho
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to