Revision: 18395 http://sourceforge.net/p/gate/code/18395 Author: ian_roberts Date: 2014-10-20 17:11:31 +0000 (Mon, 20 Oct 2014) Log Message: ----------- Added "skipExisting" parameter to skip annotations that already have a unit ID feature. If a job builder fails part way through (e.g. with an HTTP error from the API) then this lets you re-run the build without creating duplicate units for annotations that already have one.
Modified Paths: -------------- gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java Modified: gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java =================================================================== --- gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java 2014-10-19 01:19:49 UTC (rev 18394) +++ gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/classification/EntityClassificationJobBuilder.java 2014-10-20 17:11:31 UTC (rev 18395) @@ -36,29 +36,34 @@ import gate.crowdsource.rest.CrowdFlowerClient; import gate.gui.ActionsPublisher; +import static gate.crowdsource.CrowdFlowerConstants.UNIT_ID_FEATURE_NAME; + import org.apache.log4j.Logger; -@CreoleResource(name = "Entity Classification Job Builder", - comment = "Build a CrowdFlower job asking users to select the right label for entities", - helpURL = "http://gate.ac.uk/userguide/sec:crowd:classification") -public class EntityClassificationJobBuilder extends AbstractLanguageAnalyser implements ActionsPublisher { +@CreoleResource(name = "Entity Classification Job Builder", comment = "Build a CrowdFlower job asking users to select the right label for entities", helpURL = "http://gate.ac.uk/userguide/sec:crowd:classification") +public class EntityClassificationJobBuilder extends AbstractLanguageAnalyser + implements + ActionsPublisher { - private static final Logger log = Logger.getLogger(EntityClassificationJobBuilder.class); + private static final Logger log = Logger + .getLogger(EntityClassificationJobBuilder.class); private static final long serialVersionUID = -1584716901194104888L; private String apiKey; - + private Long jobId; - + private String contextAnnotationType; - + private String contextASName; - + private String entityAnnotationType; - + 
private String entityASName; - + + private Boolean skipExisting; + protected CrowdFlowerClient crowdFlowerClient; public String getApiKey() { @@ -122,6 +127,18 @@ this.entityASName = entityASName; } + public Boolean getSkipExisting() { + return skipExisting; + } + + @Optional + @RunTime + @CreoleParameter(defaultValue = "true", comment = "Should we skip snippets that already " + + "have a feature indicating that they have been processed before?") + public void setSkipExisting(Boolean skipExisting) { + this.skipExisting = skipExisting; + } + @Override public Resource init() throws ResourceInstantiationException { if(apiKey == null || "".equals(apiKey)) { @@ -139,30 +156,45 @@ if(jobId == null || jobId.longValue() <= 0) { throw new ExecutionException("Job ID must be provided"); } - + AnnotationSet entityAS = getDocument().getAnnotations(entityASName); - AnnotationSet contextAnnotations = getDocument().getAnnotations(contextASName) - .get(contextAnnotationType); - - List<Annotation> allEntities = Utils.inDocumentOrder(entityAS.get(entityAnnotationType)); - fireStatusChanged("Creating CrowdFlower units for " + allEntities.size() + " " - + entityAnnotationType + " annotations for classification task "); + AnnotationSet contextAnnotations = + getDocument().getAnnotations(contextASName).get( + contextAnnotationType); + List<Annotation> allEntities = + Utils.inDocumentOrder(entityAS.get(entityAnnotationType)); + fireStatusChanged("Creating CrowdFlower units for " + allEntities.size() + + " " + entityAnnotationType + + " annotations for classification task "); + int entityIdx = 0; for(Annotation entity : allEntities) { fireProgressChanged((100 * entityIdx++) / allEntities.size()); if(isInterrupted()) throw new ExecutionInterruptedException(); - AnnotationSet thisEntityContext = Utils.getCoveringAnnotations(contextAnnotations, entity); + // skip existing units, if so configured + if(skipExisting != null && skipExisting.booleanValue() + && 
entity.getFeatures().containsKey(UNIT_ID_FEATURE_NAME)) { + continue; + } + + AnnotationSet thisEntityContext = + Utils.getCoveringAnnotations(contextAnnotations, entity); if(thisEntityContext.isEmpty()) { - log.warn(entityAnnotationType + " with ID " + entity.getId() + - " at offsets (" + Utils.start(entity) + ":" + Utils.end(entity) + - ") in document " + getDocument().getName() + - " has no surrounding " + contextAnnotationType + " - ignored"); + log.warn(entityAnnotationType + " with ID " + entity.getId() + + " at offsets (" + Utils.start(entity) + ":" + + Utils.end(entity) + ") in document " + + getDocument().getName() + " has no surrounding " + + contextAnnotationType + " - ignored"); } else { - // get the "closest" context, i.e. the shortest annotation in the covering set. + // get the "closest" context, i.e. the shortest annotation in + // the covering set. // usually we'd expect this set to contain just one annotation - Annotation context = Collections.min(thisEntityContext, ANNOTATION_LENGTH_COMPARATOR); - crowdFlowerClient.createClassificationUnit(jobId, getDocument(), entityASName, context, entity); + Annotation context = + Collections.min(thisEntityContext, + ANNOTATION_LENGTH_COMPARATOR); + crowdFlowerClient.createClassificationUnit(jobId, getDocument(), + entityASName, context, entity); } } fireProcessFinished(); @@ -171,15 +203,16 @@ interrupted = false; } } - - private static final Comparator<Annotation> ANNOTATION_LENGTH_COMPARATOR = new Comparator<Annotation>() { - public int compare(Annotation a1, Annotation a2) { - return Utils.length(a1) - Utils.length(a2); - } - }; - + + private static final Comparator<Annotation> ANNOTATION_LENGTH_COMPARATOR = + new Comparator<Annotation>() { + public int compare(Annotation a1, Annotation a2) { + return Utils.length(a1) - Utils.length(a2); + } + }; + private List<Action> actions = null; - + public List<Action> getActions() { if(actions == null) { actions = new ArrayList<Action>(); Modified: 
gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java =================================================================== --- gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java 2014-10-19 01:19:49 UTC (rev 18394) +++ gate/trunk/plugins/Crowd_Sourcing/src/gate/crowdsource/ne/EntityAnnotationJobBuilder.java 2014-10-20 17:11:31 UTC (rev 18395) @@ -33,10 +33,8 @@ import javax.swing.Action; -@CreoleResource(name = "Entity Annotation Job Builder", - comment = "Build a CrowdFlower job asking users to annotate entities " - + "within a snippet of text", - helpURL = "http://gate.ac.uk/userguide/sec:crowd:annotation") +@CreoleResource(name = "Entity Annotation Job Builder", comment = "Build a CrowdFlower job asking users to annotate entities " + + "within a snippet of text", helpURL = "http://gate.ac.uk/userguide/sec:crowd:annotation") public class EntityAnnotationJobBuilder extends AbstractLanguageAnalyser implements ActionsPublisher { @@ -67,6 +65,8 @@ private String entityASName; + private Boolean skipExisting; + protected CrowdFlowerClient crowdFlowerClient; public String getApiKey() { @@ -203,6 +203,18 @@ this.goldReasonFeatureName = goldReasonFeatureName; } + public Boolean getSkipExisting() { + return skipExisting; + } + + @Optional + @RunTime + @CreoleParameter(defaultValue = "true", comment = "Should we skip snippets that already " + + "have a feature indicating that they have been processed before?") + public void setSkipExisting(Boolean skipExisting) { + this.skipExisting = skipExisting; + } + @Override public Resource init() throws ResourceInstantiationException { if(apiKey == null || "".equals(apiKey)) { @@ -240,6 +252,13 @@ for(Annotation snippet : allSnippets) { fireProgressChanged((100 * snippetIdx++) / allSnippets.size()); if(isInterrupted()) throw new ExecutionInterruptedException(); + // skip existing units, if so configured + if(skipExisting != null + && skipExisting.booleanValue() 
+ && snippet.getFeatures().containsKey( + entityAnnotationType + "_unit_id")) { + continue; + } AnnotationSet snippetTokens = Utils.getContainedAnnotations(tokens, snippet); String detail = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Comprehensive Server Monitoring with Site24x7. Monitor 10 servers for $9/Month. Get alerted through email, SMS, voice calls or mobile push notifications. Take corrective actions from your mobile device. http://p.sf.net/sfu/Zoho _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs