ContentItemMentionBuilder.java

rwesten Sat, 30 Nov 2013 01:27:35 -0800

Author: rwesten
Date: Sat Nov 30 09:26:23 2013
New Revision: 1546706

URL: http://svn.apache.org/r1546706
Log:
STANBOL-1219: fixed remaining issues as described in the 2nd comment of the issue


Modified:
    
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
    
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java

Modified: 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java?rev=1546706&r1=1546705&r2=1546706&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
 Sat Nov 30 09:26:23 2013
@@ -289,8 +289,20 @@ public class EntityCoMentionEngine exten
                 new Object []{ci.getUri().getUnicodeString(), language, 
StringUtils.abbreviate(at.getSpan(), 100)});
         }
         //create the in-memory database for the mentioned Entities
-        ContentItemMentionBuilder entityMentionIndex = new 
ContentItemMentionBuilder(ci, 
+        ContentItemMentionBuilder entityMentionIndex = new 
ContentItemMentionBuilder(
             labelTokenizer, language, linkerConfig.getDefaultLanguage());
+        MGraph metadata = ci.getMetadata();
+        Set<UriRef> textAnnotations = new HashSet<UriRef>();
+        ci.getLock().readLock().lock();
+        try { //iterate over all TextAnnotations (mentions of Entities)
+            for(Iterator<Triple> it = metadata.filter(null, RDF_TYPE, 
ENHANCER_TEXTANNOTATION); it.hasNext();){
+                UriRef ta = (UriRef)it.next().getSubject();
+                entityMentionIndex.registerTextAnnotation(ta, metadata);
+                textAnnotations.add(ta); //store the registered text 
annotations
+            }
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
         EntityLinker entityLinker = new EntityLinker(at,language, 
             languageConfig, entityMentionIndex, linkerConfig, 
labelTokenizer,entityMentionIndex);
         //process
@@ -303,33 +315,37 @@ public class EntityCoMentionEngine exten
         //TODO: write results
         ci.getLock().writeLock().lock();
         try {
-            writeComentions(ci,entityLinker.getLinkedEntities().values(), 
language);
+            writeComentions(ci,entityLinker.getLinkedEntities().values(), 
language, textAnnotations);
         } finally {
             ci.getLock().writeLock().unlock();
         }
     }
 
-    private void writeComentions(ContentItem ci,Collection<LinkedEntity> 
comentions, String language) {
+    private void writeComentions(ContentItem ci,Collection<LinkedEntity> 
comentions, String language,
+            Set<UriRef> textAnnotations) {
         Language languageObject = null;
         if(language != null && !language.isEmpty()){
             languageObject = new Language(language);
         }
         
         MGraph metadata = ci.getMetadata();
-        
+        //we MUST adjust the confidence level of existing annotations only once
+        //so we need to keep track of those
+        Set<NonLiteral> adjustedSuggestions = new HashSet<NonLiteral>();
         log.debug("Write Co-Mentions:");
         for(LinkedEntity comention : comentions){
             log.debug(" > {}",comention);
             //URIs of TextAnnotations for the initial mention of this 
co-mention
-            Collection<UriRef> initialMentions = new 
ArrayList<UriRef>(comention.getOccurrences().size());
+            Collection<UriRef> initialMentions = new 
ArrayList<UriRef>(comention.getSuggestions().size());
             for(Suggestion suggestion : comention.getSuggestions()){
                 Entity entity = suggestion.getEntity();
-                
if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
+                if(textAnnotations.contains(entity.getUri())){
+//                
if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
                     //this is a textAnnotation
                     initialMentions.add(entity.getUri());
                 } //else TODO support also Entities!!
             }
-            //first create the TextAnnotations for the co-mention
+            //create the TextAnnotations for the co-mention
             for(Occurrence occurrence : comention.getOccurrences()){
                 Literal startLiteral = 
literalFactory.createTypedLiteral(occurrence.getStart());
                 Literal endLiteral = 
literalFactory.createTypedLiteral(occurrence.getEnd());
@@ -341,8 +357,8 @@ public class EntityCoMentionEngine exten
                 while(it.hasNext()){
                     Triple t = it.next();
                     Integer end = EnhancementEngineHelper.get(metadata, 
t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
-                    if(end != null &&
-                            metadata.filter(t.getSubject(), RDF_TYPE, 
ENHANCER_TEXTANNOTATION).hasNext()){
+                    if(end != null && 
textAnnotations.contains(t.getSubject())){
+                            //metadata.filter(t.getSubject(), RDF_TYPE, 
ENHANCER_TEXTANNOTATION).hasNext()){
                         textAnnotation = (UriRef)t.getSubject();
                         if(end > occurrence.getEnd()){
                             // there is an other TextAnnotation selecting a 
bigger Span
@@ -355,8 +371,8 @@ public class EntityCoMentionEngine exten
                 while(it.hasNext()){
                     Triple t = it.next();
                     Integer start = EnhancementEngineHelper.get(metadata, 
t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
-                    if(start != null &&
-                            metadata.filter(t.getSubject(), RDF_TYPE, 
ENHANCER_TEXTANNOTATION).hasNext()){
+                    if(start != null && 
textAnnotations.contains(t.getSubject())){
+                            //metadata.filter(t.getSubject(), RDF_TYPE, 
ENHANCER_TEXTANNOTATION).hasNext()){
                         textAnnotation = (UriRef)t.getSubject();
                         if(start < occurrence.getStart()){
                             // there is an other TextAnnotation selecting a 
bigger Span
@@ -367,10 +383,11 @@ public class EntityCoMentionEngine exten
                 }
                 if(!ignore){
                     //collect confidence values of co-mentions
-                    Double maxConfidence = null;
-                    Double maxExistingConfidence = null;
+                    Double maxConfidence = null; //maximum confidence of 
suggestions of the initial mention
+                    Double maxExistingConfidence = null; //maximum confidence 
of existing suggestions
                     if(textAnnotation == null){ //not found ... create a new 
TextAnnotation for the co-mention
                         textAnnotation = 
EnhancementEngineHelper.createTextEnhancement(ci, this);
+                        textAnnotations.add(textAnnotation); //add it to the 
set of TextAnnotations
                         metadata.add(new TripleImpl(textAnnotation, 
                             Properties.ENHANCER_START, 
                             startLiteral));
@@ -386,9 +403,8 @@ public class EntityCoMentionEngine exten
                     } else { //if existing add this engine as contributor
                         metadata.add(new TripleImpl(textAnnotation, 
DC_CONTRIBUTOR, 
                             new PlainLiteralImpl(this.getClass().getName())));
-                        //consider the confidence value of the existing 
TextAnnotation
-                        maxConfidence = EnhancementEngineHelper.get(metadata, 
textAnnotation, 
-                            ENHANCER_CONFIDENCE, Double.class, literalFactory);
+                        //maxConfidence = 
EnhancementEngineHelper.get(metadata, textAnnotation, 
+                        //    ENHANCER_CONFIDENCE, Double.class, 
literalFactory);
                     }
                     //now process initial mention(s) for the co-mention
                     Set<UriRef> dcTypes = new HashSet<UriRef>();
@@ -398,7 +414,7 @@ public class EntityCoMentionEngine exten
                         while(dcTypesIt.hasNext()){
                             dcTypes.add(dcTypesIt.next());
                         }
-                        //check confidence of the initial one
+                        //check confidence of the initial mention 
(fise:TextAnnotation)
                         Double confidnece = 
EnhancementEngineHelper.get(metadata, initialMention, 
                             ENHANCER_CONFIDENCE, Double.class, literalFactory);
                         if(confidnece != null){
@@ -408,15 +424,82 @@ public class EntityCoMentionEngine exten
                                 maxConfidence = confidnece;
                             }
                         }
+                        //now we need to compare the suggestions of the initial
+                        //mention(s) with the existing one. 
+                        //Get information about the suggestions of the initial 
mention
+                        Map<Resource,Double> initialSuggestions = new 
HashMap<Resource,Double>();
+                        Map<Resource, Resource> initialSuggestedEntities = new 
HashMap<Resource,Resource>();
+                        for(Iterator<Triple> suggestions = 
metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
+                            if(!textAnnotations.contains(suggestions)) {
+                                NonLiteral suggestion = 
suggestions.next().getSubject();
+                                Resource suggestedEntity = 
EnhancementEngineHelper.getReference(metadata, suggestion, 
ENHANCER_ENTITY_REFERENCE);
+                                if(suggestedEntity != null){ //it has a 
suggestion
+                                    Double confidence = 
EnhancementEngineHelper.get(
+                                        metadata, suggestion, 
ENHANCER_CONFIDENCE, Double.class, literalFactory);
+                                    if(maxConfidence == null){
+                                        maxConfidence = confidence;
+                                    } else 
if(maxConfidence.compareTo(confidnece) <= 0){
+                                        maxConfidence = confidnece;
+                                    }
+                                    
initialSuggestions.put(suggestion,confidence);
+                                    
initialSuggestedEntities.put(suggestedEntity, suggestion);
+                                } //no suggestion (dc:relation to some other 
resource)
+                            } // else ignore dc:relation to other 
fise:TextAnnotations
+                        }
+                        //now we collect existing Suggestions for this 
TextAnnotation where we need
+                        //to adjust the confidence (quite some things to check 
....)
                         Map<NonLiteral, Double> existingSuggestions = new 
HashMap<NonLiteral,Double>();
                        if(maxConfidence != null && confidenceAdjustmentFactor 
< 1){
-                               //adapt confidence of existing annotations
+                           //suggestions are defined by incoming dc:relation
                                for(Iterator<Triple> esIt = 
metadata.filter(null, DC_RELATION, textAnnotation);esIt.hasNext();){
                                        NonLiteral existingSuggestion = 
esIt.next().getSubject();
-                                       
existingSuggestions.put(existingSuggestion,
-                                                       
EnhancementEngineHelper.get(metadata, existingSuggestion, 
-                                                                       
ENHANCER_CONFIDENCE, Double.class, literalFactory));
-                               }
+                                       //but not all of them are suggestions
+                                       
if(!textAnnotations.contains(existingSuggestion)) { //ignore 
fise:TextAnnotations
+                                       Double existingConfidence = 
EnhancementEngineHelper.get(metadata, existingSuggestion, 
+                                        ENHANCER_CONFIDENCE, Double.class, 
literalFactory);
+                                       //ignore fise:TextAnnotations also 
suggested for the initial mention
+                                    
if(!initialSuggestions.containsKey(existingSuggestion)){
+                                        Resource suggestedEntity = 
EnhancementEngineHelper.getReference(metadata, existingSuggestion, 
ENHANCER_ENTITY_REFERENCE);
+                                        //we might also have different 
fise:TextAnnotations that
+                                        //fise:entity-reference to an Entity 
present in the
+                                        //suggestions for the initial mention
+                                        
if(!initialSuggestedEntities.containsKey(suggestedEntity)){
+                                            //finally make sure that we adjust 
confidences only once
+                                            
if(!adjustedSuggestions.contains(existingSuggestion)){
+                                                
existingSuggestions.put(existingSuggestion, existingConfidence);
+                                            } //else confidence already 
adjusted
+                                        } else { // different 
fise:EntityAnnotation, but same reference Entity
+                                            //we need to check confidences to 
decide what to do
+                                            Resource initialSuggestion = 
initialSuggestedEntities.get(suggestedEntity);
+                                            Double initialConfidence = 
initialSuggestions.get(initialSuggestion);
+                                            if((existingConfidence == null && 
initialConfidence == null) ||
+                                                    (existingConfidence != 
null && 
+                                                    
existingConfidence.compareTo(initialConfidence) >= 0)){
+                                                //existing confidence >= 
initial .. keep existing
+                                                
initialSuggestions.remove(initialSuggestion); 
+                                                if(maxExistingConfidence == 
null){
+                                                    maxExistingConfidence = 
existingConfidence;
+                                                } else 
if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+                                                    maxExistingConfidence = 
existingConfidence;
+                                                }
+                                            } else { //initial has higher 
confidence
+                                                //adjust this one (if not yet 
adjusted)
+                                                
if(!adjustedSuggestions.contains(existingSuggestion)){
+                                                    
existingSuggestions.put(existingSuggestion, existingConfidence);
+                                                } 
+                                            }
+                                        }
+                                    } else { //a initial mention already 
present
+                                        //no need to process initial mention
+                                        
initialSuggestions.remove(existingSuggestion);
+                                        if(maxExistingConfidence == null){
+                                            maxExistingConfidence = 
existingConfidence;
+                                        } else 
if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+                                            maxExistingConfidence = 
existingConfidence;
+                                        }
+                                    }
+                                       } //else ignore dc:relations to other 
fise:TextAnnotations
+                               }
                                for(Entry<NonLiteral,Double> entry : 
existingSuggestions.entrySet()){
                                        if(entry.getValue() != null){
                                                double adjustedConfidence = 
entry.getValue() * confidenceAdjustmentFactor;
@@ -425,15 +508,12 @@ public class EntityCoMentionEngine exten
                                                }
                                                
EnhancementEngineHelper.set(metadata, entry.getKey(), 
                                                                
ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
+                                               
adjustedSuggestions.add(entry.getKey()); //mark as adjusted
                                        }
                                }
                        }
-                        //add the suggestions of the initial mention to this 
one
-                        Set<Resource> values = new HashSet<Resource>();
-                        for(Iterator<Triple> suggestions = 
metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
-                            values.add(suggestions.next().getSubject());
-                        }
-                        for(Resource suggestion : values){
+                       //add the suggestions of the initial mention to this one
+                        for(Resource suggestion : initialSuggestions.keySet()){
                             metadata.add(new 
TripleImpl((NonLiteral)suggestion, DC_RELATION, textAnnotation));
     
                         }

Modified: 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java?rev=1546706&r1=1546705&r2=1546706&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
 Sat Nov 30 09:26:23 2013
@@ -28,12 +28,15 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
 import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
@@ -50,51 +53,36 @@ public class ContentItemMentionBuilder e
     private static final Logger log = 
LoggerFactory.getLogger(ContentItemMentionBuilder.class);
     private static final LiteralFactory lf = LiteralFactory.getInstance();
     
-    private ContentItem ci;
     /**
      * The last index notified via {@link #startToken(Token)}
      */
     private Integer lastIndex = 0; 
     
     private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new 
TreeMap<Integer,Collection<EntityMention>>();
-
     
-    public ContentItemMentionBuilder(ContentItem ci, LabelTokenizer 
labelTokenizer,
-            String...languages){
+    public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, 
String...languages){
         super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, 
languages);
-        this.ci = ci;
-        ci.getLock().readLock().lock();
-        try {
-            initContext();
-        } finally {
-            ci.getLock().readLock().unlock();
-        }
     }
 
-
-    private void initContext() {
-        MGraph m = ci.getMetadata();
-        for(Iterator<Triple> it = m.filter(null, RDF_TYPE, 
ENHANCER_TEXTANNOTATION); it.hasNext();){
-            UriRef ta = (UriRef)it.next().getSubject();
-            String selectedText = EnhancementEngineHelper.getString(m, ta, 
ENHANCER_SELECTED_TEXT);
-            if(selectedText != null){
-                //NOTE: Typically it is not possible to find co-mentions for 
Entities with a
-                //      single Token, so can ignore those.
-                //      The only exception would be to use proper-nouns for 
initial linking and
-                //      Nouns for the co-mention resolution. In such cases 
this might result
-                //      in additional extractions.
-                String[] tokens = tokenizer.tokenize(selectedText, language);
-                if(tokens.length > 1){ //TODO make configurable
-                    Double confidence = 
EnhancementEngineHelper.get(m,ta,ENHANCER_CONFIDENCE,Double.class,lf);
-                    if(confidence == null || confidence > 0.85){ //TODO make 
configurable
-                        Integer start = 
EnhancementEngineHelper.get(m,ta,ENHANCER_START,Integer.class,lf);
-                        Integer end = 
EnhancementEngineHelper.get(m,ta,ENHANCER_END,Integer.class,lf);
-                        registerMention(new EntityMention(ta,m, 
ENHANCER_SELECTED_TEXT, DC_TYPE, 
-                            start != null && end != null ? new 
Integer[]{start,end} : null));
-                    } // else confidence to low
-                } //else ignore Tokens with a single token
-            } // else no selected text
-        }
+    public void registerTextAnnotation(UriRef textAnnotation, TripleCollection 
metadata){
+        String selectedText = EnhancementEngineHelper.getString(metadata, 
textAnnotation, ENHANCER_SELECTED_TEXT);
+        if(selectedText != null){
+            //NOTE: Typically it is not possible to find co-mentions for 
Entities with a
+            //      single Token, so can ignore those.
+            //      The only exception would be to use proper-nouns for 
initial linking and
+            //      Nouns for the co-mention resolution. In such cases this 
might result
+            //      in additional extractions.
+            String[] tokens = tokenizer.tokenize(selectedText, language);
+            if(tokens.length > 1){ //TODO make configurable
+                Double confidence = 
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf);
+                if(confidence == null || confidence > 0.85){ //TODO make 
configurable
+                    Integer start = 
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf);
+                    Integer end = 
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf);
+                    registerMention(new EntityMention(textAnnotation,metadata, 
ENHANCER_SELECTED_TEXT, DC_TYPE, 
+                        start != null && end != null ? new 
Integer[]{start,end} : null));
+                } // else confidence to low
+            } //else ignore Tokens with a single token
+        } // else no selected text
     }
 
     private void registerMention(EntityMention entityMention){

svn commit: r1546706 - in /stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention: EntityCoMentionEngine.java impl/ContentItemMentionBuilder.java

Reply via email to