Author: rwesten
Date: Sat Nov 30 09:26:23 2013
New Revision: 1546706
URL: http://svn.apache.org/r1546706
Log:
STANBOL-1219: fixed remaining issues as described in the 2nd comment of the
issue
Modified:
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
Modified:
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java?rev=1546706&r1=1546705&r2=1546706&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
Sat Nov 30 09:26:23 2013
@@ -289,8 +289,20 @@ public class EntityCoMentionEngine exten
new Object []{ci.getUri().getUnicodeString(), language,
StringUtils.abbreviate(at.getSpan(), 100)});
}
//create the in-memory database for the mentioned Entities
- ContentItemMentionBuilder entityMentionIndex = new
ContentItemMentionBuilder(ci,
+ ContentItemMentionBuilder entityMentionIndex = new
ContentItemMentionBuilder(
labelTokenizer, language, linkerConfig.getDefaultLanguage());
+ MGraph metadata = ci.getMetadata();
+ Set<UriRef> textAnnotations = new HashSet<UriRef>();
+ ci.getLock().readLock().lock();
+ try { //iterate over all TextAnnotations (mentions of Entities)
+ for(Iterator<Triple> it = metadata.filter(null, RDF_TYPE,
ENHANCER_TEXTANNOTATION); it.hasNext();){
+ UriRef ta = (UriRef)it.next().getSubject();
+ entityMentionIndex.registerTextAnnotation(ta, metadata);
+ textAnnotations.add(ta); //store the registered text
annotations
+ }
+ } finally {
+ ci.getLock().readLock().unlock();
+ }
EntityLinker entityLinker = new EntityLinker(at,language,
languageConfig, entityMentionIndex, linkerConfig,
labelTokenizer,entityMentionIndex);
//process
@@ -303,33 +315,37 @@ public class EntityCoMentionEngine exten
//TODO: write results
ci.getLock().writeLock().lock();
try {
- writeComentions(ci,entityLinker.getLinkedEntities().values(),
language);
+ writeComentions(ci,entityLinker.getLinkedEntities().values(),
language, textAnnotations);
} finally {
ci.getLock().writeLock().unlock();
}
}
- private void writeComentions(ContentItem ci,Collection<LinkedEntity>
comentions, String language) {
+ private void writeComentions(ContentItem ci,Collection<LinkedEntity>
comentions, String language,
+ Set<UriRef> textAnnotations) {
Language languageObject = null;
if(language != null && !language.isEmpty()){
languageObject = new Language(language);
}
MGraph metadata = ci.getMetadata();
-
+ //we MUST adjust the confidence level of existing annotations only once
+ //so we need to keep track of those
+ Set<NonLiteral> adjustedSuggestions = new HashSet<NonLiteral>();
log.debug("Write Co-Mentions:");
for(LinkedEntity comention : comentions){
log.debug(" > {}",comention);
//URIs of TextAnnotations for the initial mention of this
co-mention
- Collection<UriRef> initialMentions = new
ArrayList<UriRef>(comention.getOccurrences().size());
+ Collection<UriRef> initialMentions = new
ArrayList<UriRef>(comention.getSuggestions().size());
for(Suggestion suggestion : comention.getSuggestions()){
Entity entity = suggestion.getEntity();
-
if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
+ if(textAnnotations.contains(entity.getUri())){
+//
if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
//this is a textAnnotation
initialMentions.add(entity.getUri());
} //else TODO support also Entities!!
}
- //first create the TextAnnotations for the co-mention
+ //create the TextAnnotations for the co-mention
for(Occurrence occurrence : comention.getOccurrences()){
Literal startLiteral =
literalFactory.createTypedLiteral(occurrence.getStart());
Literal endLiteral =
literalFactory.createTypedLiteral(occurrence.getEnd());
@@ -341,8 +357,8 @@ public class EntityCoMentionEngine exten
while(it.hasNext()){
Triple t = it.next();
Integer end = EnhancementEngineHelper.get(metadata,
t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
- if(end != null &&
- metadata.filter(t.getSubject(), RDF_TYPE,
ENHANCER_TEXTANNOTATION).hasNext()){
+ if(end != null &&
textAnnotations.contains(t.getSubject())){
+ //metadata.filter(t.getSubject(), RDF_TYPE,
ENHANCER_TEXTANNOTATION).hasNext()){
textAnnotation = (UriRef)t.getSubject();
if(end > occurrence.getEnd()){
// there is an other TextAnnotation selecting a
bigger Span
@@ -355,8 +371,8 @@ public class EntityCoMentionEngine exten
while(it.hasNext()){
Triple t = it.next();
Integer start = EnhancementEngineHelper.get(metadata,
t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
- if(start != null &&
- metadata.filter(t.getSubject(), RDF_TYPE,
ENHANCER_TEXTANNOTATION).hasNext()){
+ if(start != null &&
textAnnotations.contains(t.getSubject())){
+ //metadata.filter(t.getSubject(), RDF_TYPE,
ENHANCER_TEXTANNOTATION).hasNext()){
textAnnotation = (UriRef)t.getSubject();
if(start < occurrence.getStart()){
// there is an other TextAnnotation selecting a
bigger Span
@@ -367,10 +383,11 @@ public class EntityCoMentionEngine exten
}
if(!ignore){
//collect confidence values of co-mentions
- Double maxConfidence = null;
- Double maxExistingConfidence = null;
+ Double maxConfidence = null; //maximum confidence of
suggestions of the initial mention
+ Double maxExistingConfidence = null; //maximum confidence
of existing suggestions
if(textAnnotation == null){ //not found ... create a new
TextAnnotation for the co-mention
textAnnotation =
EnhancementEngineHelper.createTextEnhancement(ci, this);
+ textAnnotations.add(textAnnotation); //add it to the
set of TextAnnotations
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_START,
startLiteral));
@@ -386,9 +403,8 @@ public class EntityCoMentionEngine exten
} else { //if existing add this engine as contributor
metadata.add(new TripleImpl(textAnnotation,
DC_CONTRIBUTOR,
new PlainLiteralImpl(this.getClass().getName())));
- //consider the confidence value of the existing
TextAnnotation
- maxConfidence = EnhancementEngineHelper.get(metadata,
textAnnotation,
- ENHANCER_CONFIDENCE, Double.class, literalFactory);
+ //maxConfidence =
EnhancementEngineHelper.get(metadata, textAnnotation,
+ // ENHANCER_CONFIDENCE, Double.class,
literalFactory);
}
//now process initial mention(s) for the co-mention
Set<UriRef> dcTypes = new HashSet<UriRef>();
@@ -398,7 +414,7 @@ public class EntityCoMentionEngine exten
while(dcTypesIt.hasNext()){
dcTypes.add(dcTypesIt.next());
}
- //check confidence of the initial one
+ //check confidence of the initial mention
(fise:TextAnnotation)
Double confidnece =
EnhancementEngineHelper.get(metadata, initialMention,
ENHANCER_CONFIDENCE, Double.class, literalFactory);
if(confidnece != null){
@@ -408,15 +424,82 @@ public class EntityCoMentionEngine exten
maxConfidence = confidnece;
}
}
+ //now we need to compare the suggestions of the initial
+ //mention(s) with the existing one.
+ //Get information about the suggestions of the initial
mention
+ Map<Resource,Double> initialSuggestions = new
HashMap<Resource,Double>();
+ Map<Resource, Resource> initialSuggestedEntities = new
HashMap<Resource,Resource>();
+ for(Iterator<Triple> suggestions =
metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
+ if(!textAnnotations.contains(suggestions)) {
+ NonLiteral suggestion =
suggestions.next().getSubject();
+ Resource suggestedEntity =
EnhancementEngineHelper.getReference(metadata, suggestion,
ENHANCER_ENTITY_REFERENCE);
+ if(suggestedEntity != null){ //it has a
suggestion
+ Double confidence =
EnhancementEngineHelper.get(
+ metadata, suggestion,
ENHANCER_CONFIDENCE, Double.class, literalFactory);
+ if(maxConfidence == null){
+ maxConfidence = confidence;
+ } else
if(maxConfidence.compareTo(confidnece) <= 0){
+ maxConfidence = confidnece;
+ }
+
initialSuggestions.put(suggestion,confidence);
+
initialSuggestedEntities.put(suggestedEntity, suggestion);
+ } //no suggestion (dc:relation to some other
resource)
+ } // else ignore dc:relation to other
fise:TextAnnotations
+ }
+ //now we collect existing Suggestions for this
TextAnnotation where we need
+ //to adjust the confidence (quite some things to check
....)
Map<NonLiteral, Double> existingSuggestions = new
HashMap<NonLiteral,Double>();
if(maxConfidence != null && confidenceAdjustmentFactor
< 1){
- //adapt confidence of existing annotations
+ //suggestions are defined by incoming dc:relation
for(Iterator<Triple> esIt =
metadata.filter(null, DC_RELATION, textAnnotation);esIt.hasNext();){
NonLiteral existingSuggestion =
esIt.next().getSubject();
-
existingSuggestions.put(existingSuggestion,
-
EnhancementEngineHelper.get(metadata, existingSuggestion,
-
ENHANCER_CONFIDENCE, Double.class, literalFactory));
- }
+ //but not all of them are suggestions
+
if(!textAnnotations.contains(existingSuggestion)) { //ignore
fise:TextAnnotations
+ Double existingConfidence =
EnhancementEngineHelper.get(metadata, existingSuggestion,
+ ENHANCER_CONFIDENCE, Double.class,
literalFactory);
+ //ignore fise:TextAnnotations also
suggested for the initial mention
+
if(!initialSuggestions.containsKey(existingSuggestion)){
+ Resource suggestedEntity =
EnhancementEngineHelper.getReference(metadata, existingSuggestion,
ENHANCER_ENTITY_REFERENCE);
+ //we might also have different
fise:TextAnnotations that
+ //fise:entity-reference to an Entity
present in the
+ //suggestions for the initial mention
+
if(!initialSuggestedEntities.containsKey(suggestedEntity)){
+ //finally make sure that we adjust
confidences only once
+
if(!adjustedSuggestions.contains(existingSuggestion)){
+
existingSuggestions.put(existingSuggestion, existingConfidence);
+ } //else confidence already
adjusted
+ } else { // different
fise:EntityAnnotation, but same reference Entity
+ //we need to check confidences to
decide what to do
+ Resource initialSuggestion =
initialSuggestedEntities.get(suggestedEntity);
+ Double initialConfidence =
initialSuggestions.get(initialSuggestion);
+ if((existingConfidence == null &&
initialConfidence == null) ||
+ (existingConfidence !=
null &&
+
existingConfidence.compareTo(initialConfidence) >= 0)){
+ //existing confidence >=
initial .. keep existing
+
initialSuggestions.remove(initialSuggestion);
+ if(maxExistingConfidence ==
null){
+ maxExistingConfidence =
existingConfidence;
+ } else
if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+ maxExistingConfidence =
existingConfidence;
+ }
+ } else { //initial has higher
confidence
+ //adjust this one (if not yet
adjusted)
+
if(!adjustedSuggestions.contains(existingSuggestion)){
+
existingSuggestions.put(existingSuggestion, existingConfidence);
+ }
+ }
+ }
+ } else { //a initial mention already
present
+ //no need to process initial mention
+
initialSuggestions.remove(existingSuggestion);
+ if(maxExistingConfidence == null){
+ maxExistingConfidence =
existingConfidence;
+ } else
if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+ maxExistingConfidence =
existingConfidence;
+ }
+ }
+ } //else ignore dc:relations to other
fise:TextAnnotations
+ }
for(Entry<NonLiteral,Double> entry :
existingSuggestions.entrySet()){
if(entry.getValue() != null){
double adjustedConfidence =
entry.getValue() * confidenceAdjustmentFactor;
@@ -425,15 +508,12 @@ public class EntityCoMentionEngine exten
}
EnhancementEngineHelper.set(metadata, entry.getKey(),
ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
+
adjustedSuggestions.add(entry.getKey()); //mark as adjusted
}
}
}
- //add the suggestions of the initial mention to this
one
- Set<Resource> values = new HashSet<Resource>();
- for(Iterator<Triple> suggestions =
metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
- values.add(suggestions.next().getSubject());
- }
- for(Resource suggestion : values){
+ //add the suggestions of the initial mention to this one
+ for(Resource suggestion : initialSuggestions.keySet()){
metadata.add(new
TripleImpl((NonLiteral)suggestion, DC_RELATION, textAnnotation));
}
Modified:
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java?rev=1546706&r1=1546705&r2=1546706&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
(original)
+++
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
Sat Nov 30 09:26:23 2013
@@ -28,12 +28,15 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
@@ -50,51 +53,36 @@ public class ContentItemMentionBuilder e
private static final Logger log =
LoggerFactory.getLogger(ContentItemMentionBuilder.class);
private static final LiteralFactory lf = LiteralFactory.getInstance();
- private ContentItem ci;
/**
* The last index notified via {@link #startToken(Token)}
*/
private Integer lastIndex = 0;
private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new
TreeMap<Integer,Collection<EntityMention>>();
-
- public ContentItemMentionBuilder(ContentItem ci, LabelTokenizer
labelTokenizer,
- String...languages){
+ public ContentItemMentionBuilder(LabelTokenizer labelTokenizer,
String...languages){
super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD,
languages);
- this.ci = ci;
- ci.getLock().readLock().lock();
- try {
- initContext();
- } finally {
- ci.getLock().readLock().unlock();
- }
}
-
- private void initContext() {
- MGraph m = ci.getMetadata();
- for(Iterator<Triple> it = m.filter(null, RDF_TYPE,
ENHANCER_TEXTANNOTATION); it.hasNext();){
- UriRef ta = (UriRef)it.next().getSubject();
- String selectedText = EnhancementEngineHelper.getString(m, ta,
ENHANCER_SELECTED_TEXT);
- if(selectedText != null){
- //NOTE: Typically it is not possible to find co-mentions for
Entities with a
- // single Token, so can ignore those.
- // The only exception would be to use proper-nouns for
initial linking and
- // Nouns for the co-mention resolution. In such cases
this might result
- // in additional extractions.
- String[] tokens = tokenizer.tokenize(selectedText, language);
- if(tokens.length > 1){ //TODO make configurable
- Double confidence =
EnhancementEngineHelper.get(m,ta,ENHANCER_CONFIDENCE,Double.class,lf);
- if(confidence == null || confidence > 0.85){ //TODO make
configurable
- Integer start =
EnhancementEngineHelper.get(m,ta,ENHANCER_START,Integer.class,lf);
- Integer end =
EnhancementEngineHelper.get(m,ta,ENHANCER_END,Integer.class,lf);
- registerMention(new EntityMention(ta,m,
ENHANCER_SELECTED_TEXT, DC_TYPE,
- start != null && end != null ? new
Integer[]{start,end} : null));
- } // else confidence to low
- } //else ignore Tokens with a single token
- } // else no selected text
- }
+ public void registerTextAnnotation(UriRef textAnnotation, TripleCollection
metadata){
+ String selectedText = EnhancementEngineHelper.getString(metadata,
textAnnotation, ENHANCER_SELECTED_TEXT);
+ if(selectedText != null){
+ //NOTE: Typically it is not possible to find co-mentions for
Entities with a
+ // single Token, so can ignore those.
+ // The only exception would be to use proper-nouns for
initial linking and
+ // Nouns for the co-mention resolution. In such cases this
might result
+ // in additional extractions.
+ String[] tokens = tokenizer.tokenize(selectedText, language);
+ if(tokens.length > 1){ //TODO make configurable
+ Double confidence =
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf);
+ if(confidence == null || confidence > 0.85){ //TODO make
configurable
+ Integer start =
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf);
+ Integer end =
EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf);
+ registerMention(new EntityMention(textAnnotation,metadata,
ENHANCER_SELECTED_TEXT, DC_TYPE,
+ start != null && end != null ? new
Integer[]{start,end} : null));
+ } // else confidence too low
+ } //else ignore Tokens with a single token
+ } // else no selected text
}
private void registerMention(EntityMention entityMention){