Author: rwesten
Date: Mon Apr 15 05:31:34 2013
New Revision: 1467854
URL: http://svn.apache.org/r1467854
Log:
STANBOL-1030: implemented feature as described in the Issue. In addition the
EntityhubLinkingEngine now ensures that EntityRankings are included for
EntitySearcher results; STANBOL-1013: several improvements to the internal API
of ProcessingState (NLP based spotting of Entities); STANBOL-1011: Moved the
MIN_SEARCH_TOKEN_LENGTH property from the EntityLinking to the TextProcessing
configuration. This makes the ProcessingState (future NLP spotting) independent
of the EntityLinking configuration - a prerequisite for separating the
Spotting from Linking as intended by STANBOL-1013.
Modified:
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntitySearcherUtils.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/resources/log4j.properties
Modified:
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntitySearcherUtils.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntitySearcherUtils.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntitySearcherUtils.java
(original)
+++
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntitySearcherUtils.java
Mon Apr 15 05:31:34 2013
@@ -21,6 +21,7 @@ import java.util.Set;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory;
import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
@@ -60,6 +61,8 @@ public class EntitySearcherUtils {
query.addSelectedField(select.getUnicodeString());
}
}
+ //also add the entity rankings
+ query.addSelectedField(RdfResourceEnum.entityRank.getUri());
query.setLimit(20);//TODO make configurable
query.setConstraint(field.getUnicodeString(), new
TextConstraint(search, languages));
return query;
Modified:
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
Mon Apr 15 05:31:34 2013
@@ -20,12 +20,12 @@ import static org.apache.stanbol.enhance
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_DEREFERENCE_ENTITIES_STATE;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MATCHING_LANGUAGE;
-import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
+import static
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MIN_TOKEN_SCORE;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_SUGGESTIONS;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
-import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_SEARCH_TOKEN_LENGTH;
+import static
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
import static
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
@@ -43,7 +43,6 @@ import java.util.Dictionary;
import java.util.Hashtable;
import java.util.NavigableSet;
import java.util.NoSuchElementException;
-import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.felix.scr.annotations.Activate;
@@ -115,14 +114,17 @@ import org.slf4j.LoggerFactory;
"es;lc=Noun", //the OpenNLP POS tagger for Spanish does not
support ProperNouns
"nl;lc=Noun"}), //same for Dutch
@Property(name=DEFAULT_MATCHING_LANGUAGE,value=""),
- @Property(name=TYPE_MAPPINGS,cardinality=Integer.MAX_VALUE),
+ @Property(name=TYPE_MAPPINGS,cardinality=Integer.MAX_VALUE, value={
+ "dbp-ont:Organisation; dbp-ont:Newspaper; schema:Organization >
dbp-ont:Organisation",
+ "dbp-ont:Person; foaf:Person; schema:Person > dbp-ont:Person",
+ "dbp-ont:Place; schema:Place > dbp-ont:Place",
+ "dbp-ont:Work; schema:CreativeWork > dbp-ont:Work",
+ "dbp-ont:Event; schema:Event > dbp-ont:Event",
+ "schema:Product > schema:Product",
+ "skos:Concept > skos:Concept"}),
@Property(name=DEREFERENCE_ENTITIES,
boolValue=DEFAULT_DEREFERENCE_ENTITIES_STATE),
@Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
- value={"http://www.w3.org/2000/01/rdf-schema#comment",
- "http://www.w3.org/2003/01/geo/wgs84_pos#lat",
- "http://www.w3.org/2003/01/geo/wgs84_pos#long",
- "http://xmlns.com/foaf/0.1/depiction",
- "http://dbpedia.org/ontology/thumbnail"}),
+
value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
@Property(name=SERVICE_RANKING,intValue=0)
})
public class EntityhubLinkingEngine implements ServiceTrackerCustomizer {
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
Mon Apr 15 05:31:34 2013
@@ -162,13 +162,6 @@ public class EntityLinkerConfig {
*/
public static final String MIN_MATCH_FACTOR =
"enhancer.engines.linking.minMatchScore";
/**
- * Used as fallback in case a {@link Token} does not have a {@link PosTag}
or
- * {@link NlpAnnotations#POS_ANNOTATION POS annotations} do have a low
confidence.
- * In such cases only words that are longer than this value will be
considerd for
- * linking
- */
- public static final String MIN_SEARCH_TOKEN_LENGTH =
"enhancer.engines.linking.minSearchTokenLength";
- /**
* The maximum number of {@link Token} used as search terms with the
* {@link EntitySearcher#lookup(String, Set, java.util.List, String[],
Integer)}
* method
@@ -196,12 +189,12 @@ public class EntityLinkerConfig {
* Allows to add a list of fields that are included when dereferencing
Entities
*/
public static final String DEREFERENCE_ENTITIES_FIELDS =
"enhancer.engines.linking.dereferenceFields";
-
/**
- * The minimum length of Token to be used for searches in case no
- * POS (Part of Speech) tags are available.
+ * Allows to enable/disable sorting of suggestion that have the same score
+ * based on the entity ranking (popularity of the entity within the
knowledge base)
*/
- public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3;
+ public static final String RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS =
"enhancer.engines.linking.useEntityRankings";
+
/**
* The default number for the maximum number of terms suggested for a word
*/
@@ -339,10 +332,6 @@ public class EntityLinkerConfig {
*/
private boolean dereferenceEntitiesState;
/**
- * The minimum length of labels that are looked-up in the directory
- */
- private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
- /**
* The the maximum number of terms suggested for a word
*/
private int maxSuggestions = DEFAULT_SUGGESTIONS;
@@ -428,6 +417,13 @@ public class EntityLinkerConfig {
* By default Entities are dereferenced
*/
public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
+
+ /**
+ * The default value for the state if entities that would have the same
score
+ * should get their score slightly changed to ensure that entities with an
+ * higher ranking (popularity) do have an higher score.
+ */
+ public static final boolean
DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS = true;
/**
* If Tokens match is determined by comparing them using some algorithm.
* Results need to be in the range [0..1]. This factor defines the minimum
@@ -445,12 +441,13 @@ public class EntityLinkerConfig {
private double minTextScore = DEFAULT_MIN_TEXT_SCORE;
private double minMatchScore = DEFAULT_MIN_MATCH_SCORE;
+ private boolean rankEqualScoresBasedOnEntityRankings =
DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS;
+
/**
* Default constructor the initializes the configuration with the
* default values
*/
public EntityLinkerConfig(){
- setMinSearchTokenLength(DEFAULT_MIN_SEARCH_TOKEN_LENGTH);
setMaxSuggestions(DEFAULT_SUGGESTIONS);
setMaxSearchTokens(DEFAULT_MAX_SEARCH_TOKENS);
setRedirectProcessingMode(DEFAULT_REDIRECT_PROCESSING_MODE);
@@ -626,28 +623,7 @@ public class EntityLinkerConfig {
} catch (IllegalArgumentException e){
throw new ConfigurationException(MIN_MATCH_FACTOR, e.getMessage());
}
-
- // init MIN_SEARCH_TOKEN_LENGTH
- value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
- Integer minSearchTokenLength;
- if(value instanceof Integer){
- minSearchTokenLength = (Integer)value;
- } else if (value != null){
- try {
- minSearchTokenLength = Integer.valueOf(value.toString());
- } catch(NumberFormatException e){
- throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH,
"Values MUST be valid Integer values > 0",e);
- }
- } else {
- minSearchTokenLength = null;
- }
- if(minSearchTokenLength != null){
- if(minSearchTokenLength < 1){
- throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH,
"Values MUST be valid Integer values > 0");
- }
- linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
- }
-
+
//init LEMMA_MATCHING_STATE
value = configuration.get(LEMMA_MATCHING_STATE);
if(value instanceof Boolean){
@@ -845,6 +821,17 @@ public class EntityLinkerConfig {
}
}
+ //init USE ENTITY RANKINGS (STANBOL-1030)
+ value = configuration.get(RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS);
+ if(value instanceof Boolean){
+
linkerConfig.setRankEqualScoresBasedOnEntityRankings(((Boolean)value).booleanValue());
+ } else if (value != null){
+ linkerConfig.setRankEqualScoresBasedOnEntityRankings(
+ Boolean.parseBoolean(value.toString()));
+ } else {
+ linkerConfig.setRankEqualScoresBasedOnEntityRankings(
+ DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS);
+ }
}
@@ -926,26 +913,6 @@ public class EntityLinkerConfig {
__selectedFields = null;
}
/**
- * The minimum number of character a {@link Token} (word) must have to be
- * used {@link EntitySearcher#lookup(java.util.List, String...) lookup}
concepts
- * in the taxonomy. Note that this parameter is only used of no POS (Part-
- * of-speech) tags are available in the {@link AnalysedText}.
- * @param minSearchTokenLength the minSearchTokenLength to set
- */
- public void setMinSearchTokenLength(int minSearchTokenLength) {
- this.minSearchTokenLength = minSearchTokenLength;
- }
- /**
- * The minimum number of character a {@link Token} (word) must have to be
- * used {@link EntitySearcher#lookup(java.util.List, String...) lookup}
concepts
- * in the taxonomy. Note that this parameter is only used of no POS (Part-
- * of-speech) tags are available in the {@link AnalysedText}.
- * @return the minSearchTokenLength
- */
- public int getMinSearchTokenLength() {
- return minSearchTokenLength;
- }
- /**
* Setter for the maximum number of suggestion returned.
* @param maxSuggestions the maxSuggestions to set
*/
@@ -1344,4 +1311,27 @@ public class EntityLinkerConfig {
return __selectedFields;
}
}
+ /**
+ * If suggested entities that would have the same score (e.g. 1.0 - for a
+ * perfect match) should have their score slightly adapted so that they
+ * are sorted based on their entity ranking.<p>
+ * The entity ranking is defined as the importance (popularity,
connectivity, ...)
+ * of an entity within the knowledge base
+ * @return the state
+ */
+ public boolean isRankEqualScoresBasedOnEntityRankings() {
+ return rankEqualScoresBasedOnEntityRankings;
+ }
+ /**
+ * Setter for the state if suggested that would have the same score (e.g.
1.0 - for a
+ * perfect match) should have their score slightly adapted so that they
+ * are sorted based on their entity ranking.<p>
+ * The entity ranking is defined as the importance (popularity,
connectivity, ...)
+ * of an entity within the knowledge base
+ * @param state the state
+ */
+ public void setRankEqualScoresBasedOnEntityRankings(boolean state) {
+ this.rankEqualScoresBasedOnEntityRankings = state;
+ }
+
}
\ No newline at end of file
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
Mon Apr 15 05:31:34 2013
@@ -160,6 +160,7 @@ public class LanguageProcessingConfig im
* linked.
*/
private boolean linkMultiMatchableTokensInChunkState =
DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
+ private int minSearchTokenLength;
/**
@@ -505,6 +506,28 @@ public class LanguageProcessingConfig im
}
}
/**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup}
concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @param minSearchTokenLength the minSearchTokenLength to set
+ */
+ public void setMinSearchTokenLength(int minSearchTokenLength) {
+ this.minSearchTokenLength = minSearchTokenLength;
+ }
+ /**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup}
concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @return the minSearchTokenLength
+ */
+ public int getMinSearchTokenLength() {
+ return minSearchTokenLength;
+ }
+
+
+ /**
* Clones the {@link LanguageProcessingConfig}. Intended to be used
* to create language specific configs based on the default one.
*/
@@ -525,6 +548,7 @@ public class LanguageProcessingConfig im
c.matchUpperCaseTokensState = matchUpperCaseTokensState;
c.linkMultiMatchableTokensInChunkState =
linkMultiMatchableTokensInChunkState;
c.matchedLexicalCategories = matchedLexicalCategories;
+ c.minSearchTokenLength = minSearchTokenLength;
return c;
}
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
Mon Apr 15 05:31:34 2013
@@ -26,8 +26,11 @@ import java.util.Map;
import java.util.Set;
import
org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.osgi.service.cm.ConfigurationException;
import org.slf4j.Logger;
@@ -61,6 +64,24 @@ public class TextProcessingConfig {
* See the documentation of {@link LanguageConfiguration} for details of
the Syntax.
*/
public static final String PROCESSED_LANGUAGES =
"enhancer.engines.linking.processedLanguages";
+
+ /**
+ * The minimum length of Token to be used for searches in case no
+ * POS (Part of Speech) tags are available.
+ */
+ public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3;
+ /**
+ * Used as fallback in case a {@link Token} does not have a {@link PosTag}
or
+ * {@link NlpAnnotations#POS_ANNOTATION POS annotations} do have a low
confidence.
+ * In such cases only words that are longer than this value will be
considerd for
+ * linking
+ */
+ public static final String MIN_SEARCH_TOKEN_LENGTH =
"enhancer.engines.linking.minSearchTokenLength";
+ /**
+ * The minimum length of labels that are looked-up in the directory
+ */
+ private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
+
/*
* Parameters used for language specific text processing configurations
*/
@@ -210,6 +231,27 @@ public class TextProcessingConfig {
log.debug("> Noun matching activated (matched LexicalCategories:
{})",
tpc.defaultConfig.getLinkedLexicalCategories());
}
+ // init MIN_SEARCH_TOKEN_LENGTH
+ value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
+ Integer minSearchTokenLength;
+ if(value instanceof Integer){
+ minSearchTokenLength = (Integer)value;
+ } else if (value != null){
+ try {
+ minSearchTokenLength = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH,
"Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ minSearchTokenLength = null;
+ }
+ if(minSearchTokenLength != null){
+ if(minSearchTokenLength < 1){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH,
"Values MUST be valid Integer values > 0");
+ }
+ tpc.defaultConfig.setMinSearchTokenLength(minSearchTokenLength);
+ }
+
//parse the language configuration
value = configuration.get(PROCESSED_LANGUAGES);
if(value instanceof String){
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
Mon Apr 15 05:31:34 2013
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+import static
org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.ENTITY_RANK_COMPARATOR;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
@@ -81,7 +82,7 @@ public class EntityLinker {
this.linkerConfig = linkerConfig;
this.textProcessingConfig = textProcessingConfig;
this.labelTokenizer = labelTokenizer;
- this.state = new
ProcessingState(analysedText,language,textProcessingConfig,linkerConfig);
+ this.state = new
ProcessingState(analysedText,language,textProcessingConfig);
this.lookupLimit = Math.max(10,linkerConfig.getMaxSuggestions()*2);
}
/**
@@ -101,7 +102,12 @@ public class EntityLinker {
"none"});
}
List<String> searchStrings = new
ArrayList<String>(linkerConfig.getMaxSearchTokens());
- searchStrings.add(token.getTokenText());
+ String searchString = linkerConfig.isLemmaMatching() ?
token.getTokenLemma() :
+ token.getTokenText();
+ if(searchString == null){
+ searchString = token.getTokenText();
+ }
+ searchStrings.add(searchString);
//Determine the range we are allowed to search for tokens
final int minIncludeIndex;
final int maxIndcludeIndex;
@@ -113,8 +119,8 @@ public class EntityLinker {
restrirctContextByChunks){
minIncludeIndex = Math.max(
state.getConsumedIndex()+1,
- token.inChunk.startToken);
- maxIndcludeIndex = token.inChunk.endToken;
+ token.inChunk.getStartTokenIndex());
+ maxIndcludeIndex = token.inChunk.getEndTokenIndex();
} else {
maxIndcludeIndex = state.getTokens().size() - 1;
minIncludeIndex = state.getConsumedIndex() + 1;
@@ -136,7 +142,12 @@ public class EntityLinker {
});
}
if(prevToken.isMatchable){
- searchStrings.add(0,prevToken.getTokenText());
+ String prevSearchString =
linkerConfig.isLemmaMatching() ?
+ prevToken.getTokenLemma() :
prevToken.getTokenText();
+ if(prevSearchString == null){
+ prevSearchString = prevToken.getTokenText();
+ }
+ searchStrings.add(0,prevSearchString);
}
}
if(maxIndcludeIndex >= pastIndex){
@@ -150,7 +161,12 @@ public class EntityLinker {
});
}
if(pastToken.isMatchable){
- searchStrings.add(pastToken.getTokenText());
+ String pastSearchString =
linkerConfig.isLemmaMatching() ?
+ pastToken.getTokenLemma() :
pastToken.getTokenText();
+ if(pastSearchString == null){
+ pastSearchString = pastToken.getTokenText();
+ }
+ searchStrings.add(pastSearchString);
}
}
} while(searchStrings.size() < linkerConfig.getMaxSearchTokens()
&& distance <
@@ -202,6 +218,29 @@ public class EntityLinker {
log.warn(" currnet ranking : {}",suggestions);
log.warn(" ... this will result in worng confidence
values relative to the best match");
}
+ //adapt equals rankings based on the entity rank
+ if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
+ List<Suggestion> equalScoreList = new
ArrayList<Suggestion>(4);
+ double score = 2f;
+ for(Suggestion s : suggestions){
+ double actScore = s.getScore();
+ if(score == actScore){
+ equalScoreList.add(s);
+ } else {
+ if(equalScoreList.size() > 1){
+ adaptScoresForEntityRankings(equalScoreList,
actScore);
+ }
+ score = actScore;
+ equalScoreList.clear();
+ equalScoreList.add(s);
+ }
+ }
+ if(equalScoreList.size() > 1){
+ adaptScoresForEntityRankings(equalScoreList,0);
+ }
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ }
//remove all suggestions > config.maxSuggestions
if(suggestions.size() > linkerConfig.getMaxSuggestions()){
suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
@@ -242,6 +281,41 @@ public class EntityLinker {
}
}
/**
+ * This method slightly adapts scores of Suggestions based on the Entity
ranking.
+ * It is used for Suggestions that would have the exact same score (e.g.
1.0) to
+ * ensure ordering of the suggestions based on the rankings of the Entities
+ * within the knowledge base linked against
+ * @param equalScoreList Entities with the same {@link
Suggestion#getScore()}
+ * values. If this is not the case this method will change scores in
unintended
+ * ways
+ * @param nextScore the score of the {@link Suggestion} with a lower score
as the
+ * list of suggestions parsed in the first parameter
+ */
+ private void adaptScoresForEntityRankings(List<Suggestion> equalScoreList,
double nextScore) {
+ double score = equalScoreList.get(0).getScore();
+ log.debug(" > Adapt Score of multiple Suggestions "
+ + "with '{}' based on EntityRanking",score);
+ //Adapt the score to reflect the entity ranking
+ //but do not change order with entities of different
+ //score. Also do not change the score more that 0.1
+ //TODO: make the max change (0.1) configurable
+ double dif = (Math.min(0.1, score-nextScore))/equalScoreList.size();
+ Collections.sort(equalScoreList,ENTITY_RANK_COMPARATOR);
+ log.debug(" - keep socre of {} at {}",
equalScoreList.get(0).getEntity().getId(), score);
+ for(int i=1;i<equalScoreList.size();i++){
+ score = score-dif;
+ if(ENTITY_RANK_COMPARATOR.compare(equalScoreList.get(i-1),
+ equalScoreList.get(i)) != 0){
+ equalScoreList.get(i).setScore(score);
+ log.debug(" - set score of {} at {}",
equalScoreList.get(i).getEntity().getId(), score);
+ } else {
+ double lastScore = equalScoreList.get(i-1).getScore();
+ equalScoreList.get(i).setScore(lastScore);
+ log.debug(" - set score of {} at {}",
equalScoreList.get(i).getEntity().getId(), lastScore);
+ }
+ }
+ }
+ /**
* After {@link #process()}ing this returns the entities linked for the
* parsed {@link AnalysedContent}.
* @return the linked entities
@@ -361,12 +435,18 @@ public class EntityLinker {
log.debug(" - found {} entities ...",results.size());
List<Suggestion> suggestions = new ArrayList<Suggestion>();
for(Entity result : results){
- log.debug(" > {}",result.getId());
+ if(log.isDebugEnabled()){
+ log.debug(" > {} (ranking:
{})",result.getId(),result.getEntityRanking());
+ }
Suggestion suggestion = matchLabels(result);
- log.debug(" < {}",suggestion);
if(suggestion.getMatch() != MATCH.NONE){
+ if(log.isDebugEnabled()){
+ log.debug(" + {}",suggestion);
+ }
suggestions.add(suggestion);
- }
+ } else {
+ log.debug(" - no match");
+ }
}
//sort the suggestions
if(suggestions.size()>1){
@@ -517,7 +597,11 @@ public class EntityLinker {
&& search ;currentIndex++){
currentToken = state.getTokens().get(currentIndex);
if(currentToken.hasAlphaNumeric){
- currentTokenText = currentToken.getTokenText();
+ currentTokenText = linkerConfig.isLemmaMatching() ?
+ currentToken.getTokenLemma() :
currentToken.getTokenText();
+ if(currentTokenText == null) { //no lemma available
+ currentTokenText = currentToken.getTokenText(); //fallback
to text
+ }
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
@@ -596,7 +680,11 @@ public class EntityLinker {
String labelTokenText = labelTokens[labelIndex];
if(labelTokenSet.contains(labelTokenText)){ //still not matched
currentToken = state.getTokens().get(currentIndex);
- currentTokenText = currentToken.getTokenText();
+ currentTokenText = linkerConfig.isLemmaMatching() ?
+ currentToken.getTokenLemma() :
currentToken.getTokenText();
+ if(currentTokenText == null) { //no lemma available
+ currentTokenText = currentToken.getTokenText(); //fallback
to text
+ }
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Mon Apr 15 05:31:34 2013
@@ -86,23 +86,23 @@ public class ProcessingState {
* The language of the text
*/
private String language;
-
+
protected final LanguageProcessingConfig tpc;
- protected final EntityLinkerConfig elc;
+ //protected final EntityLinkerConfig elc;
private AnalysedText at;
private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new
Predicate() {
@Override
public boolean evaluate(Object object) {
- return ((TokenData)object).isProcessable;
+ return ((TokenData)object).isLinkable;
}
};
public static final Collection<Pos> SUB_SENTENCE_START_POS = EnumSet.of(
Pos.Quote);
- public ProcessingState(AnalysedText at, String language,
LanguageProcessingConfig tpc, EntityLinkerConfig elc){
+ public ProcessingState(AnalysedText at, String language,
LanguageProcessingConfig tpc){
if(at == null){
throw new IllegalArgumentException("The parsed AnalysedText MUST
NOT be NULL!");
}
@@ -112,11 +112,7 @@ public class ProcessingState {
if(tpc == null){
throw new IllegalArgumentException("The parsed
TextProcessingConfig MUST NOT be NULL!");
}
- if(elc == null){
- throw new IllegalArgumentException("The parsed EntityLinkerConfig
MUST NOT be NULL!");
- }
this.tpc = tpc;
- this.elc = elc;
enclosedSpanTypes = EnumSet.of(SpanTypeEnum.Token);
if(!tpc.isIgnoreChunks()){
@@ -270,16 +266,16 @@ public class ProcessingState {
tokenData.morpho != null ?
tokenData.morpho : "none"});
}
//determine if the token should be linked/matched
- tokenData.isProcessable = tokenData.isLinkablePos;
- tokenData.isMatchable = tokenData.isProcessable ||
tokenData.isMatchablePos;
+ tokenData.isLinkable = tokenData.isLinkablePos;
+ tokenData.isMatchable = tokenData.isLinkable ||
tokenData.isMatchablePos;
//for non-linkable but upper case tokens we need to check
//the upper case token configuration
- if(!tokenData.isProcessable && tokenData.upperCase){
+ if(!tokenData.isLinkable && tokenData.upperCase){
if(tokenData.index > 0 && //not a sentence or
sub-sentence start
!tokens.get(tokenData.index-1).isSubSentenceStart){
if(tpc.isLinkUpperCaseTokens() && //if upper case
tokens should be linked
tokenData.isMatchable) { //convert
matchable to
- tokenData.isProcessable = true; //linkable
+ tokenData.isLinkable = true; //linkable
} else if(tpc.isMatchUpperCaseTokens() ||
tpc.isLinkUpperCaseTokens()){
//if matching for upperCase Tokens is
activated or
//linking is activated, but the current Token
is not
@@ -291,7 +287,7 @@ public class ProcessingState {
//add the token to the list
tokens.add(tokenData);
if(!foundProcessable){
- foundProcessable = tokenData.isProcessable;
+ foundProcessable = tokenData.isLinkable;
}
if(activeChunk != null){
if(tokenData.isMatchable){
@@ -302,7 +298,7 @@ public class ProcessingState {
activeChunk.endToken = tokens.size()-1;
log.debug(" - end Chunk@pos: {}",
activeChunk.endToken);
if(tpc.isLinkMultiMatchableTokensInChunk() &&
- activeChunk.matchableCount > 1 ){
+ activeChunk.getMatchableCount() > 1 ){
log.debug(" - multi-matchable Chunk:");
//mark the last of two immediate following
matchable
//tokens as processable
@@ -310,10 +306,10 @@ public class ProcessingState {
TokenData ct = tokens.get(i);
TokenData pt = tokens.get(i-1);
if(ct.isMatchable && pt.isMatchable){
- if(!ct.isProcessable) { //if not
already processable
+ if(!ct.isLinkable) { //if not already
processable
log.debug(" > convert Token
{}: {} (pos:{}) from matchable to processable",
new
Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
- ct.isProcessable = true;
+ ct.isLinkable = true;
if(!foundProcessable){
foundProcessable = true;
}
@@ -479,39 +475,39 @@ public class ProcessingState {
* the {@link Token} to check.
* @return <code>true</code> if the parsed token needs to be processed.
Otherwise <code>false</code>
*/
- class TokenData {
+ public class TokenData {
/** The Token */
- final Token token;
+ public final Token token;
/** The index of the Token within the current Section (Sentence) */
- final int index;
+ public final int index;
/** If this Token should be linked with the Vocabulary */
- boolean isProcessable;
+ public boolean isLinkable;
/** If this Token should be used for multi word searches in the
Vocabulary */
- boolean isMatchable;
+ public boolean isMatchable;
/** if this Token has an alpha or numeric char */
- final boolean hasAlphaNumeric;
+ public final boolean hasAlphaNumeric;
/** the chunk of this Token */
- final ChunkData inChunk;
+ public final ChunkData inChunk;
/** the morphological features of the Token (selected based on the POS
Tag) */
- final MorphoFeatures morpho;
+ public final MorphoFeatures morpho;
/**
* if this token starts with an upperCase letter
*/
- final boolean upperCase;
+ public final boolean upperCase;
/**
* If the POS type of this word matches a linkable category
*/
- final boolean isLinkablePos;
+ public final boolean isLinkablePos;
/**
* if the POS type of this word matches a matchable category
*/
- final boolean isMatchablePos;
+ public final boolean isMatchablePos;
/**
* if this Token represents the start of an sub-sentence such as an
* starting ending quote
* @see ProcessingState#SUB_SENTENCE_START_POS
*/
- final boolean isSubSentenceStart;
+ public final boolean isSubSentenceStart;
/**
* Constructs and initializes meta data needed for linking based
* on the current tokens (and its NLP annotation)
@@ -560,7 +556,7 @@ public class ProcessingState {
}
if(!matchedPosTag) { //not matched against a POS Tag ...
// ... fall back to the token length
- this.isLinkablePos = token.getSpan().length() >=
elc.getMinSearchTokenLength();
+ this.isLinkablePos = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
} else {
this.isLinkablePos = isLinkablePos;
}
@@ -594,7 +590,7 @@ public class ProcessingState {
}
if(!matchedPosTag){ //not matched against POS tag ...
//fall back to the token length
- this.isMatchablePos = token.getSpan().length() >=
elc.getMinSearchTokenLength();
+ this.isMatchablePos = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
} else {
this.isMatchablePos = isMatchablePos;
}
@@ -639,21 +635,18 @@ public class ProcessingState {
}
/**
- * Getter for the text as used for searching/matching
- * Entities in the linked vocabulary. If
- * {@link EntityLinkerConfig#isLemmaMatching()} is
- * enabled this will return the
- * {@link MorphoFeatures#getLemma()} (if available).
- * Otherwise the {@link Token#getSpan()} is returned
- * @return the text of the token as to be used for
- * matching. Guaranteed to be NOT NULL.
+ * Getter for the token text. This is always the
+ * {@link Token#getSpan()} of the wrapped {@link #token};
+ * no lemma handling is applied here. The lemma is
+ * available separately via {@link #getTokenLemma()}.
+ * @return the text of the token as returned by
+ * {@link Token#getSpan()}
*/
public String getTokenText(){
- if(elc.isLemmaMatching() && morpho != null){
- return morpho.getLemma();
- } else {
- return token.getSpan();
- }
+ return token.getSpan();
+ }
+ /**
+ * Getter for the Lemma of the token. The lemma is read from the
+ * {@link MorphoFeatures} stored in {@link #morpho}.
+ * @return the Lemma of the Token or <code>null</code> if not available,
+ * in particular when no {@link MorphoFeatures} are set for this token
+ */
+ public String getTokenLemma(){
+ return morpho != null ? morpho.getLemma() : null;
}
}
@@ -670,12 +663,12 @@ public class ProcessingState {
* and {@link ChunkData#getEndChar()} are the absolute [start,end)
character
* indices within the {@link AnalysedText#getSpan()}
*/
- class ChunkData {
+ public class ChunkData {
protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
/** if the Chunk is processable */
- final boolean isProcessable;
+ public final boolean isProcessable;
/** the Chunk */
- final Chunk chunk;
+ public final Chunk chunk;
/**
* In case multiple overlapping and processable {@link Chunk}s the
* section selected by the chunks are merged. While {@link #chunk}
@@ -684,15 +677,11 @@ public class ProcessingState {
* merged) are not available via this class, but can be retrieved
* by iterating over the {@link AnalysedText} content part.
*/
- Chunk merged;
+ private Chunk merged;
/** the start token index relative to the current section (sentence) */
- int startToken;
+ private int startToken;
/** the end token index relative to the current section (sentence) */
- int endToken;
- /**
- * The number of processable Tokens enclosed by this Chunk
- */
- int processableCount;
+ private int endToken;
/**
* The number of matchable Tokens enclosed by this Chunk
*/
@@ -734,9 +723,26 @@ public class ProcessingState {
public int getEndChar(){
return merged == null ? chunk.getEnd() : merged.getEnd();
}
+ /**
+ * If this chunk is processable
+ * @return the state, as stored in the final {@link #isProcessable} field
+ */
public boolean isProcessable() {
return isProcessable;
}
+ /**
+ * Getter for the number of matchable tokens contained in this chunk.
+ * {@link ProcessingState} compares this value against <code>1</code>
+ * (together with the link-multi-matchable-tokens-in-chunk setting) before
+ * converting matchable tokens of the chunk to linkable ones.
+ * @return The number of matchable tokens contained in this chunk
+ */
+ public int getMatchableCount() {
+ return matchableCount;
+ }
+ public int getStartTokenIndex() { // getter for the start token index of this chunk, relative to the current section (sentence)
+ return startToken;
+ }
+ public int getEndTokenIndex() { // getter for the end token index of this chunk, relative to the current section (sentence); presumably inclusive - TODO confirm
+ return endToken;
+ }
}
}
\ No newline at end of file
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java
Mon Apr 15 05:31:34 2013
@@ -270,7 +270,8 @@ public class Suggestion {
public String toString() {
// Renders the first label match, followed by the entity id, an optional
// note about the entity this suggestion redirects to and the entity rank.
// NOTE(review): the conditional operator binds weaker than string
// concatenation, so for an empty labelMatches list only the text
// no match is returned (without id, redirect or ranking) - confirm
// that this is intended.
return labelMatches.isEmpty() ? "no match" :labelMatches.get(0)
+ " for "+entity.getId()
- +(redirectsTo != null ? " redirected to "+redirectsTo.getId()
: "");
+ + (redirectsTo != null ? " (redirects:
"+redirectsTo.getId()+") " : "")
+ + " ranking: "+getEntityRank();
}
/**
@@ -290,6 +291,18 @@ public class Suggestion {
}
};
/**
+ * Compares {@link Suggestion}s based on the {@link Suggestion#getEntityRank()}.
+ * The resulting order is descending: the {@link Suggestion} with the
+ * higher rank sorts first. <code>null</code> values are assumed to be the
+ * smallest, so a {@link Suggestion} without a rank sorts after any ranked
+ * one; two <code>null</code> ranks compare as equal.
+ */
+ public static final Comparator<Suggestion> ENTITY_RANK_COMPARATOR = new
Comparator<Suggestion>(){
+ @Override
+ public int compare(Suggestion arg0, Suggestion arg1) {
+ Float r1 = arg0.getEntityRank();
+ Float r2 = arg1.getEntityRank();
+ return r2 == null ? r1 == null ? 0 : -1 : r1 == null ? 1 :
r2.compareTo(r1);
+ }
+ };
+ /**
* Compares {@link Suggestion} first based on the {@link
Suggestion#getMatch()} value
* and secondly based on the {@link RdfResourceEnum#entityRank}.
*/
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
Mon Apr 15 05:31:34 2013
@@ -293,8 +293,8 @@ public class EntityLinkingEngineTest {
linkedEntity.getSuggestions().iterator();
assertEquals("Number of suggestions
"+linkedEntity.getSuggestions().size()+
" != number of expected suggestions
"+expectedSuggestions.size()+
- "for selection "+linkedEntity.getSelectedText(),
- linkedEntity.getSuggestions().size(),
+ "for selection "+linkedEntity.getSelectedText() + "(Expected:
" +
+ expectedSuggestions +")",
linkedEntity.getSuggestions().size(),
expectedSuggestions.size());
double score = linkedEntity.getScore();
for(int i=0;i<expectedSuggestions.size();i++){
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/resources/log4j.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/resources/log4j.properties?rev=1467854&r1=1467853&r2=1467854&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/resources/log4j.properties
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/resources/log4j.properties
Mon Apr 15 05:31:34 2013
@@ -21,4 +21,4 @@ log4j.appender.stdout=org.apache.log4j.C
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
-log4j.logger.org.apache.stanbol.enhancer.engines.keywordextraction=DEBUG
\ No newline at end of file
+log4j.logger.org.apache.stanbol.enhancer.engines.entitylinking=DEBUG
\ No newline at end of file