Author: rwesten
Date: Wed Apr 24 10:52:58 2013
New Revision: 1471366
URL: http://svn.apache.org/r1471366
Log:
implementation of STANBOL-1049; fix for STANBOL-1051
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
Wed Apr 24 10:52:58 2013
@@ -161,6 +161,7 @@ public class LanguageProcessingConfig im
*/
private boolean linkMultiMatchableTokensInChunkState =
DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
private int minSearchTokenLength;
+ private boolean linkOnlyUpperCaseTokenWithUnknownPos;
/**
@@ -526,6 +527,23 @@ public class LanguageProcessingConfig im
return minSearchTokenLength;
}
+ /**
+ * This returns the state if only upper case tokens should be marked as
+ * 'linkable' if they do not have a POS tag
+ * @return the state
+ */
+ public boolean isLinkOnlyUpperCaseTokensWithUnknownPos(){
+ return linkOnlyUpperCaseTokenWithUnknownPos;
+ }
+
+ /**
+ * Sets the state if only upper case tokens should be marked as
+ * 'linkable' if they do not have a POS tag
+ * @param linkOnlyUpperCaseTokenWithUnknownPos the state
+ */
+ public void setLinkOnlyUpperCaseTokenWithUnknownPos(boolean
linkOnlyUpperCaseTokenWithUnknownPos) {
+ this.linkOnlyUpperCaseTokenWithUnknownPos =
linkOnlyUpperCaseTokenWithUnknownPos;
+ }
/**
* Clones the {@link LanguageProcessingConfig}. Intended to be used
@@ -549,6 +567,7 @@ public class LanguageProcessingConfig im
c.linkMultiMatchableTokensInChunkState =
linkMultiMatchableTokensInChunkState;
c.matchedLexicalCategories = matchedLexicalCategories;
c.minSearchTokenLength = minSearchTokenLength;
+ c.linkOnlyUpperCaseTokenWithUnknownPos =
linkOnlyUpperCaseTokenWithUnknownPos;
return c;
}
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
Wed Apr 24 10:52:58 2013
@@ -17,6 +17,7 @@
package org.apache.stanbol.enhancer.engines.entitylinking.config;
import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumSet;
@@ -39,6 +40,19 @@ import org.slf4j.LoggerFactory;
public class TextProcessingConfig {
private static final Logger log =
LoggerFactory.getLogger(TextProcessingConfig.class);
+
+ /**
+ * Holds the set of two letter ISO language codes of languages that use
+ * unicase scripts - scripts that do not know upper case letters.<p>
+ * More information is available on the Wikipedia page for
+ * <a href="http://en.wikipedia.org/wiki/Letter_case">Letter case</a>.
+ */
+ public static final Set<String> UNICASE_SCRIPT_LANUAGES;
+ static {
+ UNICASE_SCRIPT_LANUAGES = Collections.unmodifiableSet(new
HashSet<String>(Arrays.asList(
+ "ar","he","zh","ja","ko","ka","hi","ne")));
+ }
+
/**
* If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link
Pos#Acronym} are Matched. If
* deactivated all Tokens with the category {@link LexicalCategory#Noun}
and
@@ -55,6 +69,17 @@ public class TextProcessingConfig {
* Default for the {@link #PROCESS_ONLY_PROPER_NOUNS_STATE} (false)
*/
public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE =
false;
+
+ /**
+ * Switch that allows to enable a mode where only upper case tokens are
marked as
+ * 'linkable' if no POS tag is available (or existing POS tags are of low
probability).<p>
+ * This is especially useful for processing text in languages where no
POS tagger is
+ * available.<p>
+ * NOTE: this configuration is ignored for languages where there are
no
+ * upper case letters (Arabic, Hebrew, Chinese, Japanese, Korean, Georgian, Hindi, Nepali)
+ */
+ public static final String
LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG =
"enhancer.engines.linking.linkOnlyUpperCaseTokensWithMissingPosTag";
+
/**
* Allows to configure the processed languages by using the syntax
supported by {@link LanguageConfiguration}.
* In addition this engine supports language specific configurations for
matched {@link LexicalCategory}
@@ -209,8 +234,7 @@ public class TextProcessingConfig {
*/
public final static TextProcessingConfig
createInstance(Dictionary<String,Object> configuration) throws
ConfigurationException {
TextProcessingConfig tpc = new TextProcessingConfig();
- //Parse the default text processing configuration
- //set the default LexicalTypes
+ //Parse the Proper Noun Linking state
Object value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE);
boolean properNounState;
if(value instanceof Boolean){
@@ -231,6 +255,17 @@ public class TextProcessingConfig {
log.debug("> Noun matching activated (matched LexicalCategories:
{})",
tpc.defaultConfig.getLinkedLexicalCategories());
}
+ //parse upper case linking for languages without POS support state
+ //see STANBOL-1049
+ value =
configuration.get(LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG);
+ final Boolean linkOnlyUpperCaseTokensWithMissingPosTag;
+ if(value instanceof Boolean){
+
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(((Boolean)value).booleanValue());
+ } else if(value != null){
+
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(Boolean.parseBoolean(value.toString()));
+ } else { //the default is the same as the properNounState
+
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(properNounState);
+ }
// init MIN_SEARCH_TOKEN_LENGTH
value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
Integer minSearchTokenLength;
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Wed Apr 24 10:52:58 2013
@@ -20,6 +20,7 @@
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static java.util.Collections.disjoint;
+import static
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
@@ -29,11 +30,13 @@ import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
@@ -91,6 +94,11 @@ public class ProcessingState {
//protected final EntityLinkerConfig elc;
private AnalysedText at;
+ /**
+ * If the language uses a unicase script and therefore upper case specific
+ * processing rules can not be used (see STANBOL-1049)
+ */
+ private boolean isUnicaseLanguage;
private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new
Predicate() {
@Override
@@ -120,6 +128,10 @@ public class ProcessingState {
}
this.at = at; //store as field (just used for logging)
this.language = language;
+ //STANBOL-1049: we need now to know if a language uses a unicase script
+ //ensure lower case and only use the language part
+ String lookupLang = language.toLowerCase(Locale.ROOT).split("[_-]")[0];
+ this.isUnicaseLanguage = UNICASE_SCRIPT_LANUAGES.contains(lookupLang);
//prefer to iterate over sentences
Iterator<Sentence> sentences = at.getSentences();
this.sections = sentences.hasNext() ? sentences :
Collections.singleton(at).iterator();
@@ -222,8 +234,8 @@ public class ProcessingState {
section = null;
processableTokensIterator = null;
consumedIndex = -1;
- boolean foundProcessable = false;
- while(!foundProcessable && sections.hasNext()){
+ boolean foundLinkableToken = false;
+ while(!foundLinkableToken && sections.hasNext()){
section = sections.next();
tokens.clear(); //clear token for each section (STANBOL-818)
Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
@@ -265,29 +277,63 @@ public class ProcessingState {
tokenData.inChunk != null ?
tokenData.inChunk.chunk.getSpan() : "none",
tokenData.morpho != null ?
tokenData.morpho : "none"});
}
- //determine if the token should be linked/matched
- tokenData.isLinkable = tokenData.isLinkablePos;
- tokenData.isMatchable = tokenData.isLinkable ||
tokenData.isMatchablePos;
- //for non processable but upper case tolkens we need to
check
- //the uper case token configuration
- if(!tokenData.isLinkable && tokenData.upperCase){
- if(tokenData.index > 0 && //not a sentence or
sub-sentence start
+ if(!tokenData.hasAlphaNumeric){
+ tokenData.isLinkable = false;
+ tokenData.isMatchable = false;
+ } else {
+ // (1) apply basic rules for linkable/processable
tokens
+ //determine if the token should be linked/matched
+ tokenData.isLinkable = tokenData.isLinkablePos != null
? tokenData.isLinkablePos : false;
+ //matchable := linkable OR has matchablePos
+ tokenData.isMatchable = tokenData.isLinkable ||
+ (tokenData.isMatchablePos != null &&
tokenData.isMatchablePos);
+
+ //(2) for non linkable tokens check for upper case
rules
+ if(!tokenData.isLinkable && tokenData.upperCase &&
+ tokenData.index > 0 && //not a sentence or
sub-sentence start
!tokens.get(tokenData.index-1).isSubSentenceStart){
- if(tpc.isLinkUpperCaseTokens() && //if upper case
tokens should be linked
- tokenData.isMatchable) { //convert
matchable to
- tokenData.isLinkable = true; //linkable
- } else if(tpc.isMatchUpperCaseTokens() ||
tpc.isLinkUpperCaseTokens()){
- //if matching for upperCase Tokens is
activated or
- //linking is activated, but the current Token
is not
- //matchable, than mark the Token as matchable
- tokenData.isMatchable = true;
- } //else upper case matching and linking is
deactivated
- }
+ //We have an upper case token!
+ if(tpc.isLinkUpperCaseTokens()){
+ if(tokenData.isMatchable) { //convert
matchable to
+ tokenData.isLinkable = true; //linkable
+ } else { // and other tokens to
+ tokenData.isMatchable = true; //matchable
+ }
+ } else {
+ //finally we need to convert other Tokens to
matchable
+ //if MatchUpperCaseTokens is active
+ if(!tokenData.isMatchable &&
tpc.isMatchUpperCaseTokens()){
+ tokenData.isMatchable = true;
+ }
+ }
+ } //else not an upper case token
+
+ //(3) Unknown POS tag Rules (see STANBOL-1049)
+ if(!tokenData.isLinkable && tokenData.isLinkablePos ==
null &&
+ tokenData.isLinkablePos == null){
+ if(isUnicaseLanguage ||
!tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
+ if(tokenData.hasSearchableLength){
+ tokenData.isLinkable = true;
+ } //else no need to change the state
+ } else { //non unicase language and link only
upper case tokens enabled
+ if(tokenData.upperCase && // upper case token
+ tokenData.index > 0 && //not a
sentence or sub-sentence start
+
!tokens.get(tokenData.index-1).isSubSentenceStart){
+ if(tokenData.hasSearchableLength){
+ tokenData.isLinkable = true;
+ } else {
+ tokenData.isMatchable = true;
+ }
+ } else if(tokenData.hasSearchableLength){
//lower case and long token
+ tokenData.isMatchable = true;
+ } //else lower case and short word
+ }
+ } //else already linkable or POS tag present
}
//add the token to the list
tokens.add(tokenData);
- if(!foundProcessable){
- foundProcessable = tokenData.isLinkable;
+ if(!foundLinkableToken){
+ foundLinkableToken = tokenData.isLinkable;
}
if(activeChunk != null){
if(tokenData.isMatchable){
@@ -310,8 +356,8 @@ public class ProcessingState {
log.debug(" > convert Token
{}: {} (pos:{}) from matchable to processable",
new
Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
ct.isLinkable = true;
- if(!foundProcessable){
- foundProcessable = true;
+ if(!foundLinkableToken){
+ foundLinkableToken = true;
}
}
i--;//mark both (ct & pt) as processed
@@ -328,7 +374,7 @@ public class ProcessingState {
}
}
processableTokensIterator = new FilterIterator(tokens.iterator(),
PROCESSABLE_TOKEN_OREDICATE);
- return foundProcessable;
+ return foundLinkableToken;
}
/**
* Getter for the text covered by the next tokenCount tokens relative to
@@ -495,13 +541,17 @@ public class ProcessingState {
*/
public final boolean upperCase;
/**
+ * if the length of the token is >= {@link
LanguageProcessingConfig#getMinSearchTokenLength()}
+ */
+ public boolean hasSearchableLength;
+ /**
* If the POS type of this word matches a linkable category
*/
- public final boolean isLinkablePos;
+ public final Boolean isLinkablePos;
/**
* if the POS type of this word matches a matchable category
*/
- public final boolean isMatchablePos;
+ public final Boolean isMatchablePos;
/**
* if this Token represents the start of an sub-sentence such as an
* starting ending quote
@@ -521,7 +571,7 @@ public class ProcessingState {
this.index = index;
this.inChunk = chunk;
this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
-
+ this.hasSearchableLength = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
PosTag selectedPosTag = null;
boolean matchedPosTag = false; //matched any of the POS annotations
@@ -541,13 +591,16 @@ public class ProcessingState {
if((!disjoint(tpc.getLinkedLexicalCategories(),
posTag.getCategories())) ||
(!disjoint(tpc.getLinkedPos(),
posTag.getPosHierarchy())) ||
tpc.getLinkedPosTags().contains(posTag.getTag())){
- if(posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
+ if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
selectedPosTag = posTag;
isLinkablePos = true;
isMatchablePos = true;
+ matchedPosTag = true;
break;
} // else probability to low for inclusion
- } else if(posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
+ } else if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
selectedPosTag = posTag; //also rejected PosTags are
selected
matchedPosTag = true;
isLinkablePos = false;
@@ -555,14 +608,13 @@ public class ProcessingState {
} // else probability to low for exclusion
}
if(!matchedPosTag) { //not matched against a POS Tag ...
- // ... fall back to the token length
- this.isLinkablePos = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
+ this.isLinkablePos = null;
} else {
this.isLinkablePos = isLinkablePos;
}
//(2) check if this token should be considered to match labels of
suggestions
- if(this.isLinkablePos){ //processable tokens are also matchable
+ if(this.isLinkablePos != null && this.isLinkablePos){
//processable tokens are also matchable
this.isMatchablePos = true;
} else { //check POS and length to see if token is matchable
matchedPosTag = false; //reset to false!
@@ -571,14 +623,16 @@ public class ProcessingState {
if(posTag.isMapped()){
if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
posTag.getCategories())){
- if(posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
+ if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
//override selectedPosTag if present
selectedPosTag = posTag; //mark the matchable
as selected PosTag
isMatchablePos = true;
matchedPosTag = true;
break;
} // else probability to low for inclusion
- } else if(posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
+ } else if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
if(selectedPosTag == null){ //do not override
existing values
selectedPosTag = posTag; //also rejected
PosTags are selected
}
@@ -590,7 +644,8 @@ public class ProcessingState {
}
if(!matchedPosTag){ //not matched against POS tag ...
//fall back to the token length
- this.isMatchablePos = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
+ this.isMatchablePos = null;
+ //this.isMatchablePos = token.getSpan().length() >=
tpc.getMinSearchTokenLength();
} else {
this.isMatchablePos = isMatchablePos;
}
@@ -599,10 +654,12 @@ public class ProcessingState {
for(Value<PosTag> posAnnotation : posAnnotations){
PosTag posTag = posAnnotation.value();
if((!disjoint(SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
- if(posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
+ if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinPosAnnotationProbability()){
isSubSentenceStart = true;
} // else probability to low for inclusion
- } else if(posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
+ } else if(posAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >=
tpc.getMinExcludePosAnnotationProbability()){
isSubSentenceStart = false;
}
}
@@ -696,11 +753,13 @@ public class ProcessingState {
for (Value<PhraseTag> phraseAnnotation :
chunk.getAnnotations(PHRASE_ANNOTATION)) {
if
(tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
||
tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
- if (phraseAnnotation.probability() >=
tpc.getMinPhraseAnnotationProbability()) {
+ if (phraseAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ phraseAnnotation.probability() >=
tpc.getMinPhraseAnnotationProbability()) {
process = true;
break;
} // else probability to low for inclusion
- } else if (phraseAnnotation.probability() >=
tpc.getMinExcludePhraseAnnotationProbability()) {
+ } else if (phraseAnnotation.probability() ==
Value.UNKNOWN_PROBABILITY ||
+ phraseAnnotation.probability() >=
tpc.getMinExcludePhraseAnnotationProbability()) {
process = false;
break;
} // else probability to low for exclusion
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
Wed Apr 24 10:52:58 2013
@@ -98,7 +98,14 @@ public class EntityLinkingEngineTest {
public static final String TEST_TEXT = "Dr. Patrick Marshall (1869 -
November 1950) was a"
+ " geologist who lived in New Zealand and worked at the University of
Otago.";
+ /**
+ * changed order of given and family name
+ */
+ public static final String TEST_TEXT_WO = "Dr. Marshall Patrick (1869 -
November 1950) was a"
+ + " geologist who lived in New Zealand and worked at the University of
Otago.";
+
private static AnalysedText TEST_ANALYSED_TEXT;
+ private static AnalysedText TEST_ANALYSED_TEXT_WO;
// public static final String TEST_TEXT2 = "A CBS televised debate between
Australia's " +
// "candidates for Prime Minister in the upcoming US election has
been rescheduled " +
@@ -108,6 +115,8 @@ public class EntityLinkingEngineTest {
private static final String TEST_REFERENCED_SITE_NAME = "dummRefSiteName";
+ private static Value<PhraseTag> NOUN_PHRASE = Value.value(new
PhraseTag("NP",LexicalCategory.Noun),1d);
+
static TestSearcherImpl searcher;
public static final UriRef NAME = new UriRef(NamespaceEnum.rdfs+"label");
@@ -166,49 +175,60 @@ public class EntityLinkingEngineTest {
graph.add(new TripleImpl(uri, TYPE,
OntologicalClasses.DBPEDIA_ORGANISATION));
searcher.addEntity(new Entity(uri, graph));
- Value<PhraseTag> nounPhrase = Value.value(new
PhraseTag("NP",LexicalCategory.Noun),1d);
TEST_ANALYSED_TEXT =
AnalysedTextFactory.getDefaultInstance().createAnalysedText(
- ciFactory.createBlob(new StringSource(TEST_TEXT)));
- TEST_ANALYSED_TEXT.addSentence(0, TEST_ANALYSED_TEXT.getEnd());
- //add some noun phrases
- TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick
Marshall".length()).addAnnotation(PHRASE_ANNOTATION, nounPhrase);
- TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("New Zealand"),
TEST_TEXT.indexOf("New Zealand")+"New Zealand".length())
- .addAnnotation(PHRASE_ANNOTATION, nounPhrase);
- TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("geologist"),
TEST_TEXT.indexOf("geologist")+"geologist".length())
- .addAnnotation(PHRASE_ANNOTATION, nounPhrase);
- TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("the University of
Otago"),
- TEST_TEXT.length()-1).addAnnotation(PHRASE_ANNOTATION, nounPhrase);
- //add some tokens
- TEST_ANALYSED_TEXT.addToken(0, 2).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.Abbreviation),1d));
- TEST_ANALYSED_TEXT.addToken(2, 3).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag(".",Pos.Point),1d));
+ ciFactory.createBlob(new StringSource(TEST_TEXT)));
+ TEST_ANALYSED_TEXT_WO =
AnalysedTextFactory.getDefaultInstance().createAnalysedText(
+ ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
+ initAnalyzedText(TEST_ANALYSED_TEXT);
+ TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick
Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ initAnalyzedText(TEST_ANALYSED_TEXT_WO);
+ TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall
Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+ TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ }
+
+ /**
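+ * Adds the sentence, the noun phrase chunks and the POS annotated tokens
+ * used by the tests to the parsed {@link AnalysedText}.
+ * NOTE(review): the sentence end is taken from TEST_ANALYSED_TEXT and not
+ * from the parsed text; this relies on both test texts having the same length.
+ * @param at the analysed text to initialise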
+ */
+ private static void initAnalyzedText(AnalysedText at) {
+ at.addSentence(0, TEST_ANALYSED_TEXT.getEnd());
+ at.addChunk(TEST_TEXT.indexOf("New Zealand"), TEST_TEXT.indexOf("New
Zealand")+"New Zealand".length())
+ .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+ at.addChunk(TEST_TEXT.indexOf("geologist"),
TEST_TEXT.indexOf("geologist")+"geologist".length())
+ .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+ at.addChunk(TEST_TEXT.indexOf("the University of Otago"),
+ TEST_TEXT.length()-1).addAnnotation(PHRASE_ANNOTATION,
NOUN_PHRASE);
+ //add some tokens
+ at.addToken(0, 2).addAnnotation(POS_ANNOTATION, Value.value(new
PosTag("NE",Pos.Abbreviation),1d));
+ at.addToken(2, 3).addAnnotation(POS_ANNOTATION, Value.value(new
PosTag(".",Pos.Point),1d));
int start = TEST_TEXT.indexOf("(1869 - November 1950)");
-
TEST_ANALYSED_TEXT.addToken(start,start+1).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("(",Pos.OpenBracket),1d));
-
TEST_ANALYSED_TEXT.addToken(start+1,start+5).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NUM",Pos.Numeral),1d));
-
TEST_ANALYSED_TEXT.addToken(start+6,start+7).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("-",Pos.Hyphen),1d));
-
TEST_ANALYSED_TEXT.addToken(start+8,start+16).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-
TEST_ANALYSED_TEXT.addToken(start+17,start+21).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NUM",Pos.Numeral),1d));
-
TEST_ANALYSED_TEXT.addToken(start+21,start+22).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag(")",Pos.CloseBracket),1d));
+ at.addToken(start,start+1).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("(",Pos.OpenBracket),1d));
+ at.addToken(start+1,start+5).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NUM",Pos.Numeral),1d));
+ at.addToken(start+6,start+7).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("-",Pos.Hyphen),1d));
+ at.addToken(start+8,start+16).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+ at.addToken(start+17,start+21).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NUM",Pos.Numeral),1d));
+ at.addToken(start+21,start+22).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag(")",Pos.CloseBracket),1d));
start = TEST_TEXT.indexOf("geologist");
-
TEST_ANALYSED_TEXT.addToken(start,start+9).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+ at.addToken(start,start+9).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
start = TEST_TEXT.indexOf("New Zealand");
-
TEST_ANALYSED_TEXT.addToken(start,start+3).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-
TEST_ANALYSED_TEXT.addToken(start+4,start+11).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ at.addToken(start,start+3).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+ at.addToken(start+4,start+11).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
start = TEST_TEXT.indexOf("the University of Otago");
-
TEST_ANALYSED_TEXT.addToken(start,start+3).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("ART",Pos.Article),1d));
-
TEST_ANALYSED_TEXT.addToken(start+4,start+14).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-
TEST_ANALYSED_TEXT.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("OF",LexicalCategory.PronounOrDeterminer),1d));
-
TEST_ANALYSED_TEXT.addToken(start+18,start+23).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
-
TEST_ANALYSED_TEXT.addToken(start+23,start+24).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag(".",Pos.Point),1d));
-
+ at.addToken(start,start+3).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("ART",Pos.Article),1d));
+ at.addToken(start+4,start+14).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+ at.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("OF",LexicalCategory.PronounOrDeterminer),1d));
+ at.addToken(start+18,start+23).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ at.addToken(start+23,start+24).addAnnotation(POS_ANNOTATION,
Value.value(new PosTag(".",Pos.Point),1d));
}
private LabelTokenizer labelTokenizer = new SimpleLabelTokenizer();
+
@Before
public void bindServices() throws IOException {
}
@@ -255,6 +275,34 @@ public class EntityLinkingEngineTest {
/**
* This tests the EntityLinker functionality (if the expected Entities
* are linked). In this case with the default configurations for
+ * {@link LexicalCategory#Noun}.
+ * @throws Exception
+ */
+ @Test
+ public void testEntityLinkerWithWrongOrder() throws Exception {
+ LanguageProcessingConfig tpc = new LanguageProcessingConfig();
+
tpc.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
+ tpc.setLinkedPos(Collections.EMPTY_SET);
+ EntityLinkerConfig config = new EntityLinkerConfig();
+ config.setMinFoundTokens(2);//this is assumed by this test
+ config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
+ EntityLinker linker = new EntityLinker(TEST_ANALYSED_TEXT_WO,"en",
+ tpc, searcher, config, labelTokenizer);
+ linker.process();
+ Map<String,List<String>> expectedResults = new
HashMap<String,List<String>>();
+ expectedResults.put("Marshall Patrick", new ArrayList<String>(
+ Arrays.asList("urn:test:PatrickMarshall")));
+ expectedResults.put("geologist", new ArrayList<String>(
+ Arrays.asList("urn:test:redirect:Geologist"))); //the
redirected entity
+ expectedResults.put("New Zealand", new ArrayList<String>(
+ Arrays.asList("urn:test:NewZealand")));
+ expectedResults.put("University of Otago", new ArrayList<String>(
+
Arrays.asList("urn:test:UniversityOfOtago","urn:test:UniversityOfOtago_Texas")));
+ validateEntityLinkerResults(linker, expectedResults);
+ }
+ /**
+ * This tests the EntityLinker functionality (if the expected Entities
+ * are linked). In this case with the default configurations for
* {@link Pos#ProperNoun}.
* @throws Exception
*/