en...

rwesten Wed, 24 Apr 2013 03:53:30 -0700

Author: rwesten
Date: Wed Apr 24 10:52:58 2013
New Revision: 1471366

URL: http://svn.apache.org/r1471366
Log:
implementation of STANBOL-1049; fix for STANBOL-1051


Modified:
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
 Wed Apr 24 10:52:58 2013
@@ -161,6 +161,7 @@ public class LanguageProcessingConfig im
      */
     private boolean linkMultiMatchableTokensInChunkState = 
DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
     private int minSearchTokenLength;
+    private boolean linkOnlyUpperCaseTokenWithUnknownPos;
 
 
     /**
@@ -526,6 +527,23 @@ public class LanguageProcessingConfig im
         return minSearchTokenLength;
     }
 
+    /**
+     * This returns the state if only upper case tokens should be marked as 
+     * 'linkable' if they do not have a POS tag
+     * @return the state
+     */
+    public boolean isLinkOnlyUpperCaseTokensWithUnknownPos(){
+        return linkOnlyUpperCaseTokenWithUnknownPos;
+    }
+ 
+    /**
+     * Sets the state if only upper case tokens should be marked as 
+     * 'linkable' if they do not have a POS tag
+     * @param linkOnlyUpperCaseTokenWithUnknownPos the state
+     */
+    public void setLinkOnlyUpperCaseTokenWithUnknownPos(boolean 
linkOnlyUpperCaseTokenWithUnknownPos) {
+        this.linkOnlyUpperCaseTokenWithUnknownPos = 
linkOnlyUpperCaseTokenWithUnknownPos;
+    }
     
     /**
      * Clones the {@link LanguageProcessingConfig}. Intended to be used
@@ -549,6 +567,7 @@ public class LanguageProcessingConfig im
         c.linkMultiMatchableTokensInChunkState = 
linkMultiMatchableTokensInChunkState;
         c.matchedLexicalCategories = matchedLexicalCategories;
         c.minSearchTokenLength = minSearchTokenLength;
+        c.linkOnlyUpperCaseTokenWithUnknownPos = 
linkOnlyUpperCaseTokenWithUnknownPos;
         return c;
     }
 

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
 Wed Apr 24 10:52:58 2013
@@ -17,6 +17,7 @@
 package org.apache.stanbol.enhancer.engines.entitylinking.config;
 
 import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.EnumSet;
@@ -39,6 +40,19 @@ import org.slf4j.LoggerFactory;
 public class TextProcessingConfig {
 
     private static final Logger log = 
LoggerFactory.getLogger(TextProcessingConfig.class);
+
+    /**
+     * Holds a list of ISO 2 letter language codes of languages that use unicase
+     * scripts - scripts that do not know upper case letters.<p>
+     * More information is available on the Wikipedia page for 
+     * <a href="http://en.wikipedia.org/wiki/Letter_case">Letter case</a>.
+     */
+    public static final Set<String> UNICASE_SCRIPT_LANUAGES;
+    static {
+        UNICASE_SCRIPT_LANUAGES = Collections.unmodifiableSet(new 
HashSet<String>(Arrays.asList(
+            "ar","he","zh","ja","ko","ka","hi","ne")));
+    }
+    
     /**
      * If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link 
Pos#Acronym} are Matched. If
      * deactivated all Tokens with the category {@link LexicalCategory#Noun} 
and 
@@ -55,6 +69,17 @@ public class TextProcessingConfig {
      * Default for the {@link #PROCESS_ONLY_PROPER_NOUNS_STATE} (false)
      */
     public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE = 
false;
+    
+    /**
+     * Switch that allows to enable a mode where only upper case tokens are 
marked as
+     * 'linkable' if no POS tag is available (or existing POS tags are of low 
probability).<p>
+     * This is especially useful for processing text in languages where no 
POS tagger is
+     * available.<p>
+     * NOTE that this configuration is ignored for languages where there are 
no 
+     * upper case letters (Arabic, Hebrew, Chinese, Japanese, Korean, Hindi)
+     */
+    public static final String 
LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG = 
"enhancer.engines.linking.linkOnlyUpperCaseTokensWithMissingPosTag";
+    
     /**
      * Allows to configure the processed languages by using the syntax 
supported by {@link LanguageConfiguration}.
      * In addition this engine supports language specific configurations for 
matched {@link LexicalCategory}
@@ -209,8 +234,7 @@ public class TextProcessingConfig {
      */
     public final static TextProcessingConfig 
createInstance(Dictionary<String,Object> configuration) throws 
ConfigurationException {
         TextProcessingConfig tpc = new TextProcessingConfig();
-        //Parse the default text processing configuration
-        //set the default LexicalTypes
+        //Parse the Proper Noun Linking state 
         Object value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE);
         boolean properNounState;
         if(value instanceof Boolean){
@@ -231,6 +255,17 @@ public class TextProcessingConfig {
             log.debug("> Noun matching activated (matched LexicalCategories: 
{})",
                 tpc.defaultConfig.getLinkedLexicalCategories());
         }
+        //parse upper case linking for languages without POS support state
+        //see STANBOL-1049
+        value = 
configuration.get(LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG);
+        final Boolean linkOnlyUpperCaseTokensWithMissingPosTag;
+        if(value instanceof Boolean){
+            
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(((Boolean)value).booleanValue());
+        } else if(value != null){
+            
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(Boolean.parseBoolean(value.toString()));
+        } else { //the default is the same as the properNounState
+            
tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(properNounState);
+        }
         // init MIN_SEARCH_TOKEN_LENGTH
         value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
         Integer minSearchTokenLength;

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 Wed Apr 24 10:52:58 2013
@@ -20,6 +20,7 @@
 package org.apache.stanbol.enhancer.engines.entitylinking.impl;
 
 import static java.util.Collections.disjoint;
+import static 
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
 
@@ -29,11 +30,13 @@ import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 
 import org.apache.commons.collections.Predicate;
 import org.apache.commons.collections.iterators.FilterIterator;
 import 
org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
 import 
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import 
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
@@ -91,6 +94,11 @@ public class ProcessingState {
     //protected final EntityLinkerConfig elc;
 
     private AnalysedText at;
+    /**
+     * If the language uses a unicase script and therefore upper case specific
+     * processing rules can not be used (see STANBOL-1049)
+     */
+    private boolean isUnicaseLanguage;
 
     private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new 
Predicate() {
         @Override
@@ -120,6 +128,10 @@ public class ProcessingState {
         }
         this.at = at; //store as field (just used for logging)
         this.language = language;
+        //STANBOL-1049: we need now to know if a language uses a unicase script
+        //ensure lower case and only use the language part 
+        String lookupLang = language.toLowerCase(Locale.ROOT).split("[_-]")[0];
+        this.isUnicaseLanguage = UNICASE_SCRIPT_LANUAGES.contains(lookupLang);
         //prefer to iterate over sentences
         Iterator<Sentence> sentences = at.getSentences();
         this.sections = sentences.hasNext() ? sentences : 
Collections.singleton(at).iterator();
@@ -222,8 +234,8 @@ public class ProcessingState {
         section = null;
         processableTokensIterator = null;
         consumedIndex = -1;
-        boolean foundProcessable = false;
-        while(!foundProcessable && sections.hasNext()){
+        boolean foundLinkableToken = false;
+        while(!foundLinkableToken && sections.hasNext()){
             section = sections.next();
             tokens.clear(); //clear token for each section (STANBOL-818)
             Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
@@ -265,29 +277,63 @@ public class ProcessingState {
                                          tokenData.inChunk != null ? 
tokenData.inChunk.chunk.getSpan() : "none",
                                          tokenData.morpho != null ? 
tokenData.morpho : "none"});
                     }
-                    //determine if the token should be linked/matched
-                    tokenData.isLinkable = tokenData.isLinkablePos;
-                    tokenData.isMatchable = tokenData.isLinkable || 
tokenData.isMatchablePos;
-                    //for non processable but upper case tolkens we need to 
check
-                    //the uper case token configuration
-                    if(!tokenData.isLinkable && tokenData.upperCase){
-                        if(tokenData.index > 0 && //not a sentence or 
sub-sentence start
+                    if(!tokenData.hasAlphaNumeric){
+                        tokenData.isLinkable = false;
+                        tokenData.isMatchable = false;
+                    } else {
+                        // (1) apply basic rules for linkable/processable 
tokens
+                        //determine if the token should be linked/matched
+                        tokenData.isLinkable = tokenData.isLinkablePos != null 
? tokenData.isLinkablePos : false;
+                        //matchable := linkable OR has matchablePos
+                        tokenData.isMatchable = tokenData.isLinkable || 
+                                (tokenData.isMatchablePos != null && 
tokenData.isMatchablePos);
+                        
+                        //(2) for non linkable tokens check for upper case 
rules
+                        if(!tokenData.isLinkable && tokenData.upperCase && 
+                                tokenData.index > 0 && //not a sentence or 
sub-sentence start
                                 
!tokens.get(tokenData.index-1).isSubSentenceStart){
-                            if(tpc.isLinkUpperCaseTokens() && //if upper case 
tokens should be linked
-                                    tokenData.isMatchable) { //convert 
matchable to 
-                                tokenData.isLinkable = true; //linkable
-                            } else if(tpc.isMatchUpperCaseTokens() || 
tpc.isLinkUpperCaseTokens()){
-                                //if matching for upperCase Tokens is 
activated or
-                                //linking is activated, but the current Token 
is not
-                                //matchable, than mark the Token as matchable
-                                tokenData.isMatchable = true;
-                            } //else upper case matching and linking is 
deactivated
-                        }
+                            //We have an upper case token!
+                            if(tpc.isLinkUpperCaseTokens()){
+                                if(tokenData.isMatchable) { //convert 
matchable to 
+                                    tokenData.isLinkable = true; //linkable
+                                } else { // and other tokens to
+                                    tokenData.isMatchable = true; //matchable
+                                }
+                            } else { 
+                                //finally we need to convert other Tokens to 
matchable
+                                //if MatchUpperCaseTokens is active
+                                if(!tokenData.isMatchable && 
tpc.isMatchUpperCaseTokens()){
+                                    tokenData.isMatchable = true;
+                                }
+                            }
+                        } //else not an upper case token
+                        
+                        //(3) Unknown POS tag Rules (see STANBOL-1049)
+                        if(!tokenData.isLinkable && tokenData.isLinkablePos == 
null && 
+                                tokenData.isLinkablePos == null){
+                            if(isUnicaseLanguage || 
!tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
+                                if(tokenData.hasSearchableLength){
+                                    tokenData.isLinkable = true;
+                                } //else no need to change the state
+                            } else { //non unicase language and link only 
upper case tokens enabled
+                                if(tokenData.upperCase && // upper case token
+                                        tokenData.index > 0 && //not a 
sentence or sub-sentence start
+                                        
!tokens.get(tokenData.index-1).isSubSentenceStart){
+                                    if(tokenData.hasSearchableLength){
+                                        tokenData.isLinkable = true;
+                                    } else {
+                                        tokenData.isMatchable = true;
+                                    }
+                                } else if(tokenData.hasSearchableLength){ 
//lower case and long token
+                                    tokenData.isMatchable = true;
+                                } //else lower case and short word 
+                            }
+                        } //else already linkable or POS tag present
                     }
                     //add the token to the list
                     tokens.add(tokenData);
-                    if(!foundProcessable){
-                        foundProcessable = tokenData.isLinkable;
+                    if(!foundLinkableToken){
+                        foundLinkableToken = tokenData.isLinkable;
                     }
                     if(activeChunk != null){
                         if(tokenData.isMatchable){
@@ -310,8 +356,8 @@ public class ProcessingState {
                                             log.debug("     > convert Token 
{}: {} (pos:{}) from matchable to processable",
                                                 new 
Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
                                             ct.isLinkable = true;
-                                            if(!foundProcessable){
-                                                foundProcessable = true;
+                                            if(!foundLinkableToken){
+                                                foundLinkableToken = true;
                                             }
                                         }
                                         i--;//mark both (ct & pt) as processed
@@ -328,7 +374,7 @@ public class ProcessingState {
             }
         }
         processableTokensIterator = new FilterIterator(tokens.iterator(), 
PROCESSABLE_TOKEN_OREDICATE);
-        return foundProcessable;
+        return foundLinkableToken;
     }
     /**
      * Getter for the text covered by the next tokenCount tokens relative to
@@ -495,13 +541,17 @@ public class ProcessingState {
          */
         public final boolean upperCase;
         /**
+         * if the length of the token is &gt;= {@link 
LanguageProcessingConfig#getMinSearchTokenLength()}
+         */
+        public boolean hasSearchableLength;
+        /**
          * If the POS type of this word matches a linkable category
          */
-        public final boolean isLinkablePos;
+        public final Boolean isLinkablePos;
         /**
          * if the POS type of this word matches a matchable category
          */
-        public final boolean isMatchablePos;
+        public final Boolean isMatchablePos;
         /**
          * if this Token represents the start of an sub-sentence such as an 
          * starting ending quote 
@@ -521,7 +571,7 @@ public class ProcessingState {
             this.index = index;
             this.inChunk = chunk;
             this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
-
+            this.hasSearchableLength = token.getSpan().length() >= 
tpc.getMinSearchTokenLength();
             PosTag selectedPosTag = null;
             boolean matchedPosTag = false; //matched any of the POS annotations
             
@@ -541,13 +591,16 @@ public class ProcessingState {
                 if((!disjoint(tpc.getLinkedLexicalCategories(), 
posTag.getCategories())) ||
                         (!disjoint(tpc.getLinkedPos(), 
posTag.getPosHierarchy())) ||
                         tpc.getLinkedPosTags().contains(posTag.getTag())){
-                    if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
+                    if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                            posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
                         selectedPosTag = posTag;
                         isLinkablePos = true;
                         isMatchablePos = true;
+                        matchedPosTag = true;
                         break;
                     } // else probability to low for inclusion
-                } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
+                } else if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                        posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
                     selectedPosTag = posTag; //also rejected PosTags are 
selected
                     matchedPosTag = true;
                     isLinkablePos = false;
@@ -555,14 +608,13 @@ public class ProcessingState {
                 } // else probability to low for exclusion
             }
             if(!matchedPosTag) { //not matched against a POS Tag ...
-                // ... fall back to the token length
-                this.isLinkablePos = token.getSpan().length() >= 
tpc.getMinSearchTokenLength();
+                this.isLinkablePos = null;
             } else {
                 this.isLinkablePos = isLinkablePos;
             }
             
             //(2) check if this token should be considered to match labels of 
suggestions
-            if(this.isLinkablePos){ //processable tokens are also matchable
+            if(this.isLinkablePos != null && this.isLinkablePos){ 
//processable tokens are also matchable
                 this.isMatchablePos = true;
             } else { //check POS and length to see if token is matchable
                 matchedPosTag = false; //reset to false!
@@ -571,14 +623,16 @@ public class ProcessingState {
                     if(posTag.isMapped()){
                         
if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), 
                             posTag.getCategories())){
-                            if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
+                            if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                                    posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
                                 //override selectedPosTag if present
                                 selectedPosTag = posTag; //mark the matchable 
as selected PosTag
                                 isMatchablePos = true;
                                 matchedPosTag = true;
                                 break;
                             } // else probability to low for inclusion
-                        } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
+                        } else if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                                posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
                             if(selectedPosTag == null){ //do not override 
existing values
                                 selectedPosTag = posTag; //also rejected 
PosTags are selected
                             }
@@ -590,7 +644,8 @@ public class ProcessingState {
                 }
                 if(!matchedPosTag){ //not matched against POS tag ...
                     //fall back to the token length
-                    this.isMatchablePos = token.getSpan().length() >= 
tpc.getMinSearchTokenLength();    
+                    this.isMatchablePos = null;
+                    //this.isMatchablePos = token.getSpan().length() >= 
tpc.getMinSearchTokenLength();    
                 } else {
                     this.isMatchablePos = isMatchablePos;
                 }
@@ -599,10 +654,12 @@ public class ProcessingState {
             for(Value<PosTag> posAnnotation : posAnnotations){
                 PosTag posTag = posAnnotation.value();
                 
if((!disjoint(SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
-                    if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
+                    if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                            posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
                         isSubSentenceStart = true;
                     } // else probability to low for inclusion
-                } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
+                } else if(posAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                        posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
                     isSubSentenceStart = false;
                 }
             }
@@ -696,11 +753,13 @@ public class ProcessingState {
             for (Value<PhraseTag> phraseAnnotation : 
chunk.getAnnotations(PHRASE_ANNOTATION)) {
                 if 
(tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
                     || 
tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
-                    if (phraseAnnotation.probability() >= 
tpc.getMinPhraseAnnotationProbability()) {
+                    if (phraseAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                            phraseAnnotation.probability() >= 
tpc.getMinPhraseAnnotationProbability()) {
                         process = true;
                         break;
                     } // else probability to low for inclusion
-                } else if (phraseAnnotation.probability() >= 
tpc.getMinExcludePhraseAnnotationProbability()) {
+                } else if (phraseAnnotation.probability() == 
Value.UNKNOWN_PROBABILITY ||
+                        phraseAnnotation.probability() >= 
tpc.getMinExcludePhraseAnnotationProbability()) {
                     process = false;
                     break;
                 } // else probability to low for exclusion

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1471366&r1=1471365&r2=1471366&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
 Wed Apr 24 10:52:58 2013
@@ -98,7 +98,14 @@ public class EntityLinkingEngineTest {
     public static final String TEST_TEXT = "Dr. Patrick Marshall (1869 - 
November 1950) was a"
         + " geologist who lived in New Zealand and worked at the University of 
Otago.";
     
+    /**
+     * changed order of given and family name
+     */
+    public static final String TEST_TEXT_WO = "Dr. Marshall Patrick (1869 - 
November 1950) was a"
+        + " geologist who lived in New Zealand and worked at the University of 
Otago.";
+
     private static AnalysedText TEST_ANALYSED_TEXT;
+    private static AnalysedText TEST_ANALYSED_TEXT_WO;
     
 //    public static final String TEST_TEXT2 = "A CBS televised debate between 
Australia's " +
 //             "candidates for Prime Minister in the upcoming US election has 
been rescheduled " +
@@ -108,6 +115,8 @@ public class EntityLinkingEngineTest {
     
     private static final String TEST_REFERENCED_SITE_NAME = "dummRefSiteName";
     
+    private static Value<PhraseTag> NOUN_PHRASE = Value.value(new 
PhraseTag("NP",LexicalCategory.Noun),1d);
+    
     static TestSearcherImpl searcher;
     
     public static final UriRef NAME = new UriRef(NamespaceEnum.rdfs+"label");
@@ -166,49 +175,60 @@ public class EntityLinkingEngineTest {
         graph.add(new TripleImpl(uri, TYPE, 
OntologicalClasses.DBPEDIA_ORGANISATION));
         searcher.addEntity(new Entity(uri, graph));
         
-        Value<PhraseTag> nounPhrase = Value.value(new 
PhraseTag("NP",LexicalCategory.Noun),1d);
         TEST_ANALYSED_TEXT = 
AnalysedTextFactory.getDefaultInstance().createAnalysedText(
-                ciFactory.createBlob(new StringSource(TEST_TEXT)));
-        TEST_ANALYSED_TEXT.addSentence(0, TEST_ANALYSED_TEXT.getEnd());
-        //add some noun phrases
-        TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick 
Marshall".length()).addAnnotation(PHRASE_ANNOTATION, nounPhrase);
-        TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("New Zealand"), 
TEST_TEXT.indexOf("New Zealand")+"New Zealand".length())
-        .addAnnotation(PHRASE_ANNOTATION, nounPhrase);
-        TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("geologist"), 
TEST_TEXT.indexOf("geologist")+"geologist".length())
-        .addAnnotation(PHRASE_ANNOTATION, nounPhrase);
-        TEST_ANALYSED_TEXT.addChunk(TEST_TEXT.indexOf("the University of 
Otago"), 
-            TEST_TEXT.length()-1).addAnnotation(PHRASE_ANNOTATION, nounPhrase);
-        //add some tokens
-        TEST_ANALYSED_TEXT.addToken(0, 2).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.Abbreviation),1d));
-        TEST_ANALYSED_TEXT.addToken(2, 3).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag(".",Pos.Point),1d));
+            ciFactory.createBlob(new StringSource(TEST_TEXT)));
+        TEST_ANALYSED_TEXT_WO = 
AnalysedTextFactory.getDefaultInstance().createAnalysedText(
+                ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
+        initAnalyzedText(TEST_ANALYSED_TEXT);
+        TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick 
Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
         TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
         TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+        initAnalyzedText(TEST_ANALYSED_TEXT_WO);
+        TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall 
Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+        TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+        TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+    }
+
+    /**
+     * @param at the analysed text to which the shared sentence, chunks and tokens are added
+     */
+    private static void initAnalyzedText(AnalysedText at) {
+        at.addSentence(0, TEST_ANALYSED_TEXT.getEnd());
+        at.addChunk(TEST_TEXT.indexOf("New Zealand"), TEST_TEXT.indexOf("New 
Zealand")+"New Zealand".length())
+        .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+        at.addChunk(TEST_TEXT.indexOf("geologist"), 
TEST_TEXT.indexOf("geologist")+"geologist".length())
+        .addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
+        at.addChunk(TEST_TEXT.indexOf("the University of Otago"), 
+            TEST_TEXT.length()-1).addAnnotation(PHRASE_ANNOTATION, 
NOUN_PHRASE);
+        //add some tokens
+        at.addToken(0, 2).addAnnotation(POS_ANNOTATION, Value.value(new 
PosTag("NE",Pos.Abbreviation),1d));
+        at.addToken(2, 3).addAnnotation(POS_ANNOTATION, Value.value(new 
PosTag(".",Pos.Point),1d));
         int start = TEST_TEXT.indexOf("(1869 - November 1950)");
-        
TEST_ANALYSED_TEXT.addToken(start,start+1).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("(",Pos.OpenBracket),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+1,start+5).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NUM",Pos.Numeral),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+6,start+7).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("-",Pos.Hyphen),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+8,start+16).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+17,start+21).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NUM",Pos.Numeral),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+21,start+22).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag(")",Pos.CloseBracket),1d));
+        at.addToken(start,start+1).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("(",Pos.OpenBracket),1d));
+        at.addToken(start+1,start+5).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NUM",Pos.Numeral),1d));
+        at.addToken(start+6,start+7).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("-",Pos.Hyphen),1d));
+        at.addToken(start+8,start+16).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+        at.addToken(start+17,start+21).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NUM",Pos.Numeral),1d));
+        at.addToken(start+21,start+22).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag(")",Pos.CloseBracket),1d));
                 
         start = TEST_TEXT.indexOf("geologist");
-        
TEST_ANALYSED_TEXT.addToken(start,start+9).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+        at.addToken(start,start+9).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
         
         start = TEST_TEXT.indexOf("New Zealand");
-        
TEST_ANALYSED_TEXT.addToken(start,start+3).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+4,start+11).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+        at.addToken(start,start+3).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+        at.addToken(start+4,start+11).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
         
         start = TEST_TEXT.indexOf("the University of Otago");
-        
TEST_ANALYSED_TEXT.addToken(start,start+3).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("ART",Pos.Article),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+4,start+14).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("OF",LexicalCategory.PronounOrDeterminer),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+18,start+23).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
-        
TEST_ANALYSED_TEXT.addToken(start+23,start+24).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag(".",Pos.Point),1d));
-        
+        at.addToken(start,start+3).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("ART",Pos.Article),1d));
+        at.addToken(start+4,start+14).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NE",Pos.CommonNoun),1d));
+        at.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("OF",LexicalCategory.PronounOrDeterminer),1d));
+        at.addToken(start+18,start+23).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+        at.addToken(start+23,start+24).addAnnotation(POS_ANNOTATION, 
Value.value(new PosTag(".",Pos.Point),1d));
     }
     
     private LabelTokenizer labelTokenizer = new SimpleLabelTokenizer();
 
+
     @Before
     public void bindServices() throws IOException {
     }
@@ -255,6 +275,34 @@ public class EntityLinkingEngineTest {
     /**
      * This tests the EntityLinker functionality (if the expected Entities
      * are linked). In this case with the default configurations for
+     * {@link LexicalCategory#Noun}.
+     * @throws Exception
+     */
+    @Test
+    public void testEntityLinkerWithWrongOrder() throws Exception {
+        LanguageProcessingConfig tpc = new LanguageProcessingConfig();
+        
tpc.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
+        tpc.setLinkedPos(Collections.EMPTY_SET);
+        EntityLinkerConfig config = new EntityLinkerConfig();
+        config.setMinFoundTokens(2);//this is assumed by this test
+        config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
+        EntityLinker linker = new EntityLinker(TEST_ANALYSED_TEXT_WO,"en",
+            tpc, searcher, config, labelTokenizer);
+        linker.process();
+        Map<String,List<String>> expectedResults = new 
HashMap<String,List<String>>();
+        expectedResults.put("Marshall Patrick", new ArrayList<String>(
+                Arrays.asList("urn:test:PatrickMarshall")));
+        expectedResults.put("geologist", new ArrayList<String>(
+                Arrays.asList("urn:test:redirect:Geologist"))); //the 
redirected entity
+        expectedResults.put("New Zealand", new ArrayList<String>(
+                Arrays.asList("urn:test:NewZealand")));
+        expectedResults.put("University of Otago", new ArrayList<String>(
+                
Arrays.asList("urn:test:UniversityOfOtago","urn:test:UniversityOfOtago_Texas")));
+        validateEntityLinkerResults(linker, expectedResults);
+    }
+    /**
+     * This tests the EntityLinker functionality (if the expected Entities
+     * are linked). In this case with the default configurations for
      * {@link Pos#ProperNoun}.
      * @throws Exception
      */

svn commit: r1471366 - in /stanbol/trunk/enhancement-engines/entitylinking/engine/src: main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/ main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ test/java/org/apache/stanbol/en...

Reply via email to