ProcessingState.java

rwesten Thu, 21 Feb 2013 06:49:33 -0800

Author: rwesten
Date: Thu Feb 21 14:49:09 2013
New Revision: 1448671

URL: http://svn.apache.org/r1448671
Log:
STANBOL-951: The EntityLinking keeps now track of sub-sentence start/end tokens 
when processing upper case words. In addition upper case tokens are now only 
increased from none to matching and from matching to linking, but not from none 
to linking; Minor: added a default languages configuration to the 
TextProcessingConfig. Currently this has no effect as the 
EntityhubLinkingEngine overrides this with its own default, but this will 
ensure that other Engine implementations do start with a reasonable default


Modified:
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1448671&r1=1448670&r2=1448671&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
 Thu Feb 21 14:49:09 2013
@@ -103,7 +103,11 @@ public class TextProcessingConfig {
      * The languages this engine is configured to enhance. An empty List is
      * considered as active for any language
      */
-    private LanguageConfiguration languages = new 
LanguageConfiguration(PROCESSED_LANGUAGES, new String[]{"*"});
+    private LanguageConfiguration languages = new 
LanguageConfiguration(PROCESSED_LANGUAGES, 
+        // link multiple matchable tokens in chunks; link upper case words
+        new String[]{"*;lmmtip;uc=LINK;prop=0.75;pprob=0.75", 
+            "de;uc=MATCH", //in German all Nouns are upper case
+            });
 
     private LanguageProcessingConfig defaultConfig;
     private Map<String,LanguageProcessingConfig> languageConfigs = new 
HashMap<String,LanguageProcessingConfig>();

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java?rev=1448671&r1=1448670&r2=1448671&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
 Thu Feb 21 14:49:09 2013
@@ -31,7 +31,6 @@ import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.TreeMap;
 import java.util.Map.Entry;
 import java.util.Set;
 
@@ -39,7 +38,6 @@ import org.apache.clerezza.rdf.core.Lang
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
@@ -69,7 +67,6 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
-import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 /**

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1448671&r1=1448670&r2=1448671&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 Thu Feb 21 14:49:09 2013
@@ -24,6 +24,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Iterator;
@@ -44,6 +45,7 @@ import org.apache.stanbol.enhancer.nlp.m
 import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
 import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -96,6 +98,9 @@ public class ProcessingState {
             return ((TokenData)object).isProcessable;
         }
     };
+
+    public static final Collection<Pos> SUB_SENTENCE_START_POS = EnumSet.of(
+        Pos.Quote);
     
     public ProcessingState(AnalysedText at, String language, 
LanguageProcessingConfig tpc, EntityLinkerConfig elc){
         if(at == null){
@@ -235,7 +240,7 @@ public class ProcessingState {
                 }
                 if(span.getType() == SpanTypeEnum.Chunk){
                     ChunkData chunkData = new ChunkData((Chunk)span);
-                    if(chunkData.isProcessable){
+                    if(chunkData.isProcessable()){
                         if(activeChunk != null){ //current Chunk not yet 
closed -> overlapping chunks!
                             if(activeChunk.getEndChar() < span.getEnd()){ 
//merge partly overlapping chunks
                                 log.info("   - merge overlapping and 
processable Chunks {} <-> {}",
@@ -264,12 +269,32 @@ public class ProcessingState {
                                          tokenData.inChunk != null ? 
tokenData.inChunk.chunk.getSpan() : "none",
                                          tokenData.morpho != null ? 
tokenData.morpho : "none"});
                     }
+                    //determine if the token should be linked/matched
+                    tokenData.isProcessable = tokenData.isLinkablePos;
+                    tokenData.isMatchable = tokenData.isProcessable || 
tokenData.isMatchablePos;
+                    //for non processable but upper case tolkens we need to 
check
+                    //the uper case token configuration
+                    if(!tokenData.isProcessable && tokenData.upperCase){
+                        if(tokenData.index > 0 && //not a sentence or 
sub-sentence start
+                                
!tokens.get(tokenData.index-1).isSubSentenceStart){
+                            if(tpc.isLinkUpperCaseTokens() && //if upper case 
tokens should be linked
+                                    tokenData.isMatchable) { //convert 
matchable to 
+                                tokenData.isProcessable = true; //linkable
+                            } else if(tpc.isMatchUpperCaseTokens() || 
tpc.isLinkUpperCaseTokens()){
+                                //if matching for upperCase Tokens is 
activated or
+                                //linking is activated, but the current Token 
is not
+                                //matchable, than mark the Token as matchable
+                                tokenData.isMatchable = true;
+                            } //else upper case matching and linking is 
deactivated
+                        }
+                    }
+                    //add the token to the list
                     tokens.add(tokenData);
                     if(!foundProcessable){
                         foundProcessable = tokenData.isProcessable;
                     }
                     if(activeChunk != null){
-                        if(tokenData.isMatchable ){
+                        if(tokenData.isMatchable){
                             activeChunk.matchableCount++;
                         } 
                         if (span.getEnd() >= activeChunk.getEndChar()){
@@ -469,7 +494,24 @@ public class ProcessingState {
         final ChunkData inChunk;
         /** the morphological features of the Token (selected based on the POS 
Tag) */
         final MorphoFeatures morpho;
-        
+        /**
+         * if this token starts with an upperCase letter
+         */
+        final boolean upperCase;
+        /**
+         * If the POS type of this word matches a linkable category
+         */
+        final boolean isLinkablePos;
+        /**
+         * if the POS type of this word matches a matchable category
+         */
+        final boolean isMatchablePos;
+        /**
+         * if this Token represents the start of an sub-sentence such as an 
+         * starting ending quote 
+         * @see ProcessingState#SUB_SENTENCE_START_POS
+         */
+        final boolean isSubSentenceStart;
         /**
          * Constructs and initializes meta data needed for linking based 
          * on the current tokens (and its NLP annotation)
@@ -488,49 +530,47 @@ public class ProcessingState {
             boolean matchedPosTag = false; //matched any of the POS annotations
             
             //(1) check if this Token should be linked against the Vocabulary 
(isProcessable)
-            boolean upperCase = index > 0 && //not a sentence start
-                    token.getEnd() > token.getStart() && //not an empty token
+            upperCase = token.getEnd() > token.getStart() && //not an empty 
token
                     Character.isUpperCase(token.getSpan().codePointAt(0)); 
//and upper case
-            if(tpc.isLinkUpperCaseTokens() && upperCase){
-                isProcessable = true;
-            } else { //else use POS tag & token length
-                for(Value<PosTag> posAnnotation : 
token.getAnnotations(POS_ANNOTATION)){
-                    // check three possible match
-                    //  1. the LexicalCategory matches
-                    //  2. the Pos matches
-                    //  3. the String tag matches
-                    PosTag posTag = posAnnotation.value();
-                    if((!disjoint(tpc.getLinkedLexicalCategories(), 
posTag.getCategories())) ||
-                            (!disjoint(tpc.getLinkedPos(), 
posTag.getPosHierarchy())) ||
-                            tpc.getLinkedPosTags().contains(posTag.getTag())){
-                        if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
-                            selectedPosTag = posTag;
-                            isProcessable = true;
-                            matchedPosTag = true;
-                            break;
-                        } // else probability to low for inclusion
-                    } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
-                        selectedPosTag = posTag; //also rejected PosTags are 
selected
-                        matchedPosTag = true;
-                        isProcessable = false;
+            boolean isLinkablePos = false;
+            boolean isMatchablePos = false;
+            boolean isSubSentenceStart = false;
+            List<Value<PosTag>> posAnnotations = 
token.getAnnotations(POS_ANNOTATION);
+            for(Value<PosTag> posAnnotation : posAnnotations){
+                // check three possible match
+                //  1. the LexicalCategory matches
+                //  2. the Pos matches
+                //  3. the String tag matches
+                PosTag posTag = posAnnotation.value();
+                if((!disjoint(tpc.getLinkedLexicalCategories(), 
posTag.getCategories())) ||
+                        (!disjoint(tpc.getLinkedPos(), 
posTag.getPosHierarchy())) ||
+                        tpc.getLinkedPosTags().contains(posTag.getTag())){
+                    if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
+                        selectedPosTag = posTag;
+                        isLinkablePos = true;
+                        isMatchablePos = true;
                         break;
-                    } // else probability to low for exclusion
-                }
-                if(!matchedPosTag) { //not matched against a POS Tag ...
-                    // ... fall back to the token length
-                    isProcessable = token.getSpan().length() >= 
elc.getMinSearchTokenLength();
-                }
+                    } // else probability to low for inclusion
+                } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
+                    selectedPosTag = posTag; //also rejected PosTags are 
selected
+                    matchedPosTag = true;
+                    isLinkablePos = false;
+                    break;
+                } // else probability to low for exclusion
+            }
+            if(!matchedPosTag) { //not matched against a POS Tag ...
+                // ... fall back to the token length
+                this.isLinkablePos = token.getSpan().length() >= 
elc.getMinSearchTokenLength();
+            } else {
+                this.isLinkablePos = isLinkablePos;
             }
             
             //(2) check if this token should be considered to match labels of 
suggestions
-            if(isProcessable){ //processable tokens are also matchable
-                isMatchable = true;
-            } else if(tpc.isMatchUpperCaseTokens() && upperCase){
-                //match upper case tokens regardless of POS and length
-                isMatchable = true;
+            if(this.isLinkablePos){ //processable tokens are also matchable
+                this.isMatchablePos = true;
             } else { //check POS and length to see if token is matchable
                 matchedPosTag = false; //reset to false!
-                for(Value<PosTag> posAnnotation : 
token.getAnnotations(POS_ANNOTATION)){
+                for(Value<PosTag> posAnnotation : posAnnotations){
                     PosTag posTag = posAnnotation.value();
                     if(posTag.isMapped()){
                         
if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), 
@@ -538,7 +578,7 @@ public class ProcessingState {
                             if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
                                 //override selectedPosTag if present
                                 selectedPosTag = posTag; //mark the matchable 
as selected PosTag
-                                isMatchable = true;
+                                isMatchablePos = true;
                                 matchedPosTag = true;
                                 break;
                             } // else probability to low for inclusion
@@ -546,7 +586,7 @@ public class ProcessingState {
                             if(selectedPosTag == null){ //do not override 
existing values
                                 selectedPosTag = posTag; //also rejected 
PosTags are selected
                             }
-                            isMatchable = false;
+                            isMatchablePos = false;
                             matchedPosTag = true;
                             break;
                         } // else probability to low for exclusion
@@ -554,11 +594,25 @@ public class ProcessingState {
                 }
                 if(!matchedPosTag){ //not matched against POS tag ...
                     //fall back to the token length
-                    isMatchable = token.getSpan().length() >= 
elc.getMinSearchTokenLength();    
+                    this.isMatchablePos = token.getSpan().length() >= 
elc.getMinSearchTokenLength();    
+                } else {
+                    this.isMatchablePos = isMatchablePos;
+                }
+            }
+            //(3) check if the POS tag indicates the start/end of an 
sub-sentence
+            for(Value<PosTag> posAnnotation : posAnnotations){
+                PosTag posTag = posAnnotation.value();
+                
if((!disjoint(SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
+                    if(posAnnotation.probability() >= 
tpc.getMinPosAnnotationProbability()){
+                        isSubSentenceStart = true;
+                    } // else probability to low for inclusion
+                } else if(posAnnotation.probability() >= 
tpc.getMinExcludePosAnnotationProbability()){
+                    isSubSentenceStart = false;
                 }
             }
+            this.isSubSentenceStart = isSubSentenceStart;
             
-            //(3) check for morpho analyses
+            //(4) check for morpho analyses
             if(selectedPosTag == null){ //token is not processable or matchable
                 //we need to set the selectedPoas tag to the first POS 
annotation
                 Value<PosTag> posAnnotation = 
token.getAnnotation(POS_ANNOTATION);
@@ -583,6 +637,7 @@ public class ProcessingState {
                 morpho = mf;
             }
         }
+        
         /**
          * Getter for the text as used for searching/matching
          * Entities in the linked vocabulary. If 
@@ -600,6 +655,7 @@ public class ProcessingState {
                 return token.getSpan();
             }
         }
+                
     }
     /** 
      * Represents a Chunk (group of tokens) used as context for EntityLinking.
@@ -678,6 +734,9 @@ public class ProcessingState {
         public int getEndChar(){
             return merged == null ? chunk.getEnd() : merged.getEnd();
         }
+        public boolean isProcessable() {
+            return isProcessable;
+        }
     }
     
 }
\ No newline at end of file

svn commit: r1448671 - in /stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking: config/TextProcessingConfig.java engine/EntityLinkingEngine.java impl/ProcessingState.java

Reply via email to