Author: rwesten
Date: Thu May  9 15:08:33 2013
New Revision: 1480676

URL: http://svn.apache.org/r1480676
Log:
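Fixes for two issues related to STANBOL-1049; fix for STANBOL-1063 and 
implementation of STANBOL-1064. In addition, this changes the exception 
mentioned in http://markmail.org/message/acv7xkg2festbpjk (the 
IllegalArgumentException thrown by LabelMatch when the span is smaller than 
the number of matched tokens) to WARN-level logging; the span is then set 
to the match count.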

Modified:
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
    
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
 Thu May  9 15:08:33 2013
@@ -135,7 +135,7 @@ public class LanguageProcessingConfig im
     /**
      * The minimum confidence that a POS annotation 
      */
-    private double minExcludePosAnnotationProbability = 
DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY/2;
+    private double minExcludePosAnnotationProbability = 
DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;
 
     private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
 

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
 Thu May  9 15:08:33 2013
@@ -93,13 +93,10 @@ public class EntityLinker {
         while(state.next()) {
             TokenData token = state.getToken();
             if(log.isDebugEnabled()){
-                log.debug("--- preocess Token {}: {} (lemma: {} | pos:{}) 
chunk: {}",
-                    new Object[]{token.index,token.token.getSpan(),
-                                 token.morpho != null ? 
token.morpho.getLemma() : "none", 
-                                 token.token.getAnnotations(POS_ANNOTATION),
-                                 token.inChunk != null ? 
-                                         (token.inChunk.chunk + " "+ 
token.inChunk.chunk.getSpan()) : 
-                                             "none"});
+                log.debug("--- preocess Token {}: {} (lemma: {}) linkable={}, 
matchable={} | chunk: {}",
+                    new 
Object[]{token.index,token.getTokenText(),token.getTokenLemma(),
+                        token.isLinkable, token.isMatchable, token.inChunk != 
null ? 
+                                (token.inChunk.chunk + " "+ 
token.inChunk.chunk.getSpan()) : "none"});
             }
             List<String> searchStrings = new 
ArrayList<String>(linkerConfig.getMaxSearchTokens());
             String searchString = linkerConfig.isLemmaMatching() ? 
token.getTokenLemma() :
@@ -134,11 +131,10 @@ public class EntityLinker {
                 if(minIncludeIndex <= prevIndex){
                     TokenData prevToken = state.getTokens().get(prevIndex);
                     if(log.isDebugEnabled()){
-                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new 
Object[]{
+                        log.debug("    {} {}:'{}' (lemma: {}) linkable={}, 
matchable={}",new Object[]{
                             prevToken.isMatchable? '+':'-',prevToken.index,
-                            prevToken.token.getSpan(),
-                            prevToken.morpho != null ? 
prevToken.morpho.getLemma() : "none",
-                            prevToken.token.getAnnotations(POS_ANNOTATION)
+                            prevToken.getTokenText(), 
prevToken.getTokenLemma(),
+                            prevToken.isLinkable, prevToken.isMatchable
                         });
                     }
                     if(prevToken.isMatchable){
@@ -153,11 +149,10 @@ public class EntityLinker {
                 if(maxIndcludeIndex >= pastIndex){
                     TokenData pastToken = state.getTokens().get(pastIndex);
                     if(log.isDebugEnabled()){
-                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new 
Object[]{
-                            pastToken.isMatchable? '+':'-',pastToken.index,
-                            pastToken.token.getSpan(),
-                            pastToken.morpho != null ? 
pastToken.morpho.getLemma() : "none",
-                            pastToken.token.getAnnotations(POS_ANNOTATION)
+                        log.debug("    {} {}:'{}' (lemma: {}) linkable={}, 
matchable={}",new Object[]{
+                                pastToken.isMatchable? '+':'-',pastToken.index,
+                                pastToken.getTokenText(), 
pastToken.getTokenLemma(),
+                                pastToken.isLinkable, pastToken.isMatchable
                         });
                     }
                     if(pastToken.isMatchable){

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
 Thu May  9 15:08:33 2013
@@ -20,8 +20,12 @@ import java.util.Comparator;
 
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class LabelMatch {
+    
+    private final Logger log = LoggerFactory.getLogger(LabelMatch.class);
     /**
      * To be used in case no match is present
      */
@@ -89,12 +93,14 @@ public class LabelMatch {
         score = textScore*labelScore;
         if(span < processableMatchCount){
             throw new IllegalArgumentException("The span '" + span
-                + "' MUST BE >= the number of matched processable tokens'"
+                + "' MUST BE >= then number of matched processable tokens'"
                 + processableMatchCount+"': "+toString()+"!");
         }
         if(span < matchCount){
-            throw new IllegalArgumentException("The span '" + span
-                + "' MUST BE >= the number of matched tokens '"+matchCount+"': 
"+toString()+"!");
+            log.warn("The span '{}' MUST BE >= then number of matched tokens 
'{}"
+                    + "': {}! Set span to {}.", new Object[]{
+                    span, matchCount, toString(), matchCount});
+            span = matchCount;
         }
         if(processableMatchCount > matchCount){
             throw new IllegalArgumentException("The number of matched 
processable tokens '"

Modified: 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
 Thu May  9 15:08:33 2013
@@ -271,11 +271,11 @@ public class ProcessingState {
                 } else if(span.getType() == SpanTypeEnum.Token){
                     TokenData tokenData = new 
TokenData(tokens.size(),(Token)span,activeChunk);
                     if(log.isDebugEnabled()){
-                        log.debug("  > Token {}: {} (pos:{}) chunk: '{}' | 
morpho: {}",
-                            new Object[]{tokenData.index,tokenData.token, 
-                                         
tokenData.token.getAnnotations(POS_ANNOTATION),
-                                         tokenData.inChunk != null ? 
tokenData.inChunk.chunk.getSpan() : "none",
-                                         tokenData.morpho != null ? 
tokenData.morpho : "none"});
+                        log.debug("  > {}: {} {}(pos:{}) chunk: '{}'",
+                            new Object[]{tokenData.index,tokenData.token,
+                                tokenData.morpho != null ? ("(lemma: 
"+tokenData.morpho.getLemma()+") ") : "",
+                                tokenData.token.getAnnotations(POS_ANNOTATION),
+                                tokenData.inChunk != null ? 
tokenData.inChunk.chunk.getSpan() : "none"});
                     }
                     if(!tokenData.hasAlphaNumeric){
                         tokenData.isLinkable = false;
@@ -296,6 +296,7 @@ public class ProcessingState {
                             if(tpc.isLinkUpperCaseTokens()){
                                 if(tokenData.isMatchable) { //convert 
matchable to 
                                     tokenData.isLinkable = true; //linkable
+                                    tokenData.isMatchable = true;
                                 } else { // and other tokens to
                                     tokenData.isMatchable = true; //matchable
                                 }
@@ -309,36 +310,43 @@ public class ProcessingState {
                         } //else not an upper case token
                         
                         //(3) Unknown POS tag Rules (see STANBOL-1049)
-                        if(!tokenData.isLinkable && tokenData.isLinkablePos == 
null && 
-                                tokenData.isLinkablePos == null){
+                        if(!tokenData.isLinkable && (tokenData.isLinkablePos 
== null || 
+                                tokenData.isMatchablePos == null)){
                             if(isUnicaseLanguage || 
!tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
-                                if(tokenData.hasSearchableLength){
+                                if(tokenData.isLinkablePos == null && 
tokenData.hasSearchableLength){
                                     tokenData.isLinkable = true;
+                                    tokenData.isMatchable = true;
                                 } //else no need to change the state
                             } else { //non unicase language and link only 
upper case tokens enabled
                                 if(tokenData.upperCase && // upper case token
                                         tokenData.index > 0 && //not a 
sentence or sub-sentence start
                                         
!tokens.get(tokenData.index-1).isSubSentenceStart){
-                                    if(tokenData.hasSearchableLength){
+                                    if(tokenData.hasSearchableLength && 
tokenData.isLinkablePos == null){
                                         tokenData.isLinkable = true;
-                                    } else {
+                                        tokenData.isMatchable = true;
+                                    } else if(tokenData.isMatchablePos == 
null){
                                         tokenData.isMatchable = true;
                                     }
-                                } else if(tokenData.hasSearchableLength){ 
//lower case and long token
+                                } else if(tokenData.hasSearchableLength &&  
//lower case and long token
+                                        tokenData.isMatchablePos == null){ 
                                     tokenData.isMatchable = true;
                                 } //else lower case and short word 
                             }
                         } //else already linkable or POS tag present
                     }
+                    log.debug("    - {}",tokenData); 
                     //add the token to the list
                     tokens.add(tokenData);
                     if(!foundLinkableToken){
                         foundLinkableToken = tokenData.isLinkable;
                     }
                     if(activeChunk != null){
-                        if(tokenData.isMatchable){
+                        if (tokenData.isLinkable){
+                            //ignore matchableCount in Chunks with linkable 
Tokens
+                            activeChunk.matchableCount = -10; //by setting the 
count to -10
+                        } else if(tokenData.isMatchable){
                             activeChunk.matchableCount++;
-                        } 
+                        }
                         if (span.getEnd() >= activeChunk.getEndChar()){
                             //this is the last token in the current chunk
                             activeChunk.endToken = tokens.size()-1;
@@ -705,7 +713,15 @@ public class ProcessingState {
         public String getTokenLemma(){
             return morpho != null ? morpho.getLemma() : null;
         }
-                
+        @Override
+        public String toString() {
+            return new StringBuilder("TokenData: '").append(getTokenText())
+                    
.append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
+                    .append(")| 
matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
+                    .append(")| alpha=").append(hasAlphaNumeric).append("| 
seachLength=")
+                    .append(hasSearchableLength).append("| 
upperCase=").append(upperCase)
+                    .append("]").toString();
+        }  
     }
     /** 
      * Represents a Chunk (group of tokens) used as context for EntityLinking.


Reply via email to