Author: rwesten
Date: Tue Jan 22 08:26:33 2013
New Revision: 1436791

URL: http://svn.apache.org/viewvc?rev=1436791&view=rev
Log:
fix for STANBOL-899: The EntityLinking engine now skips empty spans when 
initializing the next Section to be processed

Modified:
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1436791&r1=1436790&r2=1436791&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Tue Jan 22 08:26:33 2013
@@ -88,6 +88,8 @@ public class ProcessingState {
     protected final LanguageProcessingConfig tpc;
     protected final EntityLinkerConfig elc;
 
+    private AnalysedText at;
+
     private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new Predicate() {
         @Override
         public boolean evaluate(Object object) {
@@ -115,7 +117,7 @@ public class ProcessingState {
         if(!tpc.isIgnoreChunks()){
             enclosedSpanTypes.add(SpanTypeEnum.Chunk);
         }
-        
+        this.at = at; //store as field (just used for logging)
         this.language = language;
         //prefer to iterate over sentences
         Iterator<Sentence> sentences = at.getSentences();
@@ -227,6 +229,10 @@ public class ProcessingState {
             ChunkData activeChunk = null;
             while(enclosed.hasNext()){
                 Span span = enclosed.next();
+                if(span.getStart() >= span.getEnd()){ //save guard against empty spans
+                    log.warn("Detected Empty Span {} in section {} of Blob {}",
+                        new Object[]{span,section, at.getBlob()});
+                }
                 if(span.getType() == SpanTypeEnum.Chunk){
                     ChunkData chunkData = new ChunkData((Chunk)span);
                     if(chunkData.isProcessable){
@@ -482,7 +488,9 @@ public class ProcessingState {
             boolean matchedPosTag = false; //matched any of the POS annotations
             
             //(1) check if this Token should be linked against the Vocabulary (isProcessable)
-            boolean upperCase = index > 0 && Character.isUpperCase(token.getSpan().codePointAt(0));
+            boolean upperCase = index > 0 && //not a sentence start
+                    token.getEnd() > token.getStart() && //not an empty token
+                    Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
             if(tpc.isLinkUpperCaseTokens() && upperCase){
                 isProcessable = true;
             } else { //else use POS tag & token length


Reply via email to