Author: rwesten
Date: Tue Jan 22 08:26:33 2013
New Revision: 1436791
URL: http://svn.apache.org/viewvc?rev=1436791&view=rev
Log:
fix for STANBOL-899: The EntityLinking engine now skips empty spans when
initializing the next Section to be processed
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1436791&r1=1436790&r2=1436791&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Tue Jan 22 08:26:33 2013
@@ -88,6 +88,8 @@ public class ProcessingState {
protected final LanguageProcessingConfig tpc;
protected final EntityLinkerConfig elc;
+ private AnalysedText at;
+
private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new
Predicate() {
@Override
public boolean evaluate(Object object) {
@@ -115,7 +117,7 @@ public class ProcessingState {
if(!tpc.isIgnoreChunks()){
enclosedSpanTypes.add(SpanTypeEnum.Chunk);
}
-
+ this.at = at; //store as field (just used for logging)
this.language = language;
//prefer to iterate over sentences
Iterator<Sentence> sentences = at.getSentences();
@@ -227,6 +229,10 @@ public class ProcessingState {
ChunkData activeChunk = null;
while(enclosed.hasNext()){
Span span = enclosed.next();
+ if(span.getStart() >= span.getEnd()){ //safeguard against
empty spans
+ log.warn("Detected Empty Span {} in section {} of Blob {}",
+ new Object[]{span,section, at.getBlob()});
+ }
if(span.getType() == SpanTypeEnum.Chunk){
ChunkData chunkData = new ChunkData((Chunk)span);
if(chunkData.isProcessable){
@@ -482,7 +488,9 @@ public class ProcessingState {
boolean matchedPosTag = false; //matched any of the POS annotations
//(1) check if this Token should be linked against the Vocabulary
(isProcessable)
- boolean upperCase = index > 0 &&
Character.isUpperCase(token.getSpan().codePointAt(0));
+ boolean upperCase = index > 0 && //not a sentence start
+ token.getEnd() > token.getStart() && //not an empty token
+ Character.isUpperCase(token.getSpan().codePointAt(0));
//and upper case
if(tpc.isLinkUpperCaseTokens() && upperCase){
isProcessable = true;
} else { //else use POS tag & token length