Author: rwesten
Date: Thu Apr 16 08:26:19 2015
New Revision: 1674016

URL: http://svn.apache.org/r1674016
Log:
merged implementation for STANBOL-1418 and fix for STANBOL-1416 from 0.12 to 
trunk

Added:
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityFstLinkingComponnet.java
      - copied unchanged from r1674012, 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityFstLinkingComponnet.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityTokenFilter.java
      - copied unchanged from r1674012, 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityTokenFilter.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/PlainFstLinkingComponnet.java
      - copied unchanged from r1674012, 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/PlainFstLinkingComponnet.java
Modified:
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 Thu Apr 16 08:26:19 2015
@@ -28,6 +28,7 @@ import static org.apache.stanbol.enhance
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
@@ -62,8 +63,8 @@ import org.apache.stanbol.enhancer.engin
 import 
org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
 import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
@@ -101,12 +102,23 @@ public class FstLinkingEngine implements
 
     protected final TextProcessingConfig tpConfig;
     protected final EntityLinkerConfig elConfig;
+    
+    /**
+     * Used in the {@link LinkingModeEnum#NER} to filter entities. For that 
configured
+     * mappings for the {@link NerTag#getType()} and {@link NerTag#getTag()} 
values 
+     * (the key) are mapped with the actual {@link Match#getTypes()} (the 
value set). 
+     * The <code>null</code> value is interpreted as a wildcard (any type 
matches). An
+     * empty mapping is interpreted as a blacklist (do not look up Named 
Entities
+     * with that {@link NerTag#getType() type}/{@link NerTag#getTag() tag}).
+     */
+    protected final Map<String,Set<String>> neTypeMappings;
 
     private IndexConfiguration indexConfig;
 
     public FstLinkingEngine(String name, LinkingModeEnum linkingMode, 
             IndexConfiguration indexConfig,
-            TextProcessingConfig tpConfig, EntityLinkerConfig elConfig) {
+            TextProcessingConfig tpConfig, EntityLinkerConfig elConfig,
+            Map<String,Set<String>> neTypeMappings) {
         if (StringUtils.isBlank(name)) {
             throw new IllegalArgumentException("The parsed name MUST NOT be 
NULL nor blank!");
         }
@@ -124,6 +136,11 @@ public class FstLinkingEngine implements
             throw new IllegalArgumentException("The parsed Entity Linking 
configuration MUST NOT be NULL");
         }
         this.elConfig = elConfig;
+        if(linkingMode == LinkingModeEnum.NER && neTypeMappings == null){
+            throw new IllegalArgumentException("The NamedEntity type mappings 
MUST NOT be NULL "
+                    + "if the LinkingMode is NER!");
+        }
+        this.neTypeMappings = neTypeMappings;
     }
 
     @Override
@@ -155,9 +172,17 @@ public class FstLinkingEngine implements
         }
         // we need a detected language, the AnalyzedText contentPart with
         // Tokens.
-        AnalysedText at = getAnalysedText(this, ci, false);
-        if(at == null && linkingMode == LinkingModeEnum.PLAIN){
-            return NlpEngineHelper.getPlainText(this, ci, false) != null ? 
ENHANCE_ASYNC : CANNOT_ENHANCE;
+        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+        if(at == null){
+            if( linkingMode == LinkingModeEnum.PLAIN){
+                return NlpEngineHelper.getPlainText(this, ci, false) != null ? 
ENHANCE_ASYNC : CANNOT_ENHANCE;
+            } else {
+                log.warn("Unable to process {} with engine name={} and mode={} 
"
+                        + ": Missing AnalyzedText content part. Please ensure 
that "
+                        + "NLP processing results are available before FST 
linking!", 
+                        new Object[]{ci,name,linkingMode});
+                return CANNOT_ENHANCE;
+            }
         } else {
             if(linkingMode == LinkingModeEnum.PLAIN){
                 return ENHANCE_ASYNC;
@@ -167,7 +192,7 @@ public class FstLinkingEngine implements
                 log.warn("Unable to process {} with engine name={} and mode={} 
"
                     + "as the AnalyzedText does not contain any Tokens!", 
                     new Object[]{ci,name,linkingMode});
-                return at.getTokens().hasNext() ? ENHANCE_ASYNC : 
CANNOT_ENHANCE;
+                return CANNOT_ENHANCE;
             }
         }
     }
@@ -243,7 +268,7 @@ public class FstLinkingEngine implements
                     log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
                 }
             }
-            int matches = match(content,tags.values());
+            int matches = match(content, tags.values(), 
session.entityMentionTypes);
             log.debug(" - loaded {} ({} loaded, {} cached, {} appended) 
Matches in {} ms", 
                     new Object[]{matches, session.getSessionDocLoaded(),
                         session.getSessionDocCached(), 
session.getSessionDocAppended(),
@@ -273,7 +298,7 @@ public class FstLinkingEngine implements
         tags.clear(); //help the GC
     }
 
-    private int match(String text, Collection<Tag> tags) {
+    private int match(String text, Collection<Tag> tags, 
Map<int[],Set<String>> emTypes) {
         log.trace("  ... process matches for {} extracted Tags:",tags.size());
         int matchCount = 0;
         Iterator<Tag> tagIt = tags.iterator();
@@ -294,7 +319,20 @@ public class FstLinkingEngine implements
                     log.trace(" {}. {}", i++,  match.getUri());
                 }
                 matchCount++;
-                if(!filterEntityByType(match.getTypes().iterator())){
+                final boolean filterType;
+                if(linkingMode == LinkingModeEnum.NER){
+                    Set<String> types = emTypes.get(new int[]{tag.getStart(), 
tag.getEnd()});
+                    if(types == null){
+                        log.warn(" - missing NE types for Named Entity [{},{}] 
{}!",
+                            new Object[]{tag.getStart(), 
tag.getEnd(),tag.getAnchor()});
+                        filterType = true;
+                    } else {
+                        filterType = 
filterByNamedEntityType(match.getTypes().iterator(), types);
+                    }
+                } else {
+                    filterType = 
filterEntityByType(match.getTypes().iterator());
+                }
+                if(!filterType){
                     int distance = Integer.MAX_VALUE;
                     Literal matchLabel = null;
                     for(Iterator<Literal> it = match.getLabels().iterator(); 
it.hasNext() && distance > 0;){
@@ -370,6 +408,44 @@ public class FstLinkingEngine implements
         return matchCount;
     }
     /**
+     * Filter Entities based on matching the entity types with the named 
entity types.
+     * The {@link #neTypeMappings} are used to convert named entity types to 
+     * entity types. 
+     * @param eTypes the types of the entity
+     * @param neTypes the types of the named entity
+     * @return
+     */
+    private boolean filterByNamedEntityType(Iterator<UriRef> eTypes, 
Set<String> neTypes) {
+        //first collect the allowed entity types
+        Set<String> entityTypes = new HashSet<String>();
+        for(String neType : neTypes){
+            if(neType != null){
+                Set<String> mappings = neTypeMappings.get(neType);
+                if(mappings != null){
+                    if(mappings.contains(null)){
+                        //found a wildcard
+                        return false; //do not filter
+                    } else {
+                        entityTypes.addAll(mappings);
+                    }
+                } //else no mapping for neType (tag or uri) present
+            }
+        }
+        if(entityTypes.isEmpty()){
+            return true; //no match possible .. filter
+        }
+        //second check the actual entity types against the allowed
+        while(eTypes.hasNext()){
+            UriRef typeUri = eTypes.next();
+            if(typeUri != null && 
entityTypes.contains(typeUri.getUnicodeString())){
+                return false; //we found an match .. do not filter
+            }
+        }
+        //no match found ... filter
+        return true;
+    }
+
+    /**
      * Applies the configured entity type based filters
      * @param entityTypes
      * @return
@@ -432,11 +508,23 @@ public class FstLinkingEngine implements
                 tokenStream = baseTokenStream;
                 reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
                 break;
-//            case NER:
+            case NER:
+                //this uses the NamedEntityTokenFilter as tokenStream and a
+                //combination with the longest dominant right as reducer 
+                NamedEntityTokenFilter neTokenFilter = new 
NamedEntityTokenFilter(
+                    baseTokenStream, at, session.getLanguage(), 
neTypeMappings.keySet(),
+                    session.entityMentionTypes);
+                tokenStream = neTokenFilter;
+                reducer = new ChainedTagClusterReducer(neTokenFilter,
+                    TagClusterReducer.LONGEST_DOMINANT_RIGHT);
+                break;
             case LINKABLE_TOKEN:
+                //this uses the LinkableTokenFilter as tokenStream
                 LinkableTokenFilter linkableTokenFilter = new 
LinkableTokenFilter(baseTokenStream, 
                     at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()),
                     elConfig.getMinChunkMatchScore(), 
elConfig.getMinFoundTokens());
+                //NOTE that the  LinkableTokenFilter implements longest 
dominant right
+                // based on the matchable span of tags (instead of the whole 
span).
                 reducer = new ChainedTagClusterReducer(
                     linkableTokenFilter,TagClusterReducer.ALL);
                 tokenStream = linkableTokenFilter;
@@ -446,11 +534,9 @@ public class FstLinkingEngine implements
                     + linkingMode + "! Please adapt implementation to changed 
Enumeration!");
         }
         log.debug(" - tokenStream: {}", tokenStream);
-        log.debug(" - reducer: {}", reducer);
-        //we use two TagClusterReducer implementations.
-        // (1) the linkableTokenFilter filters all tags that do not overlap any
-        //     linkable Token
-        // (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
+        log.debug(" - reducer: {} (class: {})", reducer, 
reducer.getClass().getName());
+        
+        //Now process the document
         final long[] time = new long[]{0};
         new Tagger(corpus.getFst(), tokenStream, 
reducer,session.isSkipAltTokens()) {
             

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 Thu Apr 16 08:26:19 2015
@@ -36,9 +36,13 @@ import java.io.IOException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.Dictionary;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -63,6 +67,7 @@ import org.apache.felix.scr.annotations.
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.core.SolrCore;
+import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
 import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
 import org.apache.stanbol.commons.solr.IndexReference;
 import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
@@ -70,6 +75,7 @@ import org.apache.stanbol.enhancer.engin
 import 
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
 import 
org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
 import 
org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.FastLRUCacheManager;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
@@ -132,35 +138,12 @@ import com.google.common.util.concurrent
     value=IndexConfiguration.DEFAULT_FST_FOLDER),
     @Property(name=IndexConfiguration.SOLR_TYPE_FIELD, value="rdf:type"),
     @Property(name=IndexConfiguration.SOLR_RANKING_FIELD, 
value="entityhub:entityRank"),
-//  @Property(name=REDIRECT_FIELD,value="rdfs:seeAlso"),
-//  @Property(name=REDIRECT_MODE,options={
-//      @PropertyOption(
-//          value='%'+REDIRECT_MODE+".option.ignore",
-//          name="IGNORE"),
-//      @PropertyOption(
-//          value='%'+REDIRECT_MODE+".option.addValues",
-//          name="ADD_VALUES"),
-//      @PropertyOption(
-//              value='%'+REDIRECT_MODE+".option.follow",
-//              name="FOLLOW")
-//      },value="IGNORE"),
     @Property(name=FstLinkingEngineComponent.FST_THREAD_POOL_SIZE,
         intValue=FstLinkingEngineComponent.DEFAULT_FST_THREAD_POOL_SIZE),
     @Property(name=FstLinkingEngineComponent.ENTITY_CACHE_SIZE, 
         intValue=FstLinkingEngineComponent.DEFAULT_ENTITY_CACHE_SIZE),
     @Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
     @Property(name=INCLUDE_SIMILAR_SCORE, 
boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
-    @Property(name=FstLinkingEngineComponent.LINKING_MODE,  options={
-            @PropertyOption(
-                
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.plain",
-                name="PLAIN"),
-            @PropertyOption(
-                
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.linkableToken",
-                name="LINKABLE_TOKEN") //,
-            //@PropertyOption(
-            //    
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.ner",
-            //    name="NER")
-        },value="LINKABLE_TOKEN"),
     
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
     @Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, 
boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
     @Property(name=PROCESSED_LANGUAGES, cardinality=Integer.MAX_VALUE,
@@ -178,9 +161,6 @@ import com.google.common.util.concurrent
         "dbp-ont:Event; schema:Event > dbp-ont:Event",
         "schema:Product > schema:Product",
         "skos:Concept > skos:Concept"}),
-//    @Property(name=DEREFERENCE_ENTITIES, 
boolValue=DEFAULT_DEREFERENCE_ENTITIES_STATE),
-//    @Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
-//        
value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
     @Property(name=SERVICE_RANKING,intValue=0)
 })
 public class FstLinkingEngineComponent {
@@ -206,6 +186,13 @@ public class FstLinkingEngineComponent {
     public static final String LINKING_MODE = 
"enhancer.engines.linking.lucenefst.mode";
     
     /**
+     * Allows to configure mappings of NamedEntity Types to types of Entities 
in the
+     * vocabulary. Configured keys are matched against the {@link 
NerTag#getTag()} AND
+     * {@link NerTag#getType()} values of NamedEntities. Configured Values are 
mapped
+     * against the values of the configured {@link 
IndexConfiguration#SOLR_TYPE_FIELD}.
+     */
+    public static final String NAMED_ENTITY_TYPE_MAPPINGS = 
"enhancer.engines.linking.lucenefst.neTypeMapping";
+    /**
      * The size of the thread pool used to create FST models (default=1). 
Creating
      * such models does need a lot of memory. Expect values up to 10times of 
the
      * build model. So while this task can easily performed concurrently users 
need
@@ -242,7 +229,7 @@ public class FstLinkingEngineComponent {
      */
     private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
     
-    private final Logger log = 
LoggerFactory.getLogger(FstLinkingEngineComponent.class);
+    protected final Logger log = 
LoggerFactory.getLogger(FstLinkingEngineComponent.class);
     /**
      * the name for the EnhancementEngine registered by this component
      */
@@ -257,7 +244,7 @@ public class FstLinkingEngineComponent {
      * used to resolve '{prefix}:{local-name}' used within the engines 
configuration
      */
     @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
-    protected NamespacePrefixService prefixService;    
+    private NamespacePrefixService prefixService;    
 
     /**
      * Holds the FST configuration parsed to the engine
@@ -322,7 +309,7 @@ public class FstLinkingEngineComponent {
      * The bundle context for this component. Also used to track dependencies
      * and register the {@link #engineRegistration}
      */
-    private BundleContext bundleContext;
+    protected BundleContext bundleContext;
     
     /**
      * Thread pool used for the runtime creation of FST modles.
@@ -355,6 +342,8 @@ public class FstLinkingEngineComponent {
      * The size of the EntityCache ( <code>0</code> ... means deactivated)
      */
     private int entityCacheSize;
+
+    private Map<String,Set<String>> nerTypeMappings;
     
     /**
      * Default constructor as used by OSGI. This expects that 
@@ -366,9 +355,63 @@ public class FstLinkingEngineComponent {
     @Activate
     @SuppressWarnings("unchecked")
     protected void activate(ComponentContext ctx) throws 
ConfigurationException {
-        log.info("activate {}",getClass().getSimpleName());
+        log.info("activate {}", getClass().getSimpleName());
+        log.debug("  - instance: {}", this);
+        log.debug("  - config: {}", ctx.getProperties());
         this.bundleContext = ctx.getBundleContext();
-        Dictionary<String,Object> properties = ctx.getProperties();
+        //(0) parse the linking mode
+        applyConfig(parseLinkingMode(ctx), ctx.getProperties(), prefixService);
+    }
+
+    /**
+     * Parses the LinkingMode from the {@link #LINKING_MODE} property. This
+     * allows to use this component to configure FST linking engines for any
+     * supported LinkingMode. If the {@link #LINKING_MODE} is not present the
+     * default {@link LinkingModeEnum#LINKABLE_TOKEN} is returned. <p>
+     * <b>NOTE:</b>Typically
+     * users will want to use the <ul>
+     * <li>{@link PlainFstLinkingComponnet} to configure FST engines for the 
+     * {@link LinkingModeEnum#PLAIN}
+     * <li> {@link NamedEntityFstLinkingComponnet} to configure FST engines for
+     * the {@link LinkingModeEnum#NER}
+     * </ul>
+     * but it is also fine to explicitly specify a {@link #LINKING_MODE} 
linking
+     * mode when using this component to configure the FST linking engine.
+     * @param ctx the parsed component context
+     * @return the parsed {@link LinkingModeEnum}
+     * @throws ConfigurationException
+     */
+    private LinkingModeEnum parseLinkingMode(ComponentContext ctx) throws 
ConfigurationException {
+        Object value = ctx.getProperties().get(LINKING_MODE);
+        LinkingModeEnum linkingMode;
+        if(value == null || StringUtils.isBlank(value.toString())){
+            linkingMode = LinkingModeEnum.LINKABLE_TOKEN;
+        } else {
+            try {
+                linkingMode = LinkingModeEnum.valueOf(value.toString());
+            } catch(IllegalArgumentException e){
+                throw new ConfigurationException(LINKING_MODE, "The parsed 
value '"
+                    +value+"' (type: "+value.getClass().getName()+") is not a 
member "
+                    + "of the enum (members: "+ 
Arrays.toString(LinkingModeEnum.values())
+                    + ")!",e);
+            }
+        }
+        return linkingMode;
+    }
+    /**
+     * Called by {@link #activate(ComponentContext)}, 
+     * {@link PlainFstLinkingComponnet#activate(ComponentContext)} and 
+     * {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
+     * apply the parsed {@link ComponentContext#getProperties()}. The
+     * {@link LinkingModeEnum linking mode} is parsed separately as OSGI does 
not
+     * allow to modify the parsed config and sub-classes do need to override
+     * the linking mode.
+     * @param linkingMode the linking mode
+     * @param properties
+     * @throws ConfigurationException
+     */
+    protected void applyConfig(LinkingModeEnum linkingMode, 
Dictionary<String,Object> properties, NamespacePrefixService prefixService)
+            throws ConfigurationException {
         //(0) The name for the Enhancement Engine and the basic metadata
         Object value = properties.get(PROPERTY_NAME);
         if(value == null || value.toString().isEmpty()){
@@ -381,21 +424,10 @@ public class FstLinkingEngineComponent {
         engineMetadata.put(PROPERTY_NAME, this.engineName);
         value = properties.get(Constants.SERVICE_RANKING);
         engineMetadata.put(Constants.SERVICE_RANKING, value == null ? 
Integer.valueOf(0) : value);
-        //(0) parse the linking mode
-        value = properties.get(LINKING_MODE);
-        if(value == null || StringUtils.isBlank(value.toString())){
-            this.linkingMode = LinkingModeEnum.LINKABLE_TOKEN;
-        } else {
-            try {
-                this.linkingMode = LinkingModeEnum.valueOf(value.toString());
-            } catch(IllegalArgumentException e){
-                throw new ConfigurationException(LINKING_MODE, "The parsed 
value '"
-                    +value+"' (type: "+value.getClass().getName()+") is not a 
member "
-                    + "of the enum (members: "+ 
Arrays.toString(LinkingModeEnum.values())
-                    + ")!",e);
-            }
-        }
-        log.info(" - linking mode: {}",linkingMode);
+        
+        //(0) set the linking mode
+        this.linkingMode = linkingMode;
+        log.info(" - linking mode: {}", linkingMode);
         
         //(1) parse the TextProcessing configuration
         //TODO: decide if we should use the TextProcessingConfig for this 
engine
@@ -561,8 +593,70 @@ public class FstLinkingEngineComponent {
         } else {
             solrRankingField = value.toString().trim();
         }
+        //(10) parse the NamedEntity type mappings (if linkingMode = NER)
+        if(linkingMode == LinkingModeEnum.NER){
+            nerTypeMappings = new HashMap<String,Set<String>>();
+            value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
+            if(value instanceof String[]){ //support array
+                value = Arrays.asList((String[])value);
+            } else if(value instanceof String) { //single value
+                value = Collections.singleton(value);
+            }
+            if(value instanceof Collection<?>){ //and collection
+                log.info(" - process Named Entity Type Mappings (used by 
LinkingMode: {})",linkingMode);
+                configs : for(Object o : (Iterable<?>)value){
+                    if(o != null){
+                        StringBuilder usage = new StringBuilder("useage: ");
+                        usage.append("'{namedEntity-tag-or-uri} > 
{entityType-1}[,{entityType-n}]'");
+                        String[] config = o.toString().split(">");
+                        String namedEntityType = config[0].trim();
+                        if(namedEntityType.isEmpty()){
+                            log.warn("Invalid Type Mapping Config '{}': 
Missing namedEntityType ({}) -> ignore this config",
+                                o,usage);
+                            continue configs;
+                        }
+                        if(NamespaceMappingUtils.getPrefix(namedEntityType) != 
null){
+                            namedEntityType = 
NamespaceMappingUtils.getConfiguredUri(
+                                prefixService, 
NAMED_ENTITY_TYPE_MAPPINGS,namedEntityType);
+                        }
+                        if(config.length < 2 || config[1].isEmpty()){
+                            log.warn("Invalid Type Mapping Config '{}': 
Missing dc:type URI '{}' ({}) -> ignore this config",
+                                o,usage);
+                            continue configs;
+                        }
+                        String entityTypes = config[1].trim();
+                        if(config.length > 2){
+                            log.warn("Configuration after 2nd '>' gets 
ignored. Will use mapping '{} > {}' from config {}",
+                                new Object[]{namedEntityType,entityTypes,o});
+                        }
+                        Set<String> types = 
nerTypeMappings.get(namedEntityType);
+                        if(types == null){ //add new element to the mapping
+                            types = new HashSet<String>();
+                            nerTypeMappings.put(namedEntityType, types);
+                        }
+                        for(String entityType : entityTypes.split(";")){
+                            entityType = entityType.trim();
+                            if(!entityType.isEmpty()){
+                                String typeUri;
+                                if("*".equals(entityType)){
+                                    typeUri = null; //null is used as wildcard
+                                } else {
+                                    typeUri = 
NamespaceMappingUtils.getConfiguredUri(
+                                        prefixService, 
NAMED_ENTITY_TYPE_MAPPINGS, entityType);
+                                }
+                                log.info("   - add {} > {}", namedEntityType, 
typeUri);
+                                types.add(typeUri);
+                            } //else ignore empty mapping
+                        }
+                    }
+                }
+            } else { //no mappings defined ... set wildcard mapping
+                log.info(" - No Named Entity type mappings configured. Will 
use wildcard mappings");
+                nerTypeMappings = Collections.singletonMap(null, 
Collections.<String>singleton(null));
+            }
+        }
         
-        //(10) start tracking the SolrCore
+        //(11) start tracking the SolrCore
         try {
             solrServerTracker = new RegisteredSolrServerTracker(
                 bundleContext, indexReference, null){
@@ -599,7 +693,18 @@ public class FstLinkingEngineComponent {
             throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name 
'"
                 + value.toString()+"' is invalid (expected: 
'[{server-name}:]{indexname}'");
         }
-        solrServerTracker.open();
+        try {
+            solrServerTracker.open();
+        } catch(RuntimeException e){
+            //FIX for STANBOL-1416 (see 
https://issues.apache.org/jira/browse/STANBOL-1416)
+            //If an available SolrCore can not be correctly initialized we will
+            //get the exception here. In this case we want this component to be
+            //activated and waiting for further service events. Because of that
+            //we catch here the exception.
+            log.debug("Error while processing existing SolrCore Service during 
"
+                    + "opening SolrServiceTracker ... waiting for further 
service"
+                    + "Events", e);
+        }
     }
     
     /**
@@ -712,18 +817,28 @@ public class FstLinkingEngineComponent {
             } else {
                log.info("  ... no corpus for default language {} available", 
defaultCoprous);
             }
-            //set the index configuration to the field;
+            
+            //check if the old configuration is still present
+            if(this.engineRegistration != null){
+                unregisterEngine();
+            }
+            
+            //create the new configuration
+            
+            //set the newly configured instances to the fields
             this.indexConfig = indexConfig;
+            this.solrServerReference = reference;
+            this.solrCore = core;
+            //create the new FST linking engine instance
             FstLinkingEngine engine = new FstLinkingEngine(engineName, 
                 linkingMode, indexConfig,
-                textProcessingConfig, entityLinkerConfig);
+                textProcessingConfig, entityLinkerConfig, nerTypeMappings);
+            //register it as a service
             String[] services = new String [] {
                     EnhancementEngine.class.getName(),
                     ServiceProperties.class.getName()};
             log.info(" ... register {}: {}", 
engine.getClass().getSimpleName(),engineName);
             this.engineRegistration = 
bundleContext.registerService(services,engine, engineMetadata);
-            this.solrServerReference = reference;
-            this.solrCore = core;
         }
 
         
@@ -765,12 +880,21 @@ public class FstLinkingEngineComponent {
      * rests the fields. If no engine is registered this does nothing!
      */
     private void unregisterEngine() {
+        log.debug("> in unregisterEngine() ...");
         //use local copies for method calls to avoid concurrency issues
         ServiceRegistration engineRegistration = this.engineRegistration;
         if(engineRegistration != null){
             log.info(" ... unregister Lucene FSTLinkingEngine {}",engineName);
-            engineRegistration.unregister();
+            try {
+                engineRegistration.unregister();
+            } catch(IllegalStateException e) {
+                //this is unexpected but can be ignored
+                log.info("Unexpected State: Service for FSTLinkingEngine "
+                        + engineName+" was already deactivated.", e);
+            }
             this.engineRegistration = null; //reset the field
+        } else {
+            log.debug(" ... no engine registration present");
         }
         solrServerReference = null;
         SolrCore solrServer = this.solrCore;
@@ -778,6 +902,8 @@ public class FstLinkingEngineComponent {
             log.debug(" ... unregister SolrCore {}", solrServer.getName());
             solrServer.close(); //decrease the reference count!!
             this.solrCore = null; //rest the field
+        } else {
+            log.debug(" ... no SolrCore present");
         }
         //deactivate the index configuration if present
         if(indexConfig != null){
@@ -790,6 +916,8 @@ public class FstLinkingEngineComponent {
                 cacheManager.close();
             }
             indexConfig = null;
+        } else {
+            log.debug(" ... no index config present");
         }
     }
 
@@ -834,7 +962,11 @@ public class FstLinkingEngineComponent {
      */
     @Deactivate
     protected void deactivate(ComponentContext ctx) {
-        log.info(" ... deactivate {}: {}",getClass().getSimpleName(), engineName);
+        log.info(" ... deactivate {}: {} (CompInst: {})",new Object[] {
+                getClass().getSimpleName(), 
+                engineName, ctx.getComponentInstance()});
+        log.debug("  - instance: {}", this);
+        log.debug("  - config: {}", ctx.getProperties());
         if(solrServerTracker != null){
             //closing the tracker will also cause registered engines to be
             //unregistered as service (see #updateEngineRegistration())

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 Thu Apr 16 08:26:19 2015
@@ -57,12 +57,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Class the ensures that only {@link TokenData#isLinkable linkable} Tokens
+ * Class that ensures that only {@link TokenData#isLinkable linkable} Tokens
  * are processed.<p>
  * This is ensured on two places:<ol>
  * <li> Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute}
  * based on NLP processing results present in the {@link AnalysedText}. This
- * implementation Classifies Token similar to the {@link EntityLinkingEngine}.
+ * implementation classifies Token similar to the {@link EntityLinkingEngine}.
  * It uses the {@link TextProcessingConfig} for its configuration.<p>
  * <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL tags}
  * that do not overlap with any {@link TokenData#isLinkable linkable} are

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
 Thu Apr 16 08:26:19 2015
@@ -1,3 +1,19 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
 package org.apache.stanbol.enhancer.engines.lucenefstlinking;
 
 import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
@@ -15,10 +31,10 @@ public enum LinkingModeEnum {
      * or even only {@link Pos#ProperNoun} - depending on the 
      * {@link TextProcessingConfig} 
      */
-    LINKABLE_TOKEN //,
-//    /**
-//     * Only {@link NerTag}s are linked with the vocabualry
-//     */
-//    NER
+    LINKABLE_TOKEN,
+    /**
+     * Only {@link NerTag}s are linked with the vocabualry
+     */
+    NER
 
 }

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
 Thu Apr 16 08:26:19 2015
@@ -24,11 +24,12 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.NavigableMap;
 import java.util.Set;
+import java.util.TreeMap;
 
 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.Literal;
-import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.commons.lang.StringUtils;
@@ -40,24 +41,17 @@ import org.apache.lucene.document.String
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.queries.function.valuesource.IfFunction;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.RefCounted;
 import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldLoader;
 import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldType;
 import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCache;
-import org.apache.stanbol.enhancer.engines.lucenefstlinking.impl.ValueSourceAccessor;
-import org.apache.stanbol.enhancer.servicesapi.ContentItem;
-import org.apache.stanbol.enhancer.servicesapi.EngineException;
-import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
 import org.opensextant.solrtexttagger.TaggerFstCorpus;
-import org.opensextant.solrtexttagger.UnsupportedTokenException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.eventbus.AllowConcurrentEvents;
-
 /**
  * Profile created based on the {@link IndexConfiguration} for processing a
  * parsed ContentItem. <p>
@@ -101,6 +95,18 @@ public class TaggingSession implements C
     protected final String typeField;
     protected final String redirectField;
     protected final String rankingField;
+    
+    /**
+     * Used in the {@link LinkingModeEnum#NER} to store the {@link NerTag#getTag()}
+     * and {@link NerTag#getType()} values for the span of the Named Entity.<p>
+     * This information is collected by the {@link NamedEntityTokenFilter} while
+     * iterating over the parsed text and is used in the processing of
+     * {@link Tag}s to filter Entities based on their types. <p>
+     * Not used in any linking mode other than <code>NER</code>
+     */
+    protected final NavigableMap<int[],Set<String>> entityMentionTypes = 
+            new TreeMap<int[],Set<String>>(Tag.SPAN_COMPARATOR);
+    
     private final RefCounted<SolrIndexSearcher> searcherRef;
     /**
      * Document Cache and session statistics for the cache

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
 Thu Apr 16 08:26:19 2015
@@ -25,9 +25,28 @@ one with the higher ranking will be used
 #Properties specific to the FST linking engine 
 
#===============================================================================
 
org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.name=Apache
 \
-Stanbol Enhancer Engine: FST Linking
+Stanbol Enhancer Engine: FST Linking: Linkable Token
 
org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.description=Lucene
 \
-FST based Entity Linking Engine implementation.
+FST based Entity Linking Engine that looks up Linkable Tokens in the controlled vocabulary. \
+Typically Proper Nouns (or all Nouns) are considered as linkable. Also Noun Phrases are \
+used to ensure that single word matches are not matched for phrases in the text (e.g. that \
+"university" is not matched with "University of Munich" mentioned in the text).
+
+org.apache.stanbol.enhancer.engines.lucenefstlinking.NamedEntityFstLinkingComponnet.name=Apache
 \
+Stanbol Enhancer Engine: FST Linking: Named Entities
+org.apache.stanbol.enhancer.engines.lucenefstlinking.NamedEntityFstLinkingComponnet=
 Lucene \
+FST based Entity Linking Enigne that looks up Named Entities recognized in the text in the \
+configured controlled vocabulary. This mode supports to filter possible matches in the \
+vocabulary based on the type detected for the Named Entity.
+
+org.apache.stanbol.enhancer.engines.lucenefstlinking.PlainFstLinkingComponnet.name=Apache
 \
+Stanbol Enhancer Engine: FST Linking: Plain
+org.apache.stanbol.enhancer.engines.lucenefstlinking.PlainFstLinkingComponnet.description=\
+Lucene FST based Entity Linking Engine that operates on the plain text. It does not use \
+(and require) any NLP processing results (other than language detection). The Query time \
+Lucene Analyzer is used to process the parsed text and every token is linked with the \
+controlled vocabulary.
+
 
 enhancer.engines.linking.lucenefst.solrcore.name=Solr Core
 enhancer.engines.linking.lucenefst.solrcore.description=The reference to the SolrCore. \
@@ -153,15 +172,23 @@ enhancer.engines.linking.entityTypes.nam
 enhancer.engines.linking.entityTypes.description=Allows to define a white/black list \
 based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for white \
 listing. Include '*' to force white listing (e.g. to allow Entities without any type). \
-Rules are processed based on their oder. 
+Rules are processed based on their oder. NOTE: Not used in the NER linking mode
 
 enhancer.engines.linking.lucenefst.mode.name=Linking Mode
 enhancer.engines.linking.lucenefst.mode.description=The linking mode allows to switch the \
 operation mode of the FST linking engine: PLAIN will link every single word with the \
 vocabulary. No NLP processing is required in this mode; LINKABLE_TOKEN will use NLP \
 processing results to determine what tokens should be linked (typically all Nouns or \
-only ProperNouns - configurable via the TextProcessing configuration); 
-#finally the NER mode will only link Named Entities detected by a NER component.
+only ProperNouns - configurable via the TextProcessing configuration); \
+finally the NER mode will only link Named Entities detected by a NER component.
 enhancer.engines.linking.lucenefst.mode.option.plain=Plain
 enhancer.engines.linking.lucenefst.mode.option.linkableToken=Linkable Tokens
-#enhancer.engines.linking.lucenefst.mode.option.ner=NER (not yet implemented)
+enhancer.engines.linking.lucenefst.mode.option.ner=NER
+
+enhancer.engines.linking.lucenefst.neTypeMapping.name=Named Entity Type Mappings
+enhancer.engines.linking.lucenefst.neTypeMapping.description=Allows to map Named \
+Entity Tags and Types to Entity types. Syntax: {ne-type} > {entity-type-1}; {entity-type-2}. \
+(e.g. a mapping for the tag "Person" to the type schema:Person - "Person > http://schema.org/Person";, \
+a second mapping for the type "dbpedia:Person" to person types of different ontologies \
+"dbpedia:Person > dbpedia:Person; schema:Person; foaf:Person"). \
+NOTE: Only used in the NER linking mode.

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
 Thu Apr 16 08:26:19 2015
@@ -301,7 +301,7 @@ public class FstLinkingEngineTest {
         elc.setMinFoundTokens(2);//this is assumed by this test
         elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
         FstLinkingEngine engine = new FstLinkingEngine("proper-noun-linking", 
-            LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc);
+            LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
         processConentItem(engine);
         validateEnhancements(
             Arrays.asList(
@@ -322,7 +322,7 @@ public class FstLinkingEngineTest {
         elc.setMinFoundTokens(2);//this is assumed by this test
         elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
         FstLinkingEngine engine = new FstLinkingEngine("proper-noun-linking", 
-            LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc);
+            LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
         processConentItem(engine);
         validateEnhancements(
             Arrays.asList(


Reply via email to