Author: rwesten
Date: Thu Apr 16 08:26:19 2015
New Revision: 1674016
URL: http://svn.apache.org/r1674016
Log:
merged implementation for STANBOL-1418 and fix for STANBOL-1416 from 0.12 to
trunk
Added:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityFstLinkingComponnet.java
- copied unchanged from r1674012,
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityFstLinkingComponnet.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityTokenFilter.java
- copied unchanged from r1674012,
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/NamedEntityTokenFilter.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/PlainFstLinkingComponnet.java
- copied unchanged from r1674012,
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/PlainFstLinkingComponnet.java
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
Thu Apr 16 08:26:19 2015
@@ -28,6 +28,7 @@ import static org.apache.stanbol.enhance
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -62,8 +63,8 @@ import org.apache.stanbol.enhancer.engin
import
org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
@@ -101,12 +102,23 @@ public class FstLinkingEngine implements
protected final TextProcessingConfig tpConfig;
protected final EntityLinkerConfig elConfig;
+
+ /**
+ * Used in the {@link LinkingModeEnum#NER} to filter entities. For that
configured
+ * mappings for the {@link NerTag#getType()} and {@link NerTag#getTag()}
values
+ * (the key) are mapped with the actual {@link Match#getTypes()} (the
value set).
+ * The <code>null</code> value is interpreted as wildCard (any type
matches). An
+ * empty mapping is interpreted as an blacklist (do not lookup Named
Entities
+ * with that {@link NerTag#getType() type}/{@link NerTag#getTag() tag}
+ */
+ protected final Map<String,Set<String>> neTypeMappings;
private IndexConfiguration indexConfig;
public FstLinkingEngine(String name, LinkingModeEnum linkingMode,
IndexConfiguration indexConfig,
- TextProcessingConfig tpConfig, EntityLinkerConfig elConfig) {
+ TextProcessingConfig tpConfig, EntityLinkerConfig elConfig,
+ Map<String,Set<String>> neTypeMappings) {
if (StringUtils.isBlank(name)) {
throw new IllegalArgumentException("The parsed name MUST NOT be
NULL nor blank!");
}
@@ -124,6 +136,11 @@ public class FstLinkingEngine implements
throw new IllegalArgumentException("The parsed Entity Linking
configuration MUST NOT be NULL");
}
this.elConfig = elConfig;
+ if(linkingMode == LinkingModeEnum.NER && neTypeMappings == null){
+ throw new IllegalArgumentException("The NamedEntity type mappings
MUST NOT be NULL "
+ + "if the LinkingMode is NER!");
+ }
+ this.neTypeMappings = neTypeMappings;
}
@Override
@@ -155,9 +172,17 @@ public class FstLinkingEngine implements
}
// we need a detected language, the AnalyzedText contentPart with
// Tokens.
- AnalysedText at = getAnalysedText(this, ci, false);
- if(at == null && linkingMode == LinkingModeEnum.PLAIN){
- return NlpEngineHelper.getPlainText(this, ci, false) != null ?
ENHANCE_ASYNC : CANNOT_ENHANCE;
+ AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+ if(at == null){
+ if( linkingMode == LinkingModeEnum.PLAIN){
+ return NlpEngineHelper.getPlainText(this, ci, false) != null ?
ENHANCE_ASYNC : CANNOT_ENHANCE;
+ } else {
+ log.warn("Unable to process {} with engine name={} and mode={}
"
+ + ": Missing AnalyzedText content part. Please ensure
that "
+ + "NLP processing results are available before FST
linking!",
+ new Object[]{ci,name,linkingMode});
+ return CANNOT_ENHANCE;
+ }
} else {
if(linkingMode == LinkingModeEnum.PLAIN){
return ENHANCE_ASYNC;
@@ -167,7 +192,7 @@ public class FstLinkingEngine implements
log.warn("Unable to process {} with engine name={} and mode={}
"
+ "as the AnalyzedText does not contain any Tokens!",
new Object[]{ci,name,linkingMode});
- return at.getTokens().hasNext() ? ENHANCE_ASYNC :
CANNOT_ENHANCE;
+ return CANNOT_ENHANCE;
}
}
}
@@ -243,7 +268,7 @@ public class FstLinkingEngine implements
log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
}
}
- int matches = match(content,tags.values());
+ int matches = match(content, tags.values(),
session.entityMentionTypes);
log.debug(" - loaded {} ({} loaded, {} cached, {} appended)
Matches in {} ms",
new Object[]{matches, session.getSessionDocLoaded(),
session.getSessionDocCached(),
session.getSessionDocAppended(),
@@ -273,7 +298,7 @@ public class FstLinkingEngine implements
tags.clear(); //help the GC
}
- private int match(String text, Collection<Tag> tags) {
+ private int match(String text, Collection<Tag> tags,
Map<int[],Set<String>> emTypes) {
log.trace(" ... process matches for {} extracted Tags:",tags.size());
int matchCount = 0;
Iterator<Tag> tagIt = tags.iterator();
@@ -294,7 +319,20 @@ public class FstLinkingEngine implements
log.trace(" {}. {}", i++, match.getUri());
}
matchCount++;
- if(!filterEntityByType(match.getTypes().iterator())){
+ final boolean filterType;
+ if(linkingMode == LinkingModeEnum.NER){
+ Set<String> types = emTypes.get(new int[]{tag.getStart(),
tag.getEnd()});
+ if(types == null){
+ log.warn(" - missing NE types for Named Entity [{},{}]
{}!",
+ new Object[]{tag.getStart(),
tag.getEnd(),tag.getAnchor()});
+ filterType = true;
+ } else {
+ filterType =
filterByNamedEntityType(match.getTypes().iterator(), types);
+ }
+ } else {
+ filterType =
filterEntityByType(match.getTypes().iterator());
+ }
+ if(!filterType){
int distance = Integer.MAX_VALUE;
Literal matchLabel = null;
for(Iterator<Literal> it = match.getLabels().iterator();
it.hasNext() && distance > 0;){
@@ -370,6 +408,44 @@ public class FstLinkingEngine implements
return matchCount;
}
/**
+ * Filter Entities based on matching the entity types with the named
entity types.
+ * The {@link #neTypeMappings} are used to convert named entity types to
+ * entity types.
+ * @param eTypes the types of the entity
+ * @param neTypes the types of the named entity
+ * @return
+ */
+ private boolean filterByNamedEntityType(Iterator<UriRef> eTypes,
Set<String> neTypes) {
+ //first collect the allowed entity types
+ Set<String> entityTypes = new HashSet<String>();
+ for(String neType : neTypes){
+ if(neType != null){
+ Set<String> mappings = neTypeMappings.get(neType);
+ if(mappings != null){
+ if(mappings.contains(null)){
+ //found an wildcard
+ return false; //do not filter
+ } else {
+ entityTypes.addAll(mappings);
+ }
+ } //else no mapping for neType (tag or uri) present
+ }
+ }
+ if(entityTypes.isEmpty()){
+ return true; //no match possible .. filter
+ }
+ //second check the actual entity types against the allowed
+ while(eTypes.hasNext()){
+ UriRef typeUri = eTypes.next();
+ if(typeUri != null &&
entityTypes.contains(typeUri.getUnicodeString())){
+ return false; //we found an match .. do not filter
+ }
+ }
+ //no match found ... filter
+ return true;
+ }
+
+ /**
* Applies the configured entity type based filters
* @param entityTypes
* @return
@@ -432,11 +508,23 @@ public class FstLinkingEngine implements
tokenStream = baseTokenStream;
reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
break;
-// case NER:
+ case NER:
+ //this uses the NamedEntityTokenFilter as tokenStream and a
+ //combination with the longest dominant right as reducer
+ NamedEntityTokenFilter neTokenFilter = new
NamedEntityTokenFilter(
+ baseTokenStream, at, session.getLanguage(),
neTypeMappings.keySet(),
+ session.entityMentionTypes);
+ tokenStream = neTokenFilter;
+ reducer = new ChainedTagClusterReducer(neTokenFilter,
+ TagClusterReducer.LONGEST_DOMINANT_RIGHT);
+ break;
case LINKABLE_TOKEN:
+ //this uses the LinkableTokenFilter as tokenStream
LinkableTokenFilter linkableTokenFilter = new
LinkableTokenFilter(baseTokenStream,
at, session.getLanguage(),
tpConfig.getConfiguration(session.getLanguage()),
elConfig.getMinChunkMatchScore(),
elConfig.getMinFoundTokens());
+ //NOTE that the LinkableTokenFilter implements longest
dominant right
+ // based on the matchable span of tags (instead of the whole
span).
reducer = new ChainedTagClusterReducer(
linkableTokenFilter,TagClusterReducer.ALL);
tokenStream = linkableTokenFilter;
@@ -446,11 +534,9 @@ public class FstLinkingEngine implements
+ linkingMode + "! Please adapt implementation to changed
Enumeration!");
}
log.debug(" - tokenStream: {}", tokenStream);
- log.debug(" - reducer: {}", reducer);
- //we use two TagClusterReducer implementations.
- // (1) the linkableTokenFilter filters all tags that do not overlap any
- // linkable Token
- // (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
+ log.debug(" - reducer: {} (class: {})", reducer,
reducer.getClass().getName());
+
+ //Now process the document
final long[] time = new long[]{0};
new Tagger(corpus.getFst(), tokenStream,
reducer,session.isSkipAltTokens()) {
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
Thu Apr 16 08:26:19 2015
@@ -36,9 +36,13 @@ import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
import java.util.Dictionary;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
+import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -63,6 +67,7 @@ import org.apache.felix.scr.annotations.
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
+import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.commons.solr.IndexReference;
import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
@@ -70,6 +75,7 @@ import org.apache.stanbol.enhancer.engin
import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import
org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
import
org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.FastLRUCacheManager;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
@@ -132,35 +138,12 @@ import com.google.common.util.concurrent
value=IndexConfiguration.DEFAULT_FST_FOLDER),
@Property(name=IndexConfiguration.SOLR_TYPE_FIELD, value="rdf:type"),
@Property(name=IndexConfiguration.SOLR_RANKING_FIELD,
value="entityhub:entityRank"),
-// @Property(name=REDIRECT_FIELD,value="rdfs:seeAlso"),
-// @Property(name=REDIRECT_MODE,options={
-// @PropertyOption(
-// value='%'+REDIRECT_MODE+".option.ignore",
-// name="IGNORE"),
-// @PropertyOption(
-// value='%'+REDIRECT_MODE+".option.addValues",
-// name="ADD_VALUES"),
-// @PropertyOption(
-// value='%'+REDIRECT_MODE+".option.follow",
-// name="FOLLOW")
-// },value="IGNORE"),
@Property(name=FstLinkingEngineComponent.FST_THREAD_POOL_SIZE,
intValue=FstLinkingEngineComponent.DEFAULT_FST_THREAD_POOL_SIZE),
@Property(name=FstLinkingEngineComponent.ENTITY_CACHE_SIZE,
intValue=FstLinkingEngineComponent.DEFAULT_ENTITY_CACHE_SIZE),
@Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
@Property(name=INCLUDE_SIMILAR_SCORE,
boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
- @Property(name=FstLinkingEngineComponent.LINKING_MODE, options={
- @PropertyOption(
-
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.plain",
- name="PLAIN"),
- @PropertyOption(
-
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.linkableToken",
- name="LINKABLE_TOKEN") //,
- //@PropertyOption(
- //
value='%'+FstLinkingEngineComponent.LINKING_MODE+".option.ner",
- // name="NER")
- },value="LINKABLE_TOKEN"),
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE,
boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
@Property(name=PROCESSED_LANGUAGES, cardinality=Integer.MAX_VALUE,
@@ -178,9 +161,6 @@ import com.google.common.util.concurrent
"dbp-ont:Event; schema:Event > dbp-ont:Event",
"schema:Product > schema:Product",
"skos:Concept > skos:Concept"}),
-// @Property(name=DEREFERENCE_ENTITIES,
boolValue=DEFAULT_DEREFERENCE_ENTITIES_STATE),
-// @Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
-//
value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
@Property(name=SERVICE_RANKING,intValue=0)
})
public class FstLinkingEngineComponent {
@@ -206,6 +186,13 @@ public class FstLinkingEngineComponent {
public static final String LINKING_MODE =
"enhancer.engines.linking.lucenefst.mode";
/**
+ * Allows to configure mappings of NamedEntity Types to types of Entities
in the
+ * vocabulary. Configured keys are matched against the {@link
NerTag#getTag()} AND
+ * {@link NerTag#getType()} values of NamedEntities. Configured Values are
mapped
+ * against the values of the configured {@link
IndexConfiguration#SOLR_TYPE_FIELD}.
+ */
+ public static final String NAMED_ENTITY_TYPE_MAPPINGS =
"enhancer.engines.linking.lucenefst.neTypeMapping";
+ /**
* The size of the thread pool used to create FST models (default=1).
Creating
* such models does need a lot of memory. Expect values up to 10times of
the
* build model. So while this task can easily performed concurrently users
need
@@ -242,7 +229,7 @@ public class FstLinkingEngineComponent {
*/
private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
- private final Logger log =
LoggerFactory.getLogger(FstLinkingEngineComponent.class);
+ protected final Logger log =
LoggerFactory.getLogger(FstLinkingEngineComponent.class);
/**
* the name for the EnhancementEngine registered by this component
*/
@@ -257,7 +244,7 @@ public class FstLinkingEngineComponent {
* used to resolve '{prefix}:{local-name}' used within the engines
configuration
*/
@Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
- protected NamespacePrefixService prefixService;
+ private NamespacePrefixService prefixService;
/**
* Holds the FST configuration parsed to the engine
@@ -322,7 +309,7 @@ public class FstLinkingEngineComponent {
* The bundle context for this component. Also used to track dependencies
* and register the {@link #engineRegistration}
*/
- private BundleContext bundleContext;
+ protected BundleContext bundleContext;
/**
* Thread pool used for the runtime creation of FST modles.
@@ -355,6 +342,8 @@ public class FstLinkingEngineComponent {
* The size of the EntityCache ( <code>0</code> ... means deactivated)
*/
private int entityCacheSize;
+
+ private Map<String,Set<String>> nerTypeMappings;
/**
* Default constructor as used by OSGI. This expects that
@@ -366,9 +355,63 @@ public class FstLinkingEngineComponent {
@Activate
@SuppressWarnings("unchecked")
protected void activate(ComponentContext ctx) throws
ConfigurationException {
- log.info("activate {}",getClass().getSimpleName());
+ log.info("activate {}", getClass().getSimpleName());
+ log.debug(" - instance: {}", this);
+ log.debug(" - config: {}", ctx.getProperties());
this.bundleContext = ctx.getBundleContext();
- Dictionary<String,Object> properties = ctx.getProperties();
+ //(0) parse the linking mode
+ applyConfig(parseLinkingMode(ctx), ctx.getProperties(), prefixService);
+ }
+
+ /**
+ * Parses the LinkingMode from the {@link #LINKING_MODE} property. This
+ * allows to use this component to configure FST linking engines for any
+ * supported LinkingMode. If the {@link #LINKING_MODE} is not present the
+ * default {@link LinkingModeEnum#LINKABLE_TOKEN} is returned. <p>
+ * <b>NOTE:</b>Typically
+ * users will want to use the <ul>
+ * <li>{@link PlainFstLinkingComponnet} to configure FST engines for the
+ * {@link LinkingModeEnum#PLAIN}
+ * <li> {@link NamedEntityFstLinkingComponnet} to configure FST engines for
+ * the {@link LinkingModeEnum#NER}
+ * </ul>
+ * but is is also fine to explicitly specify a {@link #LINKING_MODE}
linking
+ * mode when using this component to configure the FST linking engine.
+ * @param ctx the parsed component context
+ * @return the parsed {@link LinkingModeEnum}
+ * @throws ConfigurationException
+ */
+ private LinkingModeEnum parseLinkingMode(ComponentContext ctx) throws
ConfigurationException {
+ Object value = ctx.getProperties().get(LINKING_MODE);
+ LinkingModeEnum linkingMode;
+ if(value == null || StringUtils.isBlank(value.toString())){
+ linkingMode = LinkingModeEnum.LINKABLE_TOKEN;
+ } else {
+ try {
+ linkingMode = LinkingModeEnum.valueOf(value.toString());
+ } catch(IllegalArgumentException e){
+ throw new ConfigurationException(LINKING_MODE, "The parsed
value '"
+ +value+"' (type: "+value.getClass().getName()+") is not a
member "
+ + "of the enum (members: "+
Arrays.toString(LinkingModeEnum.values())
+ + ")!",e);
+ }
+ }
+ return linkingMode;
+ }
+ /**
+ * Called by {@link #activate(ComponentContext)},
+ * {@link PlainFstLinkingComponnet#activate(ComponentContext)} and
+ * {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
+ * apply the parsed {@link ComponentContext#getProperties()}. The
+ * {@link LinkingModeEnum linking mode} is parsed separately as OSGI does
not
+ * allow to modify the parsed config and sup-classes do need to override
+ * the linking mode.
+ * @param linkingMode the linking mode
+ * @param properties
+ * @throws ConfigurationException
+ */
+ protected void applyConfig(LinkingModeEnum linkingMode,
Dictionary<String,Object> properties, NamespacePrefixService prefixService)
+ throws ConfigurationException {
//(0) The name for the Enhancement Engine and the basic metadata
Object value = properties.get(PROPERTY_NAME);
if(value == null || value.toString().isEmpty()){
@@ -381,21 +424,10 @@ public class FstLinkingEngineComponent {
engineMetadata.put(PROPERTY_NAME, this.engineName);
value = properties.get(Constants.SERVICE_RANKING);
engineMetadata.put(Constants.SERVICE_RANKING, value == null ?
Integer.valueOf(0) : value);
- //(0) parse the linking mode
- value = properties.get(LINKING_MODE);
- if(value == null || StringUtils.isBlank(value.toString())){
- this.linkingMode = LinkingModeEnum.LINKABLE_TOKEN;
- } else {
- try {
- this.linkingMode = LinkingModeEnum.valueOf(value.toString());
- } catch(IllegalArgumentException e){
- throw new ConfigurationException(LINKING_MODE, "The parsed
value '"
- +value+"' (type: "+value.getClass().getName()+") is not a
member "
- + "of the enum (members: "+
Arrays.toString(LinkingModeEnum.values())
- + ")!",e);
- }
- }
- log.info(" - linking mode: {}",linkingMode);
+
+ //(0) set the linking mode
+ this.linkingMode = linkingMode;
+ log.info(" - linking mode: {}", linkingMode);
//(1) parse the TextProcessing configuration
//TODO: decide if we should use the TextProcessingConfig for this
engine
@@ -561,8 +593,70 @@ public class FstLinkingEngineComponent {
} else {
solrRankingField = value.toString().trim();
}
+ //(10) parse the NamedEntity type mappings (if linkingMode = NER)
+ if(linkingMode == LinkingModeEnum.NER){
+ nerTypeMappings = new HashMap<String,Set<String>>();
+ value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
+ if(value instanceof String[]){ //support array
+ value = Arrays.asList((String[])value);
+ } else if(value instanceof String) { //single value
+ value = Collections.singleton(value);
+ }
+ if(value instanceof Collection<?>){ //and collection
+ log.info(" - process Named Entity Type Mappings (used by
LinkingMode: {})",linkingMode);
+ configs : for(Object o : (Iterable<?>)value){
+ if(o != null){
+ StringBuilder usage = new StringBuilder("useage: ");
+ usage.append("'{namedEntity-tag-or-uri} >
{entityType-1}[,{entityType-n}]'");
+ String[] config = o.toString().split(">");
+ String namedEntityType = config[0].trim();
+ if(namedEntityType.isEmpty()){
+ log.warn("Invalid Type Mapping Config '{}':
Missing namedEntityType ({}) -> ignore this config",
+ o,usage);
+ continue configs;
+ }
+ if(NamespaceMappingUtils.getPrefix(namedEntityType) !=
null){
+ namedEntityType =
NamespaceMappingUtils.getConfiguredUri(
+ prefixService,
NAMED_ENTITY_TYPE_MAPPINGS,namedEntityType);
+ }
+ if(config.length < 2 || config[1].isEmpty()){
+ log.warn("Invalid Type Mapping Config '{}':
Missing dc:type URI '{}' ({}) -> ignore this config",
+ o,usage);
+ continue configs;
+ }
+ String entityTypes = config[1].trim();
+ if(config.length > 2){
+ log.warn("Configuration after 2nd '>' gets
ignored. Will use mapping '{} > {}' from config {}",
+ new Object[]{namedEntityType,entityTypes,o});
+ }
+ Set<String> types =
nerTypeMappings.get(namedEntityType);
+ if(types == null){ //add new element to the mapping
+ types = new HashSet<String>();
+ nerTypeMappings.put(namedEntityType, types);
+ }
+ for(String entityType : entityTypes.split(";")){
+ entityType = entityType.trim();
+ if(!entityType.isEmpty()){
+ String typeUri;
+ if("*".equals(entityType)){
+ typeUri = null; //null is used as wildcard
+ } else {
+ typeUri =
NamespaceMappingUtils.getConfiguredUri(
+ prefixService,
NAMED_ENTITY_TYPE_MAPPINGS, entityType);
+ }
+ log.info(" - add {} > {}", namedEntityType,
typeUri);
+ types.add(typeUri);
+ } //else ignore empty mapping
+ }
+ }
+ }
+ } else { //no mappings defined ... set wildcard mapping
+ log.info(" - No Named Entity type mappings configured. Will
use wildcard mappings");
+ nerTypeMappings = Collections.singletonMap(null,
Collections.<String>singleton(null));
+ }
+ }
- //(10) start tracking the SolrCore
+ //(11) start tracking the SolrCore
try {
solrServerTracker = new RegisteredSolrServerTracker(
bundleContext, indexReference, null){
@@ -599,7 +693,18 @@ public class FstLinkingEngineComponent {
throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name
'"
+ value.toString()+"' is invalid (expected:
'[{server-name}:]{indexname}'");
}
- solrServerTracker.open();
+ try {
+ solrServerTracker.open();
+ } catch(RuntimeException e){
+ //FIX for STANBOL-1416 (see
https://issues.apache.org/jira/browse/STANBOL-1416)
+ //If an available SolrCore can not be correctly initialized we will
+ //get the exception here. In this case we want this component to be
+ //activated and waiting for further service events. Because of that
+ //we catch here the exception.
+ log.debug("Error while processing existing SolrCore Service during
"
+ + "opening SolrServiceTracker ... waiting for further
service"
+ + "Events", e);
+ }
}
/**
@@ -712,18 +817,28 @@ public class FstLinkingEngineComponent {
} else {
log.info(" ... no corpus for default language {} available",
defaultCoprous);
}
- //set the index configuration to the field;
+
+ //check if the old configuration is still present
+ if(this.engineRegistration != null){
+ unregisterEngine();
+ }
+
+ //create the new configuration
+
+ //set the newly configured instances to the fields
this.indexConfig = indexConfig;
+ this.solrServerReference = reference;
+ this.solrCore = core;
+ //create the new FST linking engine instance
FstLinkingEngine engine = new FstLinkingEngine(engineName,
linkingMode, indexConfig,
- textProcessingConfig, entityLinkerConfig);
+ textProcessingConfig, entityLinkerConfig, nerTypeMappings);
+ //register it as a service
String[] services = new String [] {
EnhancementEngine.class.getName(),
ServiceProperties.class.getName()};
log.info(" ... register {}: {}",
engine.getClass().getSimpleName(),engineName);
this.engineRegistration =
bundleContext.registerService(services,engine, engineMetadata);
- this.solrServerReference = reference;
- this.solrCore = core;
}
@@ -765,12 +880,21 @@ public class FstLinkingEngineComponent {
* rests the fields. If no engine is registered this does nothing!
*/
private void unregisterEngine() {
+ log.debug("> in unregisterEngine() ...");
//use local copies for method calls to avoid concurrency issues
ServiceRegistration engineRegistration = this.engineRegistration;
if(engineRegistration != null){
log.info(" ... unregister Lucene FSTLinkingEngine {}",engineName);
- engineRegistration.unregister();
+ try {
+ engineRegistration.unregister();
+ } catch(IllegalStateException e) {
+ //this is unexpected but can be ignored
+ log.info("Unexpected State: Service for FSTLinkingEngine "
+ + engineName+" was already deactivated.", e);
+ }
this.engineRegistration = null; //reset the field
+ } else {
+ log.debug(" ... no engine registration present");
}
solrServerReference = null;
SolrCore solrServer = this.solrCore;
@@ -778,6 +902,8 @@ public class FstLinkingEngineComponent {
log.debug(" ... unregister SolrCore {}", solrServer.getName());
solrServer.close(); //decrease the reference count!!
this.solrCore = null; //rest the field
+ } else {
+ log.debug(" ... no SolrCore present");
}
//deactivate the index configuration if present
if(indexConfig != null){
@@ -790,6 +916,8 @@ public class FstLinkingEngineComponent {
cacheManager.close();
}
indexConfig = null;
+ } else {
+ log.debug(" ... no index config present");
}
}
@@ -834,7 +962,11 @@ public class FstLinkingEngineComponent {
*/
@Deactivate
protected void deactivate(ComponentContext ctx) {
- log.info(" ... deactivate {}: {}",getClass().getSimpleName(),
engineName);
+ log.info(" ... deactivate {}: {} (CompInst: {})",new Object[] {
+ getClass().getSimpleName(),
+ engineName, ctx.getComponentInstance()});
+ log.debug(" - instance: {}", this);
+ log.debug(" - config: {}", ctx.getProperties());
if(solrServerTracker != null){
//closing the tracker will also cause registered engines to be
//unregistered as service (see #updateEngineRegistration())
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Thu Apr 16 08:26:19 2015
@@ -57,12 +57,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Class the ensures that only {@link TokenData#isLinkable linkable} Tokens
+ * Class that ensures that only {@link TokenData#isLinkable linkable} Tokens
* are processed.<p>
* This is ensured on two places:<ol>
* <li> Classifies Tokens in the Solr {@link TokenStream} with the {@link
TaggingAttribute}
* based on NLP processing results present in the {@link AnalysedText}. This
- * implementation Classifies Token similar to the {@link EntityLinkingEngine}.
+ * implementation classifies Token similar to the {@link EntityLinkingEngine}.
* It uses the {@link TextProcessingConfig} for its configuration.<p>
* <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL
tags}
* that do not overlap with any {@link TokenData#isLinkable linkable} are
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkingModeEnum.java
Thu Apr 16 08:26:19 2015
@@ -1,3 +1,19 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
@@ -15,10 +31,10 @@ public enum LinkingModeEnum {
* or even only {@link Pos#ProperNoun} - depending on the
* {@link TextProcessingConfig}
*/
- LINKABLE_TOKEN //,
-// /**
-// * Only {@link NerTag}s are linked with the vocabualry
-// */
-// NER
+ LINKABLE_TOKEN,
+ /**
+ * Only {@link NerTag}s are linked with the vocabulary
+ */
+ NER
}
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
Thu Apr 16 08:26:19 2015
@@ -24,11 +24,12 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.NavigableMap;
import java.util.Set;
+import java.util.TreeMap;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
-import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.commons.lang.StringUtils;
@@ -40,24 +41,17 @@ import org.apache.lucene.document.String
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.queries.function.valuesource.IfFunction;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldLoader;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldType;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCache;
-import
org.apache.stanbol.enhancer.engines.lucenefstlinking.impl.ValueSourceAccessor;
-import org.apache.stanbol.enhancer.servicesapi.ContentItem;
-import org.apache.stanbol.enhancer.servicesapi.EngineException;
-import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.opensextant.solrtexttagger.TaggerFstCorpus;
-import org.opensextant.solrtexttagger.UnsupportedTokenException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.eventbus.AllowConcurrentEvents;
-
/**
* Profile created based on the {@link IndexConfiguration} for processing a
* parsed ContentItem. <p>
@@ -101,6 +95,18 @@ public class TaggingSession implements C
protected final String typeField;
protected final String redirectField;
protected final String rankingField;
+
+ /**
+ * Used in the {@link LinkingModeEnum#NER} to store the {@link
NerTag#getTag()}
+ * and {@link NerTag#getType()} values for the span of the Named Entity.<p>
+ * This information is collected by the {@link NamedEntityTokenFilter}
while
+ * iterating over the parsed text and is used in the processing of
+ * {@link Tag}s to filter Entities based on their types. <p>
+ * Not used in any linking mode other than <code>NER</code>
+ */
+ protected final NavigableMap<int[],Set<String>> entityMentionTypes =
+ new TreeMap<int[],Set<String>>(Tag.SPAN_COMPARATOR);
+
private final RefCounted<SolrIndexSearcher> searcherRef;
/**
* Document Cache and session statistics for the cache
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
Thu Apr 16 08:26:19 2015
@@ -25,9 +25,28 @@ one with the higher ranking will be used
#Properties specific to the FST linking engine
#===============================================================================
org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.name=Apache
\
-Stanbol Enhancer Engine: FST Linking
+Stanbol Enhancer Engine: FST Linking: Linkable Token
org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.description=Lucene
\
-FST based Entity Linking Engine implementation.
+FST based Entity Linking Engine that looks up Linkable Tokens in the
controlled vocabulary. \
+Typically Proper Nouns (or all Nouns) are considered as linkable. Also Noun
Phrases are \
+used to ensure that single word matches are not matched for phrases in the
text (e.g. that \
+"university" is not matched with "University of Munich" mentioned in the text).
+
+org.apache.stanbol.enhancer.engines.lucenefstlinking.NamedEntityFstLinkingComponnet.name=Apache
\
+Stanbol Enhancer Engine: FST Linking: Named Entities
+org.apache.stanbol.enhancer.engines.lucenefstlinking.NamedEntityFstLinkingComponnet.description=
Lucene \
+FST based Entity Linking Engine that looks up Named Entities recognized in the
text in the \
+configured controlled vocabulary. This mode supports filtering possible
matches in the \
+vocabulary based on the type detected for the Named Entity.
+
+org.apache.stanbol.enhancer.engines.lucenefstlinking.PlainFstLinkingComponnet.name=Apache
\
+Stanbol Enhancer Engine: FST Linking: Plain
+org.apache.stanbol.enhancer.engines.lucenefstlinking.PlainFstLinkingComponnet.description=\
+Lucene FST based Entity Linking Engine that operates on the plain text. It
does not use \
+(or require) any NLP processing results (other than language detection). The
Query time \
+Lucene Analyzer is used to process the parsed text and every token is linked
with the \
+controlled vocabulary.
+
enhancer.engines.linking.lucenefst.solrcore.name=Solr Core
enhancer.engines.linking.lucenefst.solrcore.description=The reference to the
SolrCore. \
@@ -153,15 +172,23 @@ enhancer.engines.linking.entityTypes.nam
enhancer.engines.linking.entityTypes.description=Allows to define a
white/black list \
based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for
white \
listing. Include '*' to force white listing (e.g. to allow Entities without
any type). \
-Rules are processed based on their oder.
+Rules are processed based on their order. NOTE: Not used in the NER linking mode
enhancer.engines.linking.lucenefst.mode.name=Linking Mode
enhancer.engines.linking.lucenefst.mode.description=The linking mode allows to
switch the \
operation mode of the FST linking engine: PLAIN will link every single word
with the \
vocabulary. No NLP processing is required in this mode; LINKABLE_TOKEN will
use NLP \
processing results to determine what tokens should be linked (typically all
Nouns or \
-only ProperNouns - configurable via the TextProcessing configuration);
-#finally the NER mode will only link Named Entities detected by a NER
component.
+only ProperNouns - configurable via the TextProcessing configuration); \
+finally the NER mode will only link Named Entities detected by a NER component.
enhancer.engines.linking.lucenefst.mode.option.plain=Plain
enhancer.engines.linking.lucenefst.mode.option.linkableToken=Linkable Tokens
-#enhancer.engines.linking.lucenefst.mode.option.ner=NER (not yet implemented)
+enhancer.engines.linking.lucenefst.mode.option.ner=NER
+
+enhancer.engines.linking.lucenefst.neTypeMapping.name=Named Entity Type
Mappings
+enhancer.engines.linking.lucenefst.neTypeMapping.description=Allows to map
Named \
+Entity Tags and Types to Entity types. Syntax: {ne-type} > {entity-type-1};
{entity-type-2}. \
+(e.g. a mapping for the tag "Person" to the type schema:Person - "Person >
http://schema.org/Person", \
+a second mapping for the type "dbpedia:Person" to person types of different
ontologies \
+"dbpedia:Person > dbpedia:Person; schema:Person; foaf:Person"). \
+NOTE: Only used in the NER linking mode.
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1674016&r1=1674015&r2=1674016&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
Thu Apr 16 08:26:19 2015
@@ -301,7 +301,7 @@ public class FstLinkingEngineTest {
elc.setMinFoundTokens(2);//this is assumed by this test
elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
FstLinkingEngine engine = new FstLinkingEngine("proper-noun-linking",
- LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc);
+ LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
processConentItem(engine);
validateEnhancements(
Arrays.asList(
@@ -322,7 +322,7 @@ public class FstLinkingEngineTest {
elc.setMinFoundTokens(2);//this is assumed by this test
elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
FstLinkingEngine engine = new FstLinkingEngine("proper-noun-linking",
- LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc);
+ LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
processConentItem(engine);
validateEnhancements(
Arrays.asList(