Author: rwesten
Date: Wed Oct 2 12:54:31 2013
New Revision: 1528453
URL: http://svn.apache.org/r1528453
Log:
STANBOL-1128: Added support for the skipAltTokens attribute now supported by
SolrTextTagger; changed all FstLinkingEngine specific config properties to use
'lucenefst' instead of 'solrfst' in their key to make them similar to the java
package name. Added UnitTest for Noun linking.
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
Wed Oct 2 12:54:31 2013
@@ -364,7 +364,7 @@ public class FstLinkingEngine implements
TagClusterReducer reducer = new ChainedTagClusterReducer(
linkableTokenFilter,TagClusterReducer.LONGEST_DOMINANT_RIGHT);
final long[] time = new long[]{0};
- new Tagger(corpus.getFst(), linkableTokenFilter, reducer) {
+ new Tagger(corpus.getFst(), linkableTokenFilter,
reducer,session.isSkipAltTokens()) {
@Override
protected void tagCallback(int startOffset, int endOffset, long
docIdsKey) {
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
Wed Oct 2 12:54:31 2013
@@ -181,7 +181,7 @@ public class FstLinkingEngineComponent {
* The {@link SolrCore} is required to access the document ids for the
Entities
* as well as the analyzer chains of the fields used for the linking
*/
- public static final String SOLR_CORE =
"enhancer.engines.linking.solrfst.solrcore";
+ public static final String SOLR_CORE =
"enhancer.engines.linking.lucenefst.solrcore";
/**
* The size of the thread pool used to create FST models (default=1).
Creating
@@ -195,7 +195,7 @@ public class FstLinkingEngineComponent {
* '<code>{@link IndexConfiguration#PARAM_RUNTIME_GENERATION
generate}=true</code>' parameter
* for some languages in the {@link IndexConfiguration#FST_CONFIG}.
*/
- public static final String FST_THREAD_POOL_SIZE =
"enhancer.engines.linking.solrfst.fstThreadPoolSize";
+ public static final String FST_THREAD_POOL_SIZE =
"enhancer.engines.linking.lucenefst.fstThreadPoolSize";
/**
* The default number of threads used to create FST models (default=1)
*/
@@ -207,7 +207,7 @@ public class FstLinkingEngineComponent {
* for matched entities from the disc. The EntityCache is a LRU cache for
such
* information.
*/
- public static final String ENTITY_CACHE_SIZE =
"enhancer.engines.linking.solrfst.entityCacheSize";
+ public static final String ENTITY_CACHE_SIZE =
"enhancer.engines.linking.lucenefst.entityCacheSize";
/**
* The default size of the Entity Cache is set to 65k entities.
*/
@@ -320,6 +320,8 @@ public class FstLinkingEngineComponent {
private EntityCacheManager documentCacheFactory;
private IndexConfiguration indexConfig;
+
+ private Boolean skipAltTokensConfig;
/**
* Default constructor as used by OSGI. This expects that
@@ -370,6 +372,12 @@ public class FstLinkingEngineComponent {
+ Arrays.toString(FieldEncodingEnum.values()), e);
}
}
+ value = properties.get(IndexConfiguration.SKIP_ALT_TOKENS);
+ if(value instanceof Boolean){
+ skipAltTokensConfig = ((Boolean)value);
+ } else if(value != null){
+ skipAltTokensConfig = new Boolean(value.toString());
+ } // else no config -> will use the default
//(4) init the FST configuration
//We can create the default configuration only here, as it depends on
the
@@ -567,6 +575,9 @@ public class FstLinkingEngineComponent {
indexConfig.setFstDirectory(getFstDirectory(core, fstFolder));
//set the DocumentCacheFactory
indexConfig.setEntityCacheManager(documentCacheFactory);
+ if(skipAltTokensConfig != null){
+ indexConfig.setSkipAltTokens(skipAltTokensConfig);
+ }
//create a new searcher for creating FSTs
boolean foundCorpus;
try {
@@ -743,6 +754,7 @@ public class FstLinkingEngineComponent {
textProcessingConfig = null;
entityLinkerConfig = null;
bundleContext = null;
+ skipAltTokensConfig = null;
}
/**
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
Wed Oct 2 12:54:31 2013
@@ -31,6 +31,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.FieldInfo;
@@ -44,6 +45,7 @@ import org.apache.stanbol.commons.stanbo
import
org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.opensextant.solrtexttagger.UnsupportedTokenException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -104,6 +106,31 @@ public class IndexConfiguration {
private File fstDirectory;
/**
+ * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
+ * tokens should cause an {@link UnsupportedTokenException}.
+ */
+ private boolean skipAltTokens;
+ /**
+ * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
+ * tokens should cause an {@link UnsupportedTokenException}.
+ * <p>
+ * Enabling this allows FST linking to be used with query time Lucene
+ * {@link Analyzer}s that emit alternate tokens (e.g. the Kuromoji
analyzers
+ * for Japanese), but it also requires special care with index time
+ * {@link Analyzer} configurations. If enabled the index time analyzer
MUST
+ * produce all possible tokens emitted by the query time analyzer, as only if
+ * all such combinations are added to the FST model skipped alternate
+ * tokens can not prevent mentions from being detected.
+ * <p>
+ * By default <code>skipAltTokens</code> is enabled for
+ * {@link FieldEncodingEnum#SolrYard} and deactivated for all other field
+ * encoding setting. This is because all Solr <code>schema.xml</code> used
+ * by the Stanbol Entityhub SolrYard ensure the requirement stated above.
+ * For other Solr configurations users will need to explicitly activate
this.
+ */
+ public static final String SKIP_ALT_TOKENS =
"enhancer.engines.linking.lucenefst.skipAltTokens";
+
+ /**
* Property used to configure the FieldName encoding of the SolrIndex. This
* is mainly needed for label fields of different languages (e.g. by using
* the iso language code as prefix/suffix of Solr fields. However this also
@@ -111,27 +138,27 @@ public class IndexConfiguration {
* Entityhub SolrYard implementation. See {@link FieldEncodingEnum} for
* supported values
*/
- public static final String FIELD_ENCODING =
"enhancer.engines.linking.solrfst.fieldEncoding";
+ public static final String FIELD_ENCODING =
"enhancer.engines.linking.lucenefst.fieldEncoding";
/**
* The name of the Solr field storing rankings for entities. Entities with
a
* higher value are considered as better (more popular).
*/
- public static final String SOLR_RANKING_FIELD =
"enhancer.engines.linking.solrfst.rankingField";
+ public static final String SOLR_RANKING_FIELD =
"enhancer.engines.linking.lucenefst.rankingField";
/**
* The name of the Solr field holding the entity type information
*/
- public static final String SOLR_TYPE_FIELD =
"enhancer.engines.linking.solrfst.typeField";
+ public static final String SOLR_TYPE_FIELD =
"enhancer.engines.linking.lucenefst.typeField";
/**
* Language configuration defining the language, solr field and the name
of the
* FST file. The FST file is looked up using the {@link DataFileProvider}.
*/
- public static final String FST_CONFIG =
"enhancer.engines.linking.solrfst.fstconfig";
+ public static final String FST_CONFIG =
"enhancer.engines.linking.lucenefst.fstconfig";
/**
* The folder used to store the FST files. The {@link DEFAULT_FST_FOLDER
default} is
* '<code>${solr-data-dir}/fst</code>' - this is '<code>./fst</code>'
relative to the
* {@link SolrCore#getDataDir()} of the current SolrCore.
*/
- public static final String FST_FOLDER =
"enhancer.engines.linking.solrfst.fstfolder";
+ public static final String FST_FOLDER =
"enhancer.engines.linking.lucenefst.fstfolder";
/**
* The default of the FST folder is '<code>${solr-data-dir}/fst</code>' -
* this is '<code>./fst</code>' relative to the {@link
SolrCore#getDataDir()}
@@ -180,6 +207,13 @@ public class IndexConfiguration {
fieldEncoding = FieldEncodingEnum.None;
}
this.fieldEncoding = fieldEncoding;
+ //In case of a SolrYard we can activate skipAltTokens (see javadoc for
+ //#SKIP_ALT_TOKENS for more information)
+ if(fieldEncoding == FieldEncodingEnum.SolrYard){
+ this.skipAltTokens = true;
+ } else {
+ this.skipAltTokens = false;
+ }
}
public CorpusInfo setDefaultCorpus(CorpusInfo corpus){
@@ -601,4 +635,13 @@ public class IndexConfiguration {
}
return fstName;
}
+
+ public boolean isSkipAltTokens() {
+ return skipAltTokens;
+ }
+
+ public void setSkipAltTokens(boolean skipAltTokens) {
+ this.skipAltTokens = skipAltTokens;
+
+ }
}
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
Wed Oct 2 12:54:31 2013
@@ -52,6 +52,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.opensextant.solrtexttagger.UnsupportedTokenException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -585,5 +586,9 @@ public class TaggingSession implements C
return corpusInfo.storedField;
}
}
+
+ public boolean isSkipAltTokens() {
+ return config.isSkipAltTokens();
+ }
}
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
Wed Oct 2 12:54:31 2013
@@ -29,29 +29,29 @@ Stanbol Enhancer Engine: FST Linking
org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.description=Lucene
\
FST based Entity Linking Engine implementation.
-enhancer.engines.linking.solrfst.solrcore.name=Solr Core
-enhancer.engines.linking.solrfst.solrcore.description=The reference to the
SolrCore. \
+enhancer.engines.linking.lucenefst.solrcore.name=Solr Core
+enhancer.engines.linking.lucenefst.solrcore.description=The reference to the
SolrCore. \
Supports the '{server-name}:{core-name}' syntax to reference a specific
Managed- / \
Referenced SolrServer. If {server-name} is not present the configured
{core-name} is \
assumed to be available on the default SolrServer. Remote SolrServer are NOT
supported!
-enhancer.engines.linking.solrfst.fieldEncoding.name=Field Name Encoding
-enhancer.engines.linking.solrfst.fieldEncoding.description=Specifies how
FieldNames \
+enhancer.engines.linking.lucenefst.fieldEncoding.name=Field Name Encoding
+enhancer.engines.linking.lucenefst.fieldEncoding.description=Specifies how
FieldNames \
of the SolrCore are encoded. This is mainly used to specify the pattern used
to \
name fields holding entity labels of different languages. The 'SolrYard'
supports \
the encoding used by the Stanbol Entityhub SolrYard implementation. If 'None'
is \
selected the exact field names used by the SolrCore need to be configured.
-enhancer.engines.linking.solrfst.fieldEncoding.option.none=None
-enhancer.engines.linking.solrfst.fieldEncoding.option.solrYard=SolrYard
-enhancer.engines.linking.solrfst.fieldEncoding.option.minusPrefix='-' Prefix:
'{lang}-{name}'
-enhancer.engines.linking.solrfst.fieldEncoding.option.underscorePrefix='_'
Prefix: '{lang}_{name}'
-enhancer.engines.linking.solrfst.fieldEncoding.option.minusSuffix='-' Suffix:
'{name}-{lang}'
-enhancer.engines.linking.solrfst.fieldEncoding.option.underscoreSuffix='_'
Suffix: '{name}_{lang}'
-enhancer.engines.linking.solrfst.fieldEncoding.option.atPrefix='@' Prefix:
'{lang}@{name}'
-enhancer.engines.linking.solrfst.fieldEncoding.option.atSuffix='@' Suffix:
'{name}@{lang}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.none=None
+enhancer.engines.linking.lucenefst.fieldEncoding.option.solrYard=SolrYard
+enhancer.engines.linking.lucenefst.fieldEncoding.option.minusPrefix='-'
Prefix: '{lang}-{name}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.underscorePrefix='_'
Prefix: '{lang}_{name}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.minusSuffix='-'
Suffix: '{name}-{lang}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.underscoreSuffix='_'
Suffix: '{name}_{lang}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.atPrefix='@' Prefix:
'{lang}@{name}'
+enhancer.engines.linking.lucenefst.fieldEncoding.option.atSuffix='@' Suffix:
'{name}@{lang}'
-enhancer.engines.linking.solrfst.fstconfig.name=FST Corpora configuration
-enhancer.engines.linking.solrfst.fstconfig.description=Configuration for the
FST \
+enhancer.engines.linking.lucenefst.fstconfig.name=FST Corpora configuration
+enhancer.engines.linking.lucenefst.fstconfig.description=Configuration for the
FST \
Corpora. Syntax:
'{lang};{param-name}={param-value};{param-name}={param-value};...' \
Supported {param-name}s: 'field' ... the field name of the SolrIndex used for
the \
FST corpus (default: rdfs:label). The configured field name is encoded using
the \
@@ -62,8 +62,8 @@ Files are located in the 'fst' folder re
configured SolrCore. \
'generate' ... Boolean switch that allows to enable runtime generation of FST \
corpora (default: false)
-enhancer.engines.linking.solrfst.fstfolder.name=FST Folder
-enhancer.engines.linking.solrfst.fstfolder.description=The Folder used to
store \
+enhancer.engines.linking.lucenefst.fstfolder.name=FST Folder
+enhancer.engines.linking.lucenefst.fstfolder.description=The Folder used to
store \
FST files. This supports property substitution (${property-name}) with all \
OSGI and System properties. In addition the following properties are
supported: \
${solr-data-dir} ... the data directory of the configured SolrCore; \
@@ -71,25 +71,25 @@ ${solr-index-dir} ... the index director
${solr-server-name} ... the name of the Referenced/Managed SolrServer of the
SolrCore \
${solr-core-name} ... the name of the SolrCore
-enhancer.engines.linking.solrfst.typeField.name=Entity Type Field
-enhancer.engines.linking.solrfst.typeField.description=The Solr Field holding
the \
+enhancer.engines.linking.lucenefst.typeField.name=Entity Type Field
+enhancer.engines.linking.lucenefst.typeField.description=The Solr Field
holding the \
type information of Entities. Values are expected to be URIs
-enhancer.engines.linking.solrfst.rankingField.name=Entity Ranking Field
-enhancer.engines.linking.solrfst.rankingField.description=The Solr Field
holding the \
+enhancer.engines.linking.lucenefst.rankingField.name=Entity Ranking Field
+enhancer.engines.linking.lucenefst.rankingField.description=The Solr Field
holding the \
Entity Ranking (importance of the Entity within the knowledge base). Values \
are expected to be floating point numbers.
-enhancer.engines.linking.solrfst.fstThreadPoolSize.name=FST Thread Pool Size
-enhancer.engines.linking.solrfst.fstThreadPoolSize.description=The size of the
\
+enhancer.engines.linking.lucenefst.fstThreadPoolSize.name=FST Thread Pool Size
+enhancer.engines.linking.lucenefst.fstThreadPoolSize.description=The size of
the \
thread pool used for the runtime creation of FST models. NOTE that memory
allocation \
during creation is considerable higher as for holding the built model (up to
to times) \
so creation multiple models in parallel may require a lot of heap space. If
memory \
allocation is not an issue this value should be set based on the available CPU
cores \
and the resources one would like to assign to the creation of FST models.
-enhancer.engines.linking.solrfst.entityCacheSize.name=Entity Cache Size
-enhancer.engines.linking.solrfst.entityCacheSize.description=Used to configure
\
+enhancer.engines.linking.lucenefst.entityCacheSize.name=Entity Cache Size
+enhancer.engines.linking.lucenefst.entityCacheSize.description=Used to
configure \
the size of the Cache used to for Entity information. While the FST linking is
\
fully performed in memory this engine needs still to load tagging relevant
fields \
(labels, types, redirectes and entity ranking) for matched entities from the
disc. \
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1528453&r1=1528452&r2=1528453&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
Wed Oct 2 12:54:31 2013
@@ -293,7 +293,7 @@ public class FstLinkingEngineTest {
DBPEDIA+"Social_Democratic_Party_of_Germany"));
}
- //@Test TODO
+ @Test
public void testFstLinkingWithNouns() throws Exception {
Dictionary<String,Object> dict = new Hashtable<String,Object>();
dict.put(PROCESSED_LANGUAGES,
Arrays.asList("en;lmmtip;uc=LINK;prob=0.75;pprob=0.75"));
@@ -307,12 +307,13 @@ public class FstLinkingEngineTest {
processConentItem(engine);
validateEnhancements(
Arrays.asList(
- "Angela Merkel", "Greece", "Germany", "CDU", "SPD"),
+ "Chancellor", "Angela Merkel", "Greece", "Greeks", "Germany",
"SPD",
+ "change","election", "party", "policy"),
Arrays.asList(
DBPEDIA+"Christian_Democratic_Union_(Germany)",
DBPEDIA+"Angela_Merkel", DBPEDIA+"Greece", DBPEDIA+"Germany",
- DBPEDIA+"Social_Democratic_Party_of_Germany"));
-
+ DBPEDIA+"Social_Democratic_Party_of_Germany",
DBPEDIA+"Chancellor",
+ DBPEDIA+"Election", DBPEDIA+"Party", DBPEDIA+"Policy"));
}
/**