Author: rwesten Date: Fri Aug 31 14:18:57 2012 New Revision: 1379463 URL: http://svn.apache.org/viewvc?rev=1379463&view=rev Log: KeywordLinkingEngine: unit tests now work again; improved the limit used by the EntitySearcher. Default configuration: also corrected some bugs in the configuration; the KeywordLinkingEngine now uses 20 suggestions and a minimum of 1 found token (good for testing disambiguation, as it results in a lot of suggestions)
Added: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config - copied, changed from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config Removed: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java 
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Copied: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config (from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config) URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config?p2=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config&p1=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config&r1=1379385&r2=1379463&rev=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config (original) +++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config Fri Aug 31 14:18:57 2012 @@ -1,3 +1,3 @@ -stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation" -stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"] -service.ranking=I"0" \ No newline at end of file +stanbol.enhancer.chain.name="default" 
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"] +service.ranking=I"-100" \ No newline at end of file Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config (original) +++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config Fri Aug 31 14:18:57 2012 @@ -1,3 +1,3 @@ -stanbol.enhancer.chain.name="default" -stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"] -service.ranking=I"-100" \ No newline at end of file +stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation" +stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"] +service.ranking=I"0" \ No newline at end of file Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config (original) +++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config Fri Aug 31 14:18:57 2012 @@ -16,4 +16,5 @@ org.apache.stanbol.enhancer.engines.keyw org.apache.stanbol.enhancer.engines.keywordextraction.redirectField="rdfs:seeAlso" stanbol.enhancer.engine.name="dbpediaKeyword" org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage="en" -org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false" +org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false" +org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens=I"1" Modified: incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java (original) +++ 
incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java Fri Aug 31 14:18:57 2012 @@ -262,9 +262,7 @@ public class DisambiguatorEngine extends disData.allSelectedTexts, window); //savedEntity.getContext()); - disambiguationContext = unionString(false, - Collections.singleton(savedEntity.getName()), - contextSelections); + disambiguationContext = unionString(false, contextSelections); //(2) I do not understand this variant (see comment for the // EntitiesInRange(..) method @@ -278,6 +276,11 @@ public class DisambiguatorEngine extends // Collections.singleton(context), //the context // contextSelections); //other selected parsed in the context + //or just the name of the entity AND the context +// disambiguationContext = unionString(false, +// Collections.singleton(savedEntity.getName()), +// contextSelections); + //(4) TODO: I would also like to have the possibility to disambiguate // using URIs of Entities suggested for other TextAnnotations // within the context. 
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Aug 31 14:18:57 2012 @@ -205,7 +205,12 @@ public class KeywordLinkingEngine * The literal representing the LangIDEngine as creator. */ public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine"); - + + /** + * The default value for the LIMIT of the {@link EntitySearcher} + */ + private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10; + private EntitySearcher entitySearcher; private EntityLinkerConfig linkerConfig; private TextAnalyzerConfig nlpConfig; @@ -873,9 +878,9 @@ public class KeywordLinkingEngine } //TODO: make limit configurable! 
if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){ - entitySearcher = new EntityhubSearcher(context.getBundleContext(),10); + entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT); } else { - entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10); + entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT); } } /** Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Aug 31 14:18:57 2012 @@ -56,6 +56,8 @@ public class EntityLinker { * The map holding the results of the linking process */ private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>(); + + private Integer lookupLimit; /** * After {@link #process()}ing this returns the entities linked for the @@ -79,6 +81,7 @@ public class EntityLinker { this.entitySearcher = taxonomy; this.config = config; this.state = new ProcessingState(content.getAnalysedText()); + this.lookupLimit = Math.max(10,config.getMaxSuggestions()*2); } /** * Steps over the sentences, chunks, tokens of the {@link #sentences} @@ -289,8 +292,11 @@ public 
class EntityLinker { private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException { Collection<? extends Representation> results; try { - results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(), - searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage()); + results = entitySearcher.lookup(config.getNameField(), + config.getSelectedFields(), + searchStrings, + new String[]{state.getSentence().getLanguage(),config.getDefaultLanguage()}, + lookupLimit); } catch (RuntimeException e) { throw new EngineException(e.getMessage(),e); } @@ -555,6 +561,7 @@ public class EntityLinker { //processable tokens are counted, but Exact also checks //of non-processable! foundTokens = coveredTokens; + foundProcessableTokens = coveredProcessableTokens; } else if((foundProcessableTokens >= config.getMinFoundTokens() || //NOTE (rwesten, 2012-05-21): Do not check if all covered // Tokens are found, but if all Tokens of the Label are Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Fri Aug 31 14:18:57 2012 @@ -44,10 +44,11 @@ public interface EntitySearcher { * to be included. Other fields MAY also be included. 
* @param search the tokens to search for. MUST NOT be <code>null</code> * @param languages the languages to include in the search + * @param limit The maximum number of results, or <code>null</code> to use the default * @return the Representations found for the specified query * @throws T An exception while searching for concepts */ - Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException; + Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, String[] languages,Integer limit) throws IllegalStateException; /** * Lookup a concept of the taxonomy by the id. * @param id the id Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java Fri Aug 31 14:18:57 2012 @@ -62,13 +62,19 @@ public final class EntityhubSearcher ext public Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, - String... 
languages) throws IllegalStateException { + String[] languages, + Integer limit) throws IllegalStateException { Entityhub entityhub = getSearchService(); if(entityhub == null){ throw new IllegalStateException("The Entityhub is currently not active"); } FieldQuery query = EntitySearcherUtils.createFieldQuery(entityhub.getQueryFactory(), field, includeFields, search, languages); + if(limit != null && limit > 0){ + query.setLimit(limit); + } else if(this.limit != null){ + query.setLimit(this.limit); + } QueryResultList<Representation> results; try { results = entityhub.find(query); Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java Fri Aug 31 14:18:57 2012 @@ -69,7 +69,8 @@ public final class ReferencedSiteSearche public Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, - String... 
languages) throws IllegalStateException { + String[] languages, + Integer limit) throws IllegalStateException { //build the query and than return the result Site site = getSearchService(); if(site == null){ @@ -77,8 +78,10 @@ public final class ReferencedSiteSearche } FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(), field, includeFields, search, languages); - if(limit != null){ + if(limit != null && limit > 0){ query.setLimit(limit); + } else if(this.limit != null){ + query.setLimit(this.limit); } QueryResultList<Representation> results; try { Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java?rev=1379463&r1=1379462&r2=1379463&view=diff ============================================================================== --- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java (original) +++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Fri Aug 31 14:18:57 2012 @@ -74,7 +74,8 @@ public class TestSearcherImpl implements public Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, - String... languages) throws IllegalStateException { + String[] languages, + Integer limit) throws IllegalStateException { if(field.equals(nameField)){ //we do not need sorting //Representation needs to implement equals, therefore results filters multiple matches