Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stanbol.enhancer.engines.entitycoreference; + +import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Dictionary; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.felix.scr.annotations.Activate; +import org.apache.felix.scr.annotations.Component; +import org.apache.felix.scr.annotations.Deactivate; +import org.apache.felix.scr.annotations.Properties; +import org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.Reference; +import org.apache.felix.scr.annotations.Service; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; +import org.apache.stanbol.enhancer.engines.entitycoreference.impl.CoreferenceFinder; +import org.apache.stanbol.enhancer.engines.entitycoreference.impl.NounPhraseFilterer; +import org.apache.stanbol.enhancer.nlp.NlpAnnotations; +import org.apache.stanbol.enhancer.nlp.model.AnalysedText; +import org.apache.stanbol.enhancer.nlp.model.Section; +import org.apache.stanbol.enhancer.nlp.model.Span; +import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum; +import org.apache.stanbol.enhancer.nlp.model.annotation.Value; +import org.apache.stanbol.enhancer.nlp.ner.NerTag; +import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag; +import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; +import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper; +import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; +import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; +import org.apache.stanbol.entityhub.servicesapi.Entityhub; +import 
org.apache.stanbol.entityhub.servicesapi.site.SiteManager; +import org.osgi.service.cm.ConfigurationException; +import org.osgi.service.component.ComponentContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This engine extracts references in the given text of noun phrases which point to NERs. The coreference is + * performed based on matching several of the named entity's dbpedia/yago properties to the noun phrase + * tokens. + * + * TODO - Be able to detect possessive coreferences such as Germany's prime minister + * TODO - be able to detect products and their developer such as Iphone 7 and Apple's new device. + * TODO - provide the ability via config for the user to also allow coreferencing of 1 word noun phrases based + * solely on comparison with entity class type? + * + * @author Cristian Petroaca + * + */ +@Component(immediate = true, metatype = true) +@Service(value = EnhancementEngine.class) +@Properties(value = { + @Property(name = EnhancementEngine.PROPERTY_NAME, value = "entity-coreference"), + @Property(name = EntityCoReferenceEngine.CONFIG_LANGUAGES, value = "en"), + @Property(name = EntityCoReferenceEngine.REFERENCED_SITE_ID, value = "entity-coref-dbpedia"), + @Property(name = EntityCoReferenceEngine.ENTITY_URI_BASE, value = "http://dbpedia.org/resource/"), + @Property(name = EntityCoReferenceEngine.MAX_DISTANCE, intValue = Constants.MAX_DISTANCE_DEFAULT_VALUE), + @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_PERSON, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_PERSON), + @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_ORGANIZATION, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_ORGANIZATION), + @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_PLACE, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_PLACE), + @Property(name = EntityCoReferenceEngine.ORG_ATTR_FOR_PERSON, value = Constants.DEFAULT_ORG_ATTR_FOR_PERSON), + @Property(name = EntityCoReferenceEngine.ENTITY_CLASSES_TO_EXCLUDE, value = 
Constants.DEFAULT_ENTITY_CLASSES_TO_EXCLUDE)}) +public class EntityCoReferenceEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> + implements EnhancementEngine, ServiceProperties { + + private static final Integer ENGINE_ORDERING = ServiceProperties.ORDERING_POST_PROCESSING + 91; + + /** + * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported + * are the languages given as default value. + */ + protected static final String CONFIG_LANGUAGES = "enhancer.engine.entitycoreference.languages"; + + /** + * Referenced site configuration. Defaults to dbpedia. + */ + protected static final String REFERENCED_SITE_ID = "enhancer.engine.entitycoreference.referencedSiteId"; + + /** + * + */ + protected static final String ENTITY_URI_BASE = "enhancer.engine.entitycoreference.entity.uri.base"; + + /** + * Maximum sentence distance between the ner and the noun phrase which mentions it. -1 means no distance + * constraint. + */ + protected static final String MAX_DISTANCE = "enhancer.engine.entitycoreference.maxDistance"; + + /** + * Attributes used for spatial coreference when dealing with a person entity. + */ + protected static final String SPATIAL_ATTR_FOR_PERSON = "enhancer.engine.entitycoreference.spatial.attr.person"; + + /** + * Attributes used for spatial coreference when dealing with an organization entity. + */ + protected static final String SPATIAL_ATTR_FOR_ORGANIZATION = "enhancer.engine.entitycoreference.spatial.attr.org"; + + /** + * Attributes used for spatial coreference when dealing with a place entity. + */ + protected static final String SPATIAL_ATTR_FOR_PLACE = "enhancer.engine.entitycoreference.spatial.attr.place"; + + /** + * Attributes used for organisational membership coreference when dealing with a person entity. 
+ */ + protected static final String ORG_ATTR_FOR_PERSON = "enhancer.engine.entitycoreference.org.attr.person"; + + /** + * Entity classes which will be excluded when doing the entity class type matching + * because they are too general in nature. + */ + protected static final String ENTITY_CLASSES_TO_EXCLUDE = "enhancer.engine.entitycoreference.entity.classes.excluded"; + + /** + * Logger + */ + private final Logger log = LoggerFactory.getLogger(EntityCoReferenceEngine.class); + + /** + * Service of the Entityhub that manages all the active referenced Site. This Service is used to lookup + * the configured Referenced Site when we need to enhance a content item. + */ + @Reference + protected SiteManager siteManager; + + /** + * Used to lookup Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local" + */ + @Reference + protected Entityhub entityhub; + + /** + * Specialized class which filters out bad noun phrases based on the language. + */ + private NounPhraseFilterer nounPhraseFilterer; + + /** + * Performs the logic needed to find corefs based on the NERs and noun phrases in the text. 
+ */ + private CoreferenceFinder corefFinder; + + @SuppressWarnings("unchecked") + @Activate + protected void activate(ComponentContext ctx) throws ConfigurationException { + super.activate(ctx); + + Dictionary<String,Object> config = ctx.getProperties(); + + /* Step 1 - initialize the {@link NounPhraseFilterer} with the language config */ + String languages = (String) config.get(CONFIG_LANGUAGES); + + if (languages == null || languages.isEmpty()) { + throw new ConfigurationException(CONFIG_LANGUAGES, + "The Languages Config is a required Parameter and MUST NOT be NULL or an empty String!"); + } + + nounPhraseFilterer = new NounPhraseFilterer(languages.split(",")); + + /* Step 2 - initialize the {@link CoreferenceFinder} */ + String referencedSiteID = null; + Object referencedSiteIDfromConfig = config.get(REFERENCED_SITE_ID); + + if (referencedSiteIDfromConfig == null) { + throw new ConfigurationException(REFERENCED_SITE_ID, + "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!"); + } + + referencedSiteID = referencedSiteIDfromConfig.toString(); + if (referencedSiteID.isEmpty()) { + throw new ConfigurationException(REFERENCED_SITE_ID, + "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!"); + } + + if (Entityhub.ENTITYHUB_IDS.contains(referencedSiteID.toLowerCase())) { + log.debug("Init NamedEntityTaggingEngine instance for the Entityhub"); + referencedSiteID = null; + } + + int maxDistance; + Object maxDistanceFromConfig = config.get(MAX_DISTANCE); + + if (maxDistanceFromConfig == null) { + maxDistance = Constants.MAX_DISTANCE_DEFAULT_VALUE; + } else if (maxDistanceFromConfig instanceof Number) { + maxDistance = ((Number) maxDistanceFromConfig).intValue(); + } else { + try { + maxDistance = Integer.parseInt(maxDistanceFromConfig.toString()); + } catch (NumberFormatException nfe) { + throw new ConfigurationException(MAX_DISTANCE, "The Max Distance parameter must be a number"); + } + } + + if (maxDistance 
< -1) { + throw new ConfigurationException(MAX_DISTANCE, + "The Max Distance parameter must not be smaller than -1"); + } + + String entityUriBase = (String) config.get(ENTITY_URI_BASE); + if (entityUriBase == null || entityUriBase.isEmpty()) { + throw new ConfigurationException(ENTITY_URI_BASE, "The Entity Uri Base parameter cannot be empty"); + } + + String spatialAttrForPerson = (String) config.get(SPATIAL_ATTR_FOR_PERSON); + String spatialAttrForOrg = (String) config.get(SPATIAL_ATTR_FOR_ORGANIZATION); + String spatialAttrForPlace = (String) config.get(SPATIAL_ATTR_FOR_PLACE); + String orgAttrForPerson = (String) config.get(ORG_ATTR_FOR_PERSON); + String entityClassesToExclude = (String) config.get(ENTITY_CLASSES_TO_EXCLUDE); + + corefFinder = new CoreferenceFinder(languages.split(","), siteManager, entityhub, referencedSiteID, + maxDistance, entityUriBase, spatialAttrForPerson, spatialAttrForOrg, + spatialAttrForPlace, orgAttrForPerson, entityClassesToExclude); + + log.info("activate {}[name:{}]", getClass().getSimpleName(), getName()); + } + + @Override + public Map<String,Object> getServiceProperties() { + return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, + (Object) ENGINE_ORDERING)); + } + + @Override + public int canEnhance(ContentItem ci) throws EngineException { + String language = getLanguage(this, ci, false); + if (language == null) { + log.debug("Engine {} ignores ContentItem {} becuase language {} is not detected.", + new Object[] {getName(), ci.getUri(), language}); + return CANNOT_ENHANCE; + } + + if (!nounPhraseFilterer.supportsLanguage(language)) { + log.debug("Engine {} does not support language {}.", new Object[] {getName(), language}); + return CANNOT_ENHANCE; + } + + return ENHANCE_SYNCHRONOUS; + } + + @Override + public void computeEnhancements(ContentItem ci) throws EngineException { + /* + * Step 1 - Build the NER list and the noun phrase list. + * + * TODO - the noun phrases need to be lemmatized. 
+ */ + Map<Integer,List<Span>> ners = new HashMap<Integer,List<Span>>(); + List<NounPhrase> nounPhrases = new ArrayList<NounPhrase>(); + extractNersAndNounPhrases(ci, ners, nounPhrases); + + /* + * If there are no NERs to reference there's nothing to do but exit. + */ + if (ners.size() == 0) { + log.info("Did not find any NERs for which to do the coreferencing"); + return; + } + + /* + * Step 2 - Filter out bad noun phrases. + */ + String language = getLanguage(this, ci, false); + if (language == null) { + log.info("Could not detect the language of the text"); + return; + } + + nounPhraseFilterer.filter(nounPhrases, language); + + /* + * If there are no good noun phrases there's nothing to do but exit. + */ + if (nounPhrases.size() == 0) { + log.info("Did not find any noun phrases with which to do the coreferencing"); + return; + } + + /* + * Step 3 - Extract corefs and write them as {@link NlpAnnotations.COREF_ANNOTATION}s in the {@link + * Span}s + */ + corefFinder.extractCorefs(ners, nounPhrases, language); + } + + @Deactivate + protected void deactivate(ComponentContext ctx) { + log.info("deactivate {}[name:{}]", getClass().getSimpleName(), getName()); + + nounPhraseFilterer = null; + corefFinder = null; + + super.deactivate(ctx); + } + + /** + * Extracts the NERs and the noun phrases from the given text and puts them in the given lists. + * + * @param ci + * @param ners + * @param nounPhrases + */ + private void extractNersAndNounPhrases(ContentItem ci, + Map<Integer,List<Span>> ners, + List<NounPhrase> nounPhrases) { + AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true); + Iterator<? 
extends Section> sections = at.getSentences(); + if (!sections.hasNext()) { // process as single sentence + sections = Collections.singleton(at).iterator(); + } + + int sentenceCnt = 0; + while (sections.hasNext()) { + sentenceCnt++; + Section section = sections.next(); + List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>(); + List<Span> sectionNers = new ArrayList<Span>(); + + Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk)); + while (chunks.hasNext()) { + Span chunk = chunks.next(); + + Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION); + if (ner != null) { + sectionNers.add(chunk); + } + + Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION); + if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) { + sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt)); + } + } + + for (NounPhrase nounPhrase : sectionNounPhrases) { + Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token)); + + while (tokens.hasNext()) { + Span token = tokens.next(); + + if (nounPhrase.containsSpan(token)) { + nounPhrase.addToken(token); + } + } + + for (Span sectionNer : sectionNers) { + if (nounPhrase.containsSpan(sectionNer)) { + nounPhrase.addNerChunk(sectionNer); + } + } + } + + nounPhrases.addAll(sectionNounPhrases); + + if (!sectionNers.isEmpty()) { + ners.put(sentenceCnt, sectionNers); + } + } + } +}
Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.datamodel; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.stanbol.enhancer.nlp.model.Span; + +/** + * Encapsulates span and sentence information about a noun phrase. + * + * @author Cristian Petroaca + * + */ +public class NounPhrase { + /** + * The {@link Span} which represents this noun phrase. 
+ */ + private Span chunk; + + /* + * TODO - should use Set instead? + */ + /** + * The {@link Span}s - tokens - which make up this noun phrase. + */ + private List<Span> tokens; + + /** + * The {@link Span}s contained in this noun phrase which represent Ners. + */ + private List<Span> nerChunks; + + /** + * The sentence index in which this noun phrase is found. + */ + private int sentenceNo; + + public NounPhrase(Span chunk, int sentenceNo) { + if (chunk == null) { + throw new IllegalArgumentException("Chunk cannot be null"); + } + + this.chunk = chunk; + this.tokens = new ArrayList<Span>(); + this.nerChunks = new ArrayList<Span>(); + this.sentenceNo = sentenceNo; + } + + /** + * Gets the chunk representing this noun phrase. + * + * @return + */ + public Span getChunk() { + return chunk; + } + + /** + * Adds a new token which is found in this noun phrase. + * + * @param token + */ + public void addToken(Span token) { + /* + * TODO - validate token boundaries within this noun phrase. + */ + tokens.add(token); + } + + /** + * Gets the list of tokens which make up this noun phrase. + * + * @return + */ + public List<Span> getTokens() { + return tokens; + } + + /** + * Adds a new NER chunk which is found within this noun phrase. + * + * @param chunk + */ + public void addNerChunk(Span chunk) { + /* + * TODO - validate NER boundaries within this noun phrase. + */ + nerChunks.add(chunk); + } + + /** + * Gets the list of NERs within this noun phrase. + * + * @return + */ + public List<Span> getNerChunks() { + return nerChunks; + } + + /** + * Determines whether this noun phrase's {@link Span} contains the given {@link Span}. + * + * @param span + * @return + */ + public boolean containsSpan(Span span) { + return (span.getStart() >= chunk.getStart() && span.getEnd() <= chunk.getEnd()); + } + + /** + * Determines whether this noun phrase has NERs. 
+ * + * @return + */ + public boolean hasNers() { + return nerChunks.size() > 0; + } + + /** + * Returns the sentence index in which this noun phrase is found. + * + * @return + */ + public int getSentenceNo() { + return this.sentenceNo; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + chunk.hashCode(); + result = prime * result + tokens.hashCode(); + result = prime * result + nerChunks.hashCode(); + + return result; + } + + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + + NounPhrase other = (NounPhrase) obj; + + return chunk.equals(other.chunk) && tokens.equals(other.tokens) && nerChunks.equals(other.nerChunks) + && sentenceNo == other.sentenceNo; + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.datamodel; + +import org.apache.clerezza.rdf.core.UriRef; + +/** + * Represents a place adjectival inside a {@link Span}. + * + * @author Cristian Petroaca + * + */ +public class PlaceAdjectival { + /** + * The start index in the {@link Span}. + */ + private int startIdx; + + /** + * The end index in the {@link Span}. + */ + private int endIdx; + + /** + * The {@link UriRef} in the {@link SiteManager} or {@link Entityhub} that this place adjectival points + * to. 
+ */ + private UriRef placeUri; + + public PlaceAdjectival(int startIdx, int endIdx, UriRef placeUri) { + this.startIdx = startIdx; + this.endIdx = endIdx; + this.placeUri = placeUri; + } + + public UriRef getPlaceUri() { + return placeUri; + } + + public int getStart() { + return this.startIdx; + } + + public int getEnd() { + return this.endIdx; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + startIdx; + result = prime * result + endIdx; + result = prime * result + placeUri.hashCode(); + + return result; + } + + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + + PlaceAdjectival other = (PlaceAdjectival) obj; + + return this.startIdx == other.startIdx && this.endIdx == other.endIdx + && this.placeUri.equals(other.placeUri); + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,404 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.impl; + +import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.COREF_ANNOTATION; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.stanbol.enhancer.engines.entitycoreference.Constants; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival; +import org.apache.stanbol.enhancer.nlp.NlpAnnotations; +import org.apache.stanbol.enhancer.nlp.coref.CorefFeature; +import org.apache.stanbol.enhancer.nlp.model.Span; +import org.apache.stanbol.enhancer.nlp.model.annotation.Value; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; +import org.apache.stanbol.entityhub.servicesapi.Entityhub; +import org.apache.stanbol.entityhub.servicesapi.model.Entity; +import org.apache.stanbol.entityhub.servicesapi.model.Text; +import org.apache.stanbol.entityhub.servicesapi.query.Constraint; +import 
org.apache.stanbol.entityhub.servicesapi.query.FieldQuery; +import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory; +import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList; +import org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint; +import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint; +import org.apache.stanbol.entityhub.servicesapi.site.Site; +import org.apache.stanbol.entityhub.servicesapi.site.SiteManager; +import org.osgi.service.cm.ConfigurationException; + +/** + * Uses the list of NERs and the list of {@link NounPhrase}s found in the analyzed text to find possible + * co-references. + * + * @author Cristian Petroaca + * + */ +public class CoreferenceFinder { + /** + * The configured {@link SiteManager} for {@link Entity} storage. + */ + private SiteManager siteManager; + + /** + * The default {@link Entity} storage. + */ + private Entityhub entityHub; + + /** + * The name of the configured site for the {@link SiteManager}. + */ + private String referencedSiteID; + + /** + * In memory cache storing {@link Entity} types which are often used. + */ + private InMemoryEntityTypeIndex entityTypeIndex; + + /** + * Class holding configuration params. + */ + private CoreferenceFinderConfig config; + + /** + * Holds vocabulary.dictionary info such as the list of place adjectivals by language. 
+ */ + private Dictionaries dictionaries; + + public CoreferenceFinder(String[] languages, + SiteManager siteManager, + Entityhub entityHub, + String referencedSiteID, + int maxDistance, + String entityUriBase, + String spatialAttrForPerson, + String spatialAttrForOrg, + String spatialAttrForPlace, + String orgAttributesForPerson, + String entityClassesToExclude) throws ConfigurationException { + this.siteManager = siteManager; + this.entityHub = entityHub; + this.referencedSiteID = referencedSiteID; + this.entityTypeIndex = new InMemoryEntityTypeIndex(); + this.config = new CoreferenceFinderConfig(maxDistance, spatialAttrForPerson, + spatialAttrForOrg, spatialAttrForPlace, orgAttributesForPerson, entityClassesToExclude); + this.dictionaries = new Dictionaries(languages, entityUriBase); + } + + /** + * Performs the actual coreference resolution by iterating through all the NERs and all the + * {@link NounPhrase}s which are after the given Ner in the text. If any coreferences are found they are + * written as {@link NlpAnnotation}s in the NER and noun phrase {@link Span}s. 
+ * + * @param ners + * @param nounPhrases + * @param language + * @throws EngineException + */ + public void extractCorefs(Map<Integer,List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException { + for (Map.Entry<Integer,List<Span>> entry : ners.entrySet()) { + int nerSentenceNo = entry.getKey(); + List<Span> nerSpans = entry.getValue(); + int maxDistance = this.config.getMaxDistance(); + + for (Span ner : nerSpans) { + Entity entity = null; + Set<String> typeLabels = null; + Set<Span> corefs = new HashSet<Span>(); + + for (NounPhrase nounPhrase : nounPhrases) { + int nounPhraseSentenceNo = nounPhrase.getSentenceNo(); + + if (nounPhrase.getChunk().getStart() > ner.getStart() + && (maxDistance != Constants.MAX_DISTANCE_NO_CONSTRAINT + && nounPhraseSentenceNo > nerSentenceNo && nounPhraseSentenceNo - nerSentenceNo <= maxDistance)) { + + if (entity == null) { + entity = lookupEntity(ner, language); + + /* + * If the entity is still null there's nothing to do but go to the next ner. + */ + if (entity == null) break; + + if (typeLabels == null) { + typeLabels = buildEntityTypeLabels(entity, language); + } + } + + if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) { + Set<Span> coreferencedNer = new HashSet<Span>(); + coreferencedNer.add(ner); + Span chunk = nounPhrase.getChunk(); + + chunk.addAnnotation(COREF_ANNOTATION, + Value.value(new CorefFeature(false, coreferencedNer))); + corefs.add(chunk); + } + } + } + + if (corefs.size() > 0) { + ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs))); + } + } + } + } + + /** + * Gets an Entity from the configured {@link Site} based on the NER text and type. + * + * @param ner + * @param language + * @return + * @throws EngineException + */ + private Entity lookupEntity(Span ner, String language) throws EngineException { + Site site = getReferencedSite(); + FieldQueryFactory queryFactory = site == null ? 
entityHub.getQueryFactory() : site.getQueryFactory();
+        FieldQuery query = queryFactory.createFieldQuery();
+
+        Constraint labelConstraint;
+        String namedEntityLabel = ner.getSpan();
+        labelConstraint = new TextConstraint(namedEntityLabel, false, language, null);
+        query.setConstraint(RDFS_LABEL.getUnicodeString(), labelConstraint);
+        query.setConstraint(RDF_TYPE.getUnicodeString(),
+            new ReferenceConstraint(ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType()
+                    .getUnicodeString()));
+        query.setLimit(1);
+        QueryResultList<Entity> results = site == null ? // if site is NULL
+        entityHub.findEntities(query)
+                : // use the Entityhub
+                site.findEntities(query); // else the referenced site
+
+        if (results.isEmpty()) return null;
+
+        // We set the limit to 1 so if it found anything it should contain just 1 entry
+        return results.iterator().next();
+    }
+
+    /**
+     * Performs the coreference matching rules: 1. Match the entity type. 2. If the {@link NounPhrase}
+     * contains any NERs match the NER to any spatial/org membership/functional Entity properties from the
+     * {@link Site}. 3. If {@link NounPhrase} contains any place adjectivals perform spatial co-reference
+     * based on the entity spatial properties.
+     * <p>
+     * Rule 1 is mandatory: if no type label occurs as a whole word in the lower-cased noun phrase text
+     * the result is false. When neither rule 2 nor rule 3 confirms the coreference, the noun phrase is
+     * still accepted if the matched type label consists of more than one word.
+     *
+     * @param typeLabels
+     *            - a list of types (classes) that the given entity has.
+     * @param entity
+     *            - the entity for which we want to do the coref.
+     * @param ner
+     *            - the ner in the text for which we want to do the coref.
+     * @param nounPhrase
+     *            - the {@link NounPhrase} which we want to test for coref.
+     * @param language
+     *            - the language of the text.
+     * @return true if the noun phrase is considered a coreference of the given ner.
+     * @throws EngineException
+     */
+    private boolean isCoreferent(Set<String> typeLabels,
+                                 Entity entity,
+                                 Span ner,
+                                 NounPhrase nounPhrase,
+                                 String language) throws EngineException {
+        /*
+         * 1. Try to match the entity class to the noun phrase.
+         */
+        String matchedClass = null;
+        String nounPhraseText = nounPhrase.getChunk().getSpan().toLowerCase();
+        int classStart = 0;
+        int classEnd = 0;
+
+        for (String label : typeLabels) {
+            /*
+             * The label is data, not a pattern: quote it so that characters such as '(', '+' or '.' are
+             * matched literally instead of being interpreted as regex syntax (which could also throw a
+             * PatternSyntaxException). The longest label, counted in words, wins.
+             */
+            if (nounPhraseText.matches(".*\\b" + java.util.regex.Pattern.quote(label) + "\\b.*")
+                && (matchedClass == null || label.split("\\s").length > matchedClass.split("\\s").length)) {
+                matchedClass = label;
+                classStart = nounPhrase.getChunk().getStart() + nounPhraseText.indexOf(label);
+                classEnd = classStart + label.length();
+            }
+        }
+
+        if (matchedClass == null) return false;
+
+        /*
+         * 2. See if there are any NERs in the noun phrase to further identify the coref. Any NERs found
+         * should be separate words from the class matches from point 1.
+         */
+        /*
+         * TODO - devise a coref confidence scheme?
+         */
+        if (nounPhrase.hasNers()) {
+            List<Span> npNers = nounPhrase.getNerChunks();
+            UriRef nerType = ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
+
+            for (Span npNer : npNers) {
+                /*
+                 * Don't go any further if for some reason it turns out that the ner text is the same as the
+                 * entity class text.
+                 */
+                if ((npNer.getStart() >= classStart && npNer.getStart() <= classEnd)
+                    || (npNer.getEnd() >= classStart && npNer.getEnd() <= classEnd)) continue;
+
+                Entity npEntity = lookupEntity(npNer, language);
+
+                if (npEntity != null) {
+                    UriRef npNerType = npNer.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
+                    Set<String> rulesOntologyAttr = new HashSet<String>();
+
+                    if (OntologicalClasses.DBPEDIA_PLACE.equals(npNerType)) {
+                        rulesOntologyAttr = this.config.getSpatialAttributes(nerType);
+                    } else if (OntologicalClasses.DBPEDIA_ORGANISATION.equals(npNerType)) {
+                        rulesOntologyAttr = this.config.getOrgMembershipAttributes(nerType);
+                    }
+
+                    if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, npEntity.getId())) {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        /*
+         * 3. Detect any place adjectivals in noun phrases and use them for spatial coreference. Any place
+         * adjectivals found should be separate words from the class matches from point 1.
+         */
+        PlaceAdjectival placeAdjectival = this.dictionaries.findPlaceAdjectival(language, nounPhrase);
+
+        if (placeAdjectival != null
+            && (placeAdjectival.getEnd() < classStart || placeAdjectival.getStart() > classEnd)) {
+            /*
+             * We use the same spatial rules ontology attributes as before.
+             */
+            Set<String> rulesOntologyAttr = this.config.getSpatialAttributes(ner
+                    .getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType());
+
+            if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, placeAdjectival.getPlaceUri()
+                    .getUnicodeString())) {
+                return true;
+            }
+        }
+
+        /*
+         * If there was no additional info to do the coref and if the entity class matched and has more than 1
+         * word then we consider this a good enough coreference.
+         */
+        if (matchedClass.split("\\s").length > 1) return true;
+
+        return false;
+    }
+
+    /**
+     * Builds a Set of Entity Type labels given the Entity type uris.
+     *
+     * @param entity
+     * @param language
+     * @return
+     * @throws EngineException
+     */
+    private Set<String> buildEntityTypeLabels(Entity entity, String language) throws EngineException {
+        Iterator<Object> typeUris = entity.getRepresentation().get(RDF_TYPE.getUnicodeString());
+        Set<String> allTypeLabels = new HashSet<String>();
+
+        while (typeUris.hasNext()) {
+            String typeUri = typeUris.next().toString();
+
+            if (this.config.shouldExcludeClass(typeUri)) continue;
+
+            // First try the in memory index
+            Set<String> labels = this.entityTypeIndex.lookupEntityType(new UriRef(typeUri), language);
+
+            if (labels == null) {
+                Site site = getReferencedSite();
+                Entity entityType = (site == null) ?
this.entityHub.getEntity(typeUri) : site + .getEntity(typeUri); + + if (entityType != null) { + labels = new HashSet<String>(); + Iterator<Text> labelIterator = entityType.getRepresentation().get( + RDFS_LABEL.getUnicodeString(), language); + + while (labelIterator.hasNext()) { + labels.add(labelIterator.next().getText()); + } + + this.entityTypeIndex.addEntityType(new UriRef(typeUri), language, labels); + } + } + + if (labels != null) allTypeLabels.addAll(labels); + } + + return allTypeLabels; + } + + /** + * Checks whether any of the attributes in rulesOntologyAttr from the given Entity contain the given + * value. + * + * @param rulesOntologyAttr + * @param entity + * @param value + * @return + */ + private boolean valueExistsInEntityAttributes(Set<String> rulesOntologyAttr, Entity entity, String value) { + for (String attribute : rulesOntologyAttr) { + Iterator<Object> entityAttributes = entity.getRepresentation().get(attribute); + + while (entityAttributes.hasNext()) { + Object entityAttribute = entityAttributes.next(); + + if (entityAttribute.toString().equals(value)) { + return true; + } + } + } + + return false; + } + + /** + * Retrieves the configured {@link Site} which holds the NER properties. 
+ * + * @return + * @throws EngineException + */ + private Site getReferencedSite() throws EngineException { + Site site = null; + + if (referencedSiteID != null) { // lookup the referenced site + site = siteManager.getSite(referencedSiteID); + // ensure that it is present + if (site == null) { + String msg = String + .format("Unable to enhance because Referenced Site %s is currently not active!", + referencedSiteID); + + throw new EngineException(msg); + } + } + + return site; + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.impl; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; +import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; +import org.osgi.service.cm.ConfigurationException; + +/** + * Contains configuration parameters for the {@link CoreferenceFinder}. + * + * @author Cristian Petroaca + * + */ +public class CoreferenceFinderConfig { + /** + * The maximum distance (in sentence numbers) between a NER and a {@link NounPhrase} for which we look for + * a coreference. + */ + private int maxDistance; + + /** + * The Uris for spatial properties for the NER to be inspected when doing the coref spatial match. + */ + private Map<UriRef,Set<String>> spatialAttributes; + + /** + * The Uris for org membership properties for the NER to be inspected when doing the coref match. + */ + private Map<UriRef,Set<String>> orgMembershipAttributes; + + /** + * Entity classes which will not be used for coreference because they are too general. 
+ */ + private Set<String> entityClassesToExclude; + + public CoreferenceFinderConfig(int maxDistance, + String spatialAttrForPerson, + String spatialAttrForOrg, + String spatialAttrForPlace, + String orgAttrForPerson, + String entityClassesToExclude) throws ConfigurationException { + this.maxDistance = maxDistance; + + this.spatialAttributes = new HashMap<UriRef,Set<String>>(); + this.orgMembershipAttributes = new HashMap<UriRef, Set<String>>(); + + if (spatialAttrForPerson != null) { + Set<String> attributes = new HashSet<String>(); + for (String attribute : spatialAttrForPerson.split(",")) { + attributes.add(attribute); + } + this.spatialAttributes.put(OntologicalClasses.DBPEDIA_PERSON, attributes); + } + + if (spatialAttrForOrg != null) { + Set<String> attributes = new HashSet<String>(); + for (String attribute : spatialAttrForOrg.split(",")) { + attributes.add(attribute); + } + this.spatialAttributes.put(OntologicalClasses.DBPEDIA_ORGANISATION, attributes); + } + + + if (spatialAttrForPlace != null) { + Set<String> attributes = new HashSet<String>(); + for (String attribute : spatialAttrForPlace.split(",")) { + attributes.add(attribute); + } + this.spatialAttributes.put(OntologicalClasses.DBPEDIA_PLACE, attributes); + } + + if (orgAttrForPerson != null) { + Set<String> attributes = new HashSet<String>(); + for (String attribute : orgAttrForPerson.split(",")) { + attributes.add(attribute); + } + + this.orgMembershipAttributes.put(OntologicalClasses.DBPEDIA_PERSON, attributes); + } + + if (entityClassesToExclude != null) { + this.entityClassesToExclude = new HashSet<String>(); + + for (String clazz : entityClassesToExclude.split(",")) { + this.entityClassesToExclude.add(clazz); + } + } + } + + /** + * Gets the max distance parameter. + * + * @return + */ + public int getMaxDistance() { + return maxDistance; + } + + /** + * Gets the URIs for the spatial properties for a given Entity Type. 
+ * + * @param uri + * of the Entity type for which we want to get the ontology. + * @return + */ + public Set<String> getSpatialAttributes(UriRef uri) { + return this.spatialAttributes.get(uri); + } + + /** + * Gets the URIs for the org membership properties for a given Entity Type. + * + * @param uri + * of the Entity type for which we want to get the ontology. + * @return + */ + public Set<String> getOrgMembershipAttributes(UriRef uri) { + return this.orgMembershipAttributes.get(uri); + } + + /** + * Checks whether we should exclude the given class based on our config. + * + * @param clazz + * @return + */ + public boolean shouldExcludeClass(String clazz) { + return this.entityClassesToExclude.contains(clazz); + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.impl; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.stanbol.enhancer.engines.entitycoreference.Constants; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival; +import org.apache.stanbol.enhancer.nlp.model.Span; +import org.osgi.service.cm.ConfigurationException; + +/** + * Contains information about several terms and properties of words we use in the {@link CoreferenceFinder}. + * + * @author Cristian Petroaca + * + */ +class Dictionaries { + /** + * Contains the list of place adjectivals in the form: language -> adjectival -> UriRef -> adjectival -> + * UriRef There are Places that have multiple adjectivals so in this map there are adjectivals that point + * to the same UriRef but that ensures a fast lookup. 
+ */ + private Map<String,Map<String,UriRef>> placeAdjectivalsMap; + + public Dictionaries(String[] languages, String entityUriBase) throws ConfigurationException { + placeAdjectivalsMap = new HashMap<>(); + + for (String language : languages) { + String line = null; + Map<String,UriRef> languagePlaceAdjMap = new HashMap<>(); + InputStream langIn = null; + BufferedReader reader = null; + + try { + langIn = Dictionaries.class.getResourceAsStream(Constants.PLACE_ADJECTIVALS_FOLDER + "/" + + language); + reader = new BufferedReader(new InputStreamReader(langIn)); + + while ((line = reader.readLine()) != null) { + String[] splittedLine = line.split("\t"); + String place = splittedLine[0]; + String adjectivals = splittedLine[1]; + UriRef ref = new UriRef(entityUriBase + place.trim()); + String[] adjectivalsArray = adjectivals.split(","); + + for (String adjectival : adjectivalsArray) { + languagePlaceAdjMap.put(adjectival.trim().toLowerCase(), ref); + } + } + + placeAdjectivalsMap.put(language, languagePlaceAdjMap); + } catch (IOException ioe) { + throw new ConfigurationException("", "Could not read " + Constants.PLACE_ADJECTIVALS_FOLDER + + "/" + language, ioe); + } finally { + if (langIn != null) { + try { + langIn.close(); + } catch (IOException e) {} + } + + if (reader != null) { + try { + reader.close(); + } catch (IOException e) {} + } + } + } + } + + /** + * Checks whether a {@link NounPhrase} contains a place adjectival and returns it. + * + * @param language + * @param nounPhrase + * @return the {@link PlaceAdjectival} if the {@link NounPhrase} contains one or null if not. + */ + public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) { + List<Span> tokens = nounPhrase.getTokens(); + Map<String,UriRef> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language); + /* + * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map. 
2-grams + * should be good enough since there are no 3-gram places at least from what I saw. + */ + for (int i = 0; i < tokens.size(); i++) { + Span currentToken = tokens.get(i); + String currentTokenString = currentToken.getSpan().toLowerCase(); + // First the current 1-gram + if (langPlaceAdjectivalsMap.containsKey(currentTokenString)) { + return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), + langPlaceAdjectivalsMap.get(currentTokenString)); + } + + // Then use the 2-gram with the token before it + StringBuilder concatTokens = new StringBuilder(); + String concatTokensString = null; + + if (i > 0) { + Span previousToken = tokens.get(i - 1); + String previousTokenString = previousToken.getSpan().toLowerCase(); + concatTokens = new StringBuilder(); + concatTokens.append(previousTokenString); + concatTokens.append(" "); + concatTokens.append(currentTokenString); + concatTokensString = concatTokens.toString(); + + if (langPlaceAdjectivalsMap.containsKey(concatTokensString.toLowerCase())) { + return new PlaceAdjectival(previousToken.getStart(), currentToken.getEnd(), + langPlaceAdjectivalsMap.get(concatTokensString)); + } + } + + // Now use the 2-gram with the token after it + if (i < tokens.size() - 1) { + Span nextToken = tokens.get(i + 1); + String nextTokenString = nextToken.getSpan().toLowerCase(); + concatTokens = new StringBuilder(); + concatTokens.append(currentTokenString); + concatTokens.append(" "); + concatTokens.append(nextTokenString); + + concatTokensString = concatTokens.toString(); + + if (langPlaceAdjectivalsMap.containsKey(concatTokens.toString())) { + return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), + langPlaceAdjectivalsMap.get(concatTokensString)); + } + } + } + + return null; + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.impl; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.clerezza.rdf.core.UriRef; + +/** + * Memory cache for storing often used Entity Type (Class) information. + * + * @author Cristian Petroaca + * + */ +public class InMemoryEntityTypeIndex { + /** + * The index having as key the Uri of the class and the value the set of labels ordered by language. 
+ */ + private Map<UriRef,Map<String,Set<String>>> index; + + public InMemoryEntityTypeIndex() { + index = new HashMap<UriRef,Map<String,Set<String>>>(); + } + + /** + * Searches for a given class URI for the given language. + * + * @param uri + * @param language + * @return + */ + public Set<String> lookupEntityType(UriRef uri, String language) { + Map<String,Set<String>> langMap = index.get(uri); + + if (langMap != null) { + return langMap.get(language); + } + + return null; + } + + /** + * Adds a new class URI's labels for the given language. + * + * @param uri + * @param language + * @param labels + */ + public void addEntityType(UriRef uri, String language, Set<String> labels) { + Map<String,Set<String>> langMap = index.get(uri); + + if (langMap == null) { + langMap = new HashMap<String,Set<String>>(); + index.put(uri, langMap); + } + + langMap.put(language, labels); + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java Wed Jul 22 18:58:38 2015 @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.entitycoreference.impl; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import org.apache.stanbol.enhancer.engines.entitycoreference.Constants; +import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; +import org.apache.stanbol.enhancer.nlp.NlpAnnotations; +import org.apache.stanbol.enhancer.nlp.model.Span; +import org.apache.stanbol.enhancer.nlp.model.annotation.Value; +import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; +import org.apache.stanbol.enhancer.nlp.pos.Pos; +import org.apache.stanbol.enhancer.nlp.pos.PosTag; +import org.osgi.service.cm.ConfigurationException; + +/** + * Filters out bad {@link NounPhrase}s based on pos information. + * + * @author Cristian Petroaca + * + */ +/* + * TODO - create a NounPhraseFilterer interface with multiple implementations to separate languages with + * appositional definite article from the others. 
+ */ +public class NounPhraseFilterer { + private final static String WITHIN_TEXT_DET_PROP = "within.text.referencing.determiners"; + private final static short MIN_POS_NUMBER = 2; + + /** + * Set of determiners based on language which make a {@link NounPhrase} valid for being a coref mention. + */ + private Map<String,Set<String>> withinTextRefDeterminers; + + public NounPhraseFilterer(String[] languages) throws ConfigurationException { + withinTextRefDeterminers = new HashMap<String,Set<String>>(); + + for (String language : languages) { + Properties props = new Properties(); + String propertiesFile = Constants.POS_CONFIG_FOLDER + "/" + language + ".properties"; + InputStream in = null; + + try { + in = NounPhraseFilterer.class.getResourceAsStream(propertiesFile); + props.load(in); + } catch (IOException e) { + throw new ConfigurationException("", "Could not read " + propertiesFile); + } finally { + if (in != null) { + try { + in.close(); + } catch (IOException e) {} + } + } + + String determinersProperty = props.getProperty(WITHIN_TEXT_DET_PROP); + + if (determinersProperty == null) { + throw new ConfigurationException(WITHIN_TEXT_DET_PROP, "Missing property in " + + propertiesFile); + } + + Set<String> langDeterminerSet = new HashSet<String>(); + for (String determiner : determinersProperty.split(",")) { + langDeterminerSet.add(determiner); + } + + withinTextRefDeterminers.put(language, langDeterminerSet); + } + } + + /** + * Filters out noun phrases which do not contain a determiner from the given config and do not a token + * count bigger than 2 - TODO : should this be configurable to be able to also include 1 word noun + * phrases? 
+ * + * @param nounPhrases + * @param language + */ + public void filter(List<NounPhrase> nounPhrases, String language) { + Set<String> langDeterminerSet = withinTextRefDeterminers.get(language); + Iterator<NounPhrase> it = nounPhrases.iterator(); + + while (it.hasNext()) { + NounPhrase nounPhrase = it.next(); + boolean hasGoodDeterminer = false; + short nounNo = 0; + + for (Span token : nounPhrase.getTokens()) { + Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION); + + if (pos != null) { + PosTag posTag = pos.value(); + + if (posTag.hasCategory(LexicalCategory.Noun) + || posTag.hasCategory(LexicalCategory.Adjective)) { + nounNo++; + } + + if (!hasGoodDeterminer && posTag.hasPos(Pos.Determiner) + && langDeterminerSet.contains(token.getSpan().toLowerCase())) { + hasGoodDeterminer = true; + } + } + } + + if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) { + it.remove(); + } + } + } + + public boolean supportsLanguage(String language) { + return withinTextRefDeterminers.containsKey(language); + } +} Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Jul 22 18:58:38 2015 @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +stanbol.enhancer.engine.name.name=Name +stanbol.enhancer.engine.name.description=The name of the enhancement engine as \ +used in the RESTful interface '/engine/<name>' + +service.ranking.name=Ranking +service.ranking.description=If two enhancement engines with the same name are active the \ +one with the higher ranking will be used to process parsed content items. + + +#=============================================================================== +#Properties and Options used to configure +#=============================================================================== +org.apache.stanbol.enhancer.engines.entitycoreference.EntityCoReferenceEngine.name=Apache \ +Stanbol Enhancer Engine: Entity Co-Reference +org.apache.stanbol.enhancer.engines.entitycoreference.EntityCoReferenceEngine.description=An Engine that finds \ +co-references of Named Entities based on dbpedia/yago concepts. + +enhancer.engine.entitycoreference.languages.name=Language configuration +enhancer.engine.entitycoreference.languages.description=Takes a list of ISO \ + language codes. '*' is the Wildcard; '!{lang}' to exclude a language +enhancer.engine.entitycoreference.referencedSiteId.name=Referenced Site +enhancer.engine.entitycoreference.referencedSiteId.description=The ID of the \ +Entityhub Referenced Site holding the Entity Index. 
+enhancer.engine.entitycoreference.entity.uri.base.name=Entity URI base +enhancer.engine.entitycoreference.entity.uri.base.description=The base uri which \ +is used to represent an Entity. +enhancer.engine.entitycoreference.maxDistance.name=Max sentence distance +enhancer.engine.entitycoreference.maxDistance.description=The maximum sentence distance between the Ner \ +and the noun phrase which mentions it. -1 means no distance constraint. + +enhancer.engine.entitycoreference.spatial.attr.person.name=Spatial Attributes for Person +enhancer.engine.entitycoreference.spatial.attr.person.description=Attributes used for spatial \ +coreference when dealing with a person entity. +enhancer.engine.entitycoreference.spatial.attr.org.name=Spatial Attributes for Organization +enhancer.engine.entitycoreference.spatial.attr.org.description=Attributes used for spatial \ +coreference when dealing with an organization entity. +enhancer.engine.entitycoreference.spatial.attr.place.name=Spatial Attributes for Place +enhancer.engine.entitycoreference.spatial.attr.place.description=Attributes used for spatial \ +coreference when dealing with a place entity. +enhancer.engine.entitycoreference.org.attr.person.name=Organisational Membership Attributes for Person +enhancer.engine.entitycoreference.org.attr.person.description=Attributes used for organisational \ +membership coreference when dealing with a person entity. + +enhancer.engine.entitycoreference.entity.classes.excluded.name=Entity classes to be excluded +enhancer.engine.entitycoreference.entity.classes.excluded.description=Entity classes which will \ +be excluded when doing the entity class type matching because they are too general in nature. 
\ No newline at end of file Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/config/pos/en.properties URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/config/pos/en.properties?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/config/pos/en.properties (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/config/pos/en.properties Wed Jul 22 18:58:38 2015 @@ -0,0 +1,2 @@ +# Determiners of a noun phrase which determine that the noun phrase is a good candidate for coref. +within.text.referencing.determiners=the,this,these \ No newline at end of file Added: stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/data/place_adjectivals/en URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/data/place_adjectivals/en?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/data/place_adjectivals/en (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/main/resources/data/place_adjectivals/en Wed Jul 22 18:58:38 2015 @@ -0,0 +1,236 @@ +Abkhazia Abkhaz, Abkhazian +Afghanistan Afghan +Albania Albanian +Algeria Algerian +American_Samoa American Samoan +Andorra Andorran +Angola Angolan +Anguilla Anguillan +Antigua_and_Barbuda Antiguan, Barbudan +Argentina Argentine, Argentinean, Argentinian +Armenia Armenian +Aruba Aruban +Australia Australian +Austria Austrian +Azerbaijan Azerbaijani, Azeri +Bahamas Bahamian +Bahrain Bahraini +Bangladesh Bangladeshi +Barbados Barbadian +Belarus Belarusian +Belgium Belgian +Belize Belizean +Benin Beninese, Beninois +Bermuda Bermudian, Bermudan +Bhutan Bhutanese +Bolivia Bolivian +Bosnia_and_Herzegovina Bosnian, 
Bosniak, Herzegovinian +Botswana Motswana, Botswanan +Brazil Brazilian +British_Virgin_Islands British Virgin Island +Brunei Bruneian +Bulgaria Bulgarian +Burkina_Faso Burkinabè +Burma Burmese +Burundi Burundian +Cambodia Cambodian +Cameroon Cameroonian +Canada Canadian +Cape_Verde Cape Verdean +Cayman_Islands Caymanian +Central_African_Republic Central African +Chad Chadian +Chile Chilean +China Chinese +Christmas_Island Christmas Island +Cocos_Islands Cocos Island +Colombia Colombian +Comoros Comorian +Congo Congolese, Congo +Cook_Islands Cook Island, Cook Islands +Costa_Rica Costa Rican +Côte_d'Ivoire Ivorian +Croatia Croatian +Cuba Cuban +Cyprus Cypriot +Czech_Republic Czech +Denmark Danish +Djibouti Djiboutian +Dominica Dominican +Dominican_Republic Dominican +East_Timor Timorese +Ecuador Ecuadorian +Egypt Egyptian +El_Salvador Salvadoran +England English +Equatorial_Guinea Equatorial Guinean, Equatoguinean +Eritrea Eritrean +Estonia Estonian +Ethiopia Ethiopian +Falkland_Islands Falkland Island +Faroe_Islands Faroese +Fiji Fijian +Finland Finnish +France French +French_Guiana French Guianese +French_Polynesia French Polynesian +Gabon Gabonese +Gambia Gambian +Georgia Georgian +Germany German +Ghana Ghanaian +Gibraltar Gibraltar +Great_Britain British +Greece Greek, Grecian, Hellenic +Greenland Greenlandic +Grenada Grenadian +Guadeloupe Guadeloupe +Guam Guamanian, Guambat +Guatemala Guatemalan +Guinea Guinean +Guyana Guyanese +Haiti Haitian +Honduras Honduran +Hong_Kong Hong Kong, Hongkongese +Hungary Hungarian, Magyar +Iceland Icelandic +India Indian +Indonesia Indonesian +Iran Iranian, Persian +Iraq Iraqi +Ireland Irish +Isle_of_Man Manx +Israel Israeli +Italy Italian, Italic +Jamaica Jamaican +Japan Japanese +Jordan Jordanian +Kazakhstan Kazakh, Kazakhstani +Kenya Kenyan +Kiribati I-Kiribati +North_Korea North Korean +South_Korea South Korean +Kosovo Kosovar, Kosovan +Kuwait Kuwaiti +Kyrgyzstan Kyrgyzstani, Kyrgyz, Kirgiz, Kirghiz +Laos Laotian, Lao 
+Latvia Latvian +Lebanon Lebanese +Lesotho Basotho +Liberia Liberian +Libya Libyan +Liechtenstein Liechtenstein +Lithuania Lithuanian +Luxembourg Luxembourg, Luxembourgish +Macau Macanese, Chinese +Macedonia Macedonian +Madagascar Malagasy +Malawi Malawian +Malaysia Malaysian +Maldives Maldivian +Mali Malian +Malta Maltese +Marshall_Islands Marshallese +Martinique Martiniquais, Martinican +Mauritania Mauritanian +Mauritius Mauritian +Mayotte Mahoran +Mexico Mexican +Micronesia Micronesian +Moldova Moldovan +Monaco Monégasque, Monacan +Mongolia Mongolian +Montenegro Montenegrin +Montserrat Montserratian +Morocco Moroccan +Mozambique Mozambican +Namibia Namibian +Nauru Nauruan +Nepal Nepalese, Nepali +Netherlands Dutch, Netherlandic +New_Caledonia New Caledonian +New_Zealand New Zealand, NZ +Nicaragua Nicaraguan +Niue Niuean +Niger Nigerien +Nigeria Nigerian +Norway Norwegian +Northern_Ireland Northern Irish, Irish +Northern_Marianas Northern Marianan +Oman Omani +Pakistan Pakistani +Palestine Palestinian +Palau Palauan +Panama Panamanian +Papua_New_Guinea Papua New Guinean, Papuan +Paraguay Paraguayan +Peru Peruvian +Philippines Philippine, Filipino +Pitcairn_Island Pitcairn Island +Poland Polish +Portugal Portuguese +Puerto_Rico Puerto Rican +Qatar Qatari +Ireland Irish +Réunion Réunionese, Réunionnais +Romania Romanian +Russia Russian +Rwanda Rwandan +St._Helena St. Helenian +St._Kitts_and_Nevis Kittitian, Nevisian +St._Lucia St. Lucian +Saint-Pierre_and_Miquelon Saint-Pierrais, Miquelonnais +St._Vincent_and_the_Grenadines St. 
Vincentian, Vincentian +Samoa Samoan +San_Marino Sammarinese +São_Tomé_and_Príncipe São Toméan +Saudi_Arabia Saudi, Saudi Arabian +Scotland Scots, Scottish, Scotch +Senegal Senegalese +Serbia Serbian +Seychelles Seychellois +Sierra_Leone Sierra Leonean +Singapore Singaporean +Slovakia Slovak +Slovenia Slovenian, Slovene +Solomon_Islands Solomon Island +Somalia Somali, Somalian +South_Africa South African +South_Ossetia South Ossetian +South_Sudan South Sudanese +Spain Spanish +Sri_Lanka Sri Lankan +Sudan Sudanese +Surinam Surinamese +Swaziland Swazi +Sweden Swedish +Switzerland Swiss +Syria Syrian +Taiwan Taiwanese +Tajikistan Tajikistani +Tanzania Tanzanian +Thailand Thai +Togo Togolese +Tonga Tongan +Trinidad_and_Tobago Trinidadian, Tobagonian +Tunisia Tunisian +Turkey Turkish +Turkmenistan Turkmen +Tuvalu Tuvaluan +Uganda Ugandan +Ukraine Ukrainian +United_Arab_Emirates Emirati, Emirian +United_Kingdom British, UK +United_States American, US +Uruguay Uruguayan +Uzbekistan Uzbekistani, Uzbek +Vanuatu Ni-Vanuatu, Vanuatuan +Venezuela Venezuelan +Vietnam Vietnamese +Virgin_Islands Virgin Island +Wales Welsh +Wallis_and_Futuna Wallisian, Futunan +Western_Sahara Sahrawi, Sahrawian, Sahraouian +Yemen Yemeni +Zambia Zambian +Zimbabwe Zimbabwean \ No newline at end of file Added: stanbol/trunk/enhancement-engines/entitycoreference/src/test/resources/log4j.properties URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycoreference/src/test/resources/log4j.properties?rev=1692320&view=auto ============================================================================== --- stanbol/trunk/enhancement-engines/entitycoreference/src/test/resources/log4j.properties (added) +++ stanbol/trunk/enhancement-engines/entitycoreference/src/test/resources/log4j.properties Wed Jul 22 18:58:38 2015 @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n +log4j.logger.org.apache.stanbol.enhancer.engines.keywordextraction=DEBUG \ No newline at end of file Modified: stanbol/trunk/enhancement-engines/pom.xml URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pom.xml?rev=1692320&r1=1692319&r2=1692320&view=diff ============================================================================== --- stanbol/trunk/enhancement-engines/pom.xml (original) +++ stanbol/trunk/enhancement-engines/pom.xml Wed Jul 22 18:58:38 2015 @@ -114,9 +114,8 @@ <module>geonames</module> <!-- http://geonames.org --> <module>opencalais</module> <!-- http://opencalais.com/ --> <module>zemanta</module> <!-- http://zemanta.com --> - - <!-- The entity co-refernece engine is not yet in trunk --> - <!-- module>entitycoreference</module --> + + <module>entitycoreference</module> </modules> <build>
