Author: rwesten
Date: Tue Nov 6 15:00:40 2012
New Revision: 1406168
URL: http://svn.apache.org/viewvc?rev=1406168&view=rev
Log:
STANBOL-797: The OpenNLP NER engine can now make use of the AnalyzedText
ContentPart
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml?rev=1406168&r1=1406167&r2=1406168&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
Tue Nov 6 15:00:40 2012
@@ -87,6 +87,11 @@
<version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
<version>0.9.0-incubating</version>
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1406168&r1=1406167&r2=1406168&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
Tue Nov 6 15:00:40 2012
@@ -41,23 +41,24 @@ import opennlp.tools.namefind.NameFinder
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import org.apache.clerezza.rdf.core.Language;
-import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -66,8 +67,6 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
-import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -135,32 +134,45 @@ public abstract class NEREngineCore
+ "method! -> This indicated an Bug in the implementation of
the "
+ "EnhancementJobManager!");
}
- Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
SUPPORTED_MIMETYPES);
- if(contentPart == null){
- throw new IllegalStateException("No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text;
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- if (text.trim().length() == 0) {
- // TODO: make the length of the data a field of the ContentItem
- // interface to be able to filter out empty items in the canEnhance
- // method
- log.warn("ContentPart {} of ContentItem {} does not contain any
text" +
- "to extract knowledge from in ContentItem {}",
- contentPart.getKey(),ci);
- return;
+ final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+ //validate data in the AnalysedText
+ final String text;
+ if(at != null && at.getTokens().hasNext()){ //if the AnalysedText is
present and tokens are present
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements from AnalysedText ContentPart
of ContentItem {}: text={}",
+ ci.getUri().getUnicodeString(),
StringUtils.abbreviate(at.getSpan(), 100));
+ }
+ text = null;
+ } else { //no AnalysedText with tokens ...
+ //fallback to processing the plain text is still supported
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
SUPPORTED_MIMETYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with Mimetype
'"
+ + TEXT_PLAIN_MIMETYPE+"' found for ContentItem
"+ci.getUri()
+ + ": This is also checked in the canEnhance method! ->
This "
+ + "indicated an Bug in the implementation of the "
+ + "EnhancementJobManager!");
+ }
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ if (text.trim().length() == 0) {
+ // TODO: make the length of the data a field of the ContentItem
+ // interface to be able to filter out empty items in the
canEnhance
+ // method
+ log.warn("ContentPart {} of ContentItem {} does not contain
any text" +
+ "to extract knowledge from in ContentItem {}",
+ contentPart.getKey(),ci);
+ return;
+ }
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements from ContentPart {} of
ContentItem {}: text={}",
+ new
Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
+ StringUtils.abbreviate(text, 100)});
+ }
}
- log.debug("computeEnhancements from ContentPart {} of ContentItem {}:
text={}",
- new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
- StringUtils.abbreviate(text, 100)});
try {
if(config.isProcessedLangage(language)){
for (String defaultModelType : config.getDefaultModelTypes()) {
@@ -168,7 +180,7 @@ public abstract class NEREngineCore
if(nameFinderModel == null){
log.info("No NER Model for {} and language {}
available!",defaultModelType,language);
} else {
- findNamedEntities(ci, text, language, nameFinderModel);
+ findNamedEntities(ci, at, text, language,
nameFinderModel);
}
}
} //else do not use default models for languages other than the
processed one
@@ -178,7 +190,7 @@ public abstract class NEREngineCore
try {
nameFinderModel =
openNLP.getModel(TokenNameFinderModel.class,
additionalModel, null);
- findNamedEntities(ci, text, language, nameFinderModel);
+ findNamedEntities(ci, at, text, language, nameFinderModel);
} catch (IOException e) {
log.warn("Unable to load TokenNameFinderModel model for
language '"+language
+ "' (model: "+additionalModel+")",e);
@@ -197,6 +209,7 @@ public abstract class NEREngineCore
}
protected void findNamedEntities(final ContentItem ci,
+ final AnalysedText at,
final String text,
final String lang,
final TokenNameFinderModel
nameFinderModel) {
@@ -204,8 +217,9 @@ public abstract class NEREngineCore
if (ci == null) {
throw new IllegalArgumentException("Parsed ContentItem MUST NOT be
NULL");
}
- if (text == null) {
- log.warn("NULL was parsed as text for content item " +
ci.getUri().getUnicodeString() + "! -> call ignored");
+ if (at == null && text == null) {
+ log.warn("NULL was parsed as AnalysedText AND Text for content
item "
+ + ci.getUri() + ". One of the two MUST BE present! -> call
ignored");
return;
}
final Language language;
@@ -216,11 +230,17 @@ public abstract class NEREngineCore
}
if(log.isDebugEnabled()){
log.debug("findNamedEntities model={}, language={}, text=",
- new Object[]{ nameFinderModel, language,
StringUtils.abbreviate(text, 100) });
+ new Object[]{ nameFinderModel, language,
+ StringUtils.abbreviate(at != null ?
at.getSpan() : text, 100) });
}
LiteralFactory literalFactory = LiteralFactory.getInstance();
MGraph g = ci.getMetadata();
- Map<String,List<NameOccurrence>> entityNames =
extractNameOccurrences(nameFinderModel, text);
+ Map<String,List<NameOccurrence>> entityNames;
+ if(at != null){
+ entityNames = extractNameOccurrences(nameFinderModel, at, lang);
+ } else {
+ entityNames = extractNameOccurrences(nameFinderModel, text,lang);
+ }
//lock the ContentItem while writing the RDF data for found Named
Entities
ci.getLock().writeLock().lock();
try {
@@ -282,32 +302,74 @@ public abstract class NEREngineCore
}
}
+ @Deprecated
public Collection<String> extractPersonNames(String text) {
- return extractNames(getNameModel("person","en"),text);
+ return extractPersonNames(text, "en");
+ }
+ public Collection<String> extractPersonNames(String text,String lang) {
+ return extractNames(getNameModel("person",lang),text);
}
+ @Deprecated
public Collection<String> extractLocationNames(String text) {
- return extractNames(getNameModel("location","en"), text);
+ return extractLocationNames(text,"en");
}
-
+
+ public Collection<String> extractLocationNames(String text,String lang) {
+ return extractNames(getNameModel("location",lang), text);
+ }
+
+ @Deprecated
public Collection<String> extractOrganizationNames(String text) {
- return extractNames(getNameModel("organization","en"), text);
+ return extractOrganizationNames(text,"en");
}
-
+ public Collection<String> extractOrganizationNames(String text,String
lang) {
+ return extractNames(getNameModel("organization",lang), text);
+ }
+ /**
+ * extracts the PersonName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link #extractPersonNameOccurrences(String,String)}
instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>>
extractPersonNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("person","en"), text);
+ return this.extractPersonNameOccurrences(text, "en");
}
-
+ public Map<String,List<NameOccurrence>>
extractPersonNameOccurrences(String text, String lang) {
+ return extractNameOccurrences(getNameModel("person",lang), text, lang);
+ }
+ /**
+ * extracts the LocationName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link #extractLocationNameOccurrences(String,String)}
instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>>
extractLocationNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("location","en"), text);
+ return extractLocationNameOccurrences(text, "en");
+ }
+
+ public Map<String,List<NameOccurrence>>
extractLocationNameOccurrences(String text,String lang) {
+ return extractNameOccurrences(getNameModel("location",lang),
text,lang);
}
+ /**
+ * extracts the OrganizationName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link
#extractOrganizationNameOccurrences(String,String)} instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>>
extractOrganizationNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("organization","en"), text);
+ return extractOrganizationNameOccurrences(text,"en");
+ }
+ public Map<String,List<NameOccurrence>>
extractOrganizationNameOccurrences(String text,String lang) {
+ return extractNameOccurrences(getNameModel("organization",lang),
text,lang);
}
protected Collection<String> extractNames(TokenNameFinderModel
nameFinderModel, String text) {
- return extractNameOccurrences(nameFinderModel, text).keySet();
+ return extractNameOccurrences(nameFinderModel, text,
nameFinderModel.getLanguage()).keySet();
}
/**
@@ -339,16 +401,28 @@ public abstract class NEREngineCore
type,language),e);
}
}
+ /**
+ * Loads the {@link SentenceModel} for the parsed language or
+ * English as fallback if one for the language is not available
+ * @param language
+ * @return
+ */
private SentenceModel getSentenceModel(String language) {
try {
SentenceModel model = openNLP.getSentenceModel(language);
if(model != null){
return model;
- } else {
- throw new IllegalStateException(String.format(
- "Unable to built Model for extracting sentences from '%s'
" +
- "language texts because the model data could not be
loaded.",
- language));
+ } else { //fallback to english
+ log.info("No sentence detection modle for {}. fallback to
English");
+ model = openNLP.getSentenceModel("en");
+ if(model == null){
+ throw new IllegalStateException(String.format(
+ "Unable to built Model for extracting sentences
neither for '%s' " +
+ "nor the fallback language 'en'.",
+ language));
+ } else {
+ return model;
+ }
}
} catch (InvalidFormatException e) {
throw new IllegalStateException(String.format(
@@ -360,10 +434,78 @@ public abstract class NEREngineCore
language),e);
}
}
-
- protected Map<String,List<NameOccurrence>>
extractNameOccurrences(TokenNameFinderModel nameFinderModel,
- String
text) {
+ /**
+ * This method extracts NamedEntity occurrences by using existing {@link
Token}s and
+ * {@link Sentence}s in the parsed {@link AnalysedText}.
+ * @param nameFinderModel the model used to find NamedEntities
+ * @param at the Analysed Text
+ * @param language the language of the text
+ * @return the found named Entity Occurrences
+ */
+ protected Map<String,List<NameOccurrence>>
extractNameOccurrences(TokenNameFinderModel nameFinderModel,
+ AnalysedText at, String language) {
+ // version with explicit sentence endings to reflect heading /
paragraph
+ // structure of an HTML or PDF document converted to text
+
+ NameFinderME finder = new NameFinderME(nameFinderModel);
+ Map<String,List<NameOccurrence>> nameOccurrences = new
LinkedHashMap<String,List<NameOccurrence>>();
+ List<Section> sentences = new ArrayList<Section>();
+ //Holds the tokens of the previous (pos 0), current (pos 1) and next
(pos 2) sentence
+ AnalysedTextUtils.appandToList(at.getSentences(), sentences);
+ if(!sentences.isEmpty()){ //no sentence annotations
+ sentences.add(at); //process as a single section
+ }
+ for (int i=0;i<sentences.size();i++) {
+ String sentence = sentences.get(i).getSpan();
+
+ // build a context by concatenating three sentences to be used for
+ // similarity ranking / disambiguation + contextual snippet in the
+ // extraction structure
+ List<String> contextElements = new ArrayList<String>();
+ contextElements.add(sentence);
+ //three sentences as context
+ String context = at.getSpan().substring(
+ sentences.get(Math.max(0, i-1)).getStart(),
+ sentences.get(Math.min(sentences.size()-1, i+1)).getEnd());
+
+ // get the tokens, words of the current sentence
+ List<Token> tokens = new ArrayList<Token>(32);
+ List<String> words = new ArrayList<String>(32);
+ for(Iterator<Token> it
=sentences.get(i).getTokens();it.hasNext();){
+ Token t = it.next();
+ tokens.add(t);
+ words.add(t.getSpan());
+ }
+ Span[] nameSpans = finder.find(words.toArray(new
String[words.size()]));
+ double[] probs = finder.probs();
+ //int lastStartPosition = 0;
+ for (int j = 0; j < nameSpans.length; j++) {
+ String name =
at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
+ tokens.get(nameSpans[j].getEnd()-1).getEnd());
+ Double confidence = 1.0;
+ for (int k = nameSpans[j].getStart(); k <
nameSpans[j].getEnd(); k++) {
+ confidence *= probs[k];
+ }
+ int absoluteStart =
tokens.get(nameSpans[j].getStart()).getStart();
+ int absoluteEnd = absoluteStart + name.length();
+ UriRef mappedType =
config.getMappedType(nameSpans[j].getType());
+ NameOccurrence occurrence = new NameOccurrence(name,
absoluteStart, absoluteEnd,
+ mappedType, context, confidence);
+ List<NameOccurrence> occurrences = nameOccurrences.get(name);
+ if (occurrences == null) {
+ occurrences = new ArrayList<NameOccurrence>();
+ }
+ occurrences.add(occurrence);
+ nameOccurrences.put(name, occurrences);
+ }
+ }
+ finder.clearAdaptiveData();
+ log.debug("{} name occurrences found: {}", nameOccurrences.size(),
nameOccurrences);
+ return nameOccurrences;
+ }
+
+ protected Map<String,List<NameOccurrence>>
extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text,
String language) {
// version with explicit sentence endings to reflect heading /
paragraph
// structure of an HTML or PDF document converted to text
String textWithDots = text.replaceAll("\\n\\n", ".\n");
@@ -374,7 +516,7 @@ public abstract class NEREngineCore
Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
NameFinderME finder = new NameFinderME(nameFinderModel);
- Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ Tokenizer tokenizer = openNLP.getTokenizer(language);
Map<String,List<NameOccurrence>> nameOccurrences = new
LinkedHashMap<String,List<NameOccurrence>>();
for (int i = 0; i < sentenceSpans.length; i++) {
String sentence =
sentenceSpans[i].getCoveredText(text).toString().trim();
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1406168&r1=1406167&r2=1406168&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Tue Nov 6 15:00:40 2012
@@ -176,7 +176,7 @@ public class TestNamedEntityExtractionEn
expectedValues.put(Properties.DC_TYPE, new
UriRef("http://www.bootstrep.eu/ontology/GRO#DNA"));
MGraph g = ci.getMetadata();
int textAnnotationCount =
validateAllTextAnnotations(g,EHEALTH,expectedValues);
- assertEquals(6, textAnnotationCount);
+ assertEquals(7, textAnnotationCount);
}