Author: rwesten
Date: Fri Nov 16 14:30:35 2012
New Revision: 1410367
URL: http://svn.apache.org/viewvc?rev=1410367&view=rev
Log:
STANBOL-797: the opennlp-ner engine now also adds NER annotations to the
AnalysedText contentpart. This feature is only active if an AnalysedText
contentpart is already present. It does NOT create one.
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java?rev=1410367&r1=1410366&r2=1410367&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
Fri Nov 16 14:30:35 2012
@@ -6,12 +6,15 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
@@ -32,10 +35,11 @@ public class NEREngineConfig {
}
/**
- * Holds the mappings of rdf:type used by concepts to dc:type values used
- * by TextAnnotations.
+ * Holds the configured {@link NerTag}s - the mappings from the
+ * named entity name to the {@link UriRef} type used for the
+ * <code>dc:type</code> value for <code>fise:TextAnnotation</code>s
*/
- private Map<String,UriRef> typeMappings = new
HashMap<String,UriRef>(DEFAULT_ENTITY_TYPE_MAPPINGS);
+ private TagSet<NerTag> nerTagSet = new TagSet<NerTag>("NER TagSet");
private Map<String,Collection<String>> additionalNerModels = new
HashMap<String,Collection<String>>();
/**
@@ -50,6 +54,12 @@ public class NEREngineConfig {
private String defaultLanguage;
+ public NEREngineConfig(){
+ for(Entry<String,UriRef> mapping :
DEFAULT_ENTITY_TYPE_MAPPINGS.entrySet()){
+ nerTagSet.addTag(new NerTag(mapping.getKey(), mapping.getValue()));
+ }
+ }
+
public synchronized void addCustomNameFinderModel(String lang, String
modelFileName){
if(lang == null || lang.isEmpty()){
throw new IllegalArgumentException("The parsed lanaguage MUST NOT
be NULL or empty!");
@@ -115,17 +125,40 @@ public class NEREngineConfig {
Collection<String> modelNames = additionalNerModels.get(lang);
return modelNames == null ? Collections.EMPTY_LIST : modelNames;
}
-
- public UriRef getMappedType(String namedEntityType){
- return typeMappings.get(namedEntityType);
+ /**
+ * Getter for the {@link NerTag} of the parsed Named Entity
+ * name. If not yet present a new {@link NerTag} (with no
+ * <code>dc:type</code> mapping) is created and added to the
+ * configuration.
+ * @param namedEntityType the NamedEntity name.
+ * @return the NerTag. Guaranteed to be not <code>null</code>
+ * @throws IllegalArgumentException if the parsed NamedEntity
+ * type is <code>null</code> or an empty String.
+ */
+ public NerTag getNerTag(String namedEntityType){
+ if(namedEntityType == null || namedEntityType.isEmpty()){
+ throw new IllegalArgumentException("The parsed NamedEntity string
MUST NOT be NULL nor empty!");
+ }
+ NerTag tag = nerTagSet.getTag(namedEntityType);
+ if(tag == null){
+ tag = new NerTag(namedEntityType);
+ nerTagSet.addTag(tag);
+ }
+ return tag;
}
+ /**
+ * Setter for a NamedEntity name > <code>dc:type</code>
+ * mapping.
+ * @param namedEntityType the Named Entity type (as
+ * used by the OpenNLP NameFinder model)
+ * @param dcType the <code>dc:type</code> used for the
+ * NamedEntity or <code>null</code> if none
+ * @throws IllegalArgumentException if the parsed NamedEntity
+ * type is <code>null</code> or an empty String.
+ */
public void setMappedType(String namedEntityType,UriRef dcType){
if(namedEntityType != null && !namedEntityType.isEmpty()){
- if(dcType == null){
- typeMappings.remove(namedEntityType);
- } else {
- typeMappings.put(namedEntityType, dcType);
- }
+ nerTagSet.addTag(new NerTag(namedEntityType, dcType));
} else {
throw new IllegalArgumentException("The parsed NamedEntity type
MUST NOT be NULL nor empty!");
}
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1410367&r1=1410366&r2=1410367&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
Fri Nov 16 14:30:35 2012
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
@@ -54,11 +55,15 @@ import org.apache.clerezza.rdf.core.impl
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -90,6 +95,7 @@ public abstract class NEREngineCore
protected NEREngineConfig config;
+
/** Comments about our models */
public static final Map<String, String> DATA_FILE_COMMENTS;
static {
@@ -486,18 +492,22 @@ public abstract class NEREngineCore
for (int k = nameSpans[j].getStart(); k <
nameSpans[j].getEnd(); k++) {
confidence *= probs[k];
}
- int absoluteStart =
tokens.get(nameSpans[j].getStart()).getStart();
- int absoluteEnd = absoluteStart + name.length();
- UriRef mappedType =
config.getMappedType(nameSpans[j].getType());
- NameOccurrence occurrence = new NameOccurrence(name,
absoluteStart, absoluteEnd,
- mappedType, context, confidence);
-
+ int start = tokens.get(nameSpans[j].getStart()).getStart();
+ int end = start + name.length();
+ NerTag nerTag = config.getNerTag(nameSpans[j].getType());
+ //create the occurrence for writing fise:TextAnnotations
+ NameOccurrence occurrence = new NameOccurrence(name, start,
end, nerTag.getType(),
+ context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
occurrences = new ArrayList<NameOccurrence>();
}
occurrences.add(occurrence);
nameOccurrences.put(name, occurrences);
+ //add also the NerAnnotation to the AnalysedText
+ Chunk chunk = at.addChunk(start, end);
+ //TODO: build AnnotationModel based on the configured Mappings
+ chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag,
confidence));
}
}
finder.clearAdaptiveData();
@@ -553,9 +563,9 @@ public abstract class NEREngineCore
int start = tokenSpans[nameSpans[j].getStart()].getStart();
int absoluteStart = sentenceSpans[i].getStart() + start;
int absoluteEnd = absoluteStart + name.length();
- UriRef mappedType =
config.getMappedType(nameSpans[j].getType());
+ NerTag nerTag = config.getNerTag(nameSpans[j].getType());
NameOccurrence occurrence = new NameOccurrence(name,
absoluteStart, absoluteEnd,
- mappedType, context, confidence);
+ nerTag.getType(),context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java?rev=1410367&r1=1410366&r2=1410367&view=diff
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
(original)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
Fri Nov 16 14:30:35 2012
@@ -21,7 +21,9 @@ import org.apache.clerezza.rdf.core.UriR
public class NameOccurrence {
public final String name;
-
+
+ public final UriRef type;
+
public final Integer start;
public final Integer end;
@@ -30,14 +32,12 @@ public class NameOccurrence {
public final Double confidence;
- public final UriRef type;
-
- public NameOccurrence(String name, Integer start, Integer end, UriRef type,
- String context, Double confidence) {
- this.start = start;
- this.end = end;
+ public NameOccurrence(String name, Integer start, Integer end,
+ UriRef type,String context, Double confidence) {
this.name = name;
this.type = type;
+ this.start = start;
+ this.end = end;
this.context = context;
this.confidence = confidence;
}