Author: rwesten
Date: Tue Nov 6 14:59:07 2012
New Revision: 1406165
URL: http://svn.apache.org/viewvc?rev=1406165&view=rev
Log:
STANBOL-796: implementation of the OpenNLP Sentence Detection Engine
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/
(with props)
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/pom.xml
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/metatype.properties
Propchange:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Nov 6 14:59:07 2012
@@ -0,0 +1,7 @@
+.settings
+
+.classpath
+
+target
+
+.project
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/pom.xml?rev=1406165&view=auto
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/pom.xml
(added)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/pom.xml
Tue Nov 6 14:59:07 2012
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor
+ license agreements. See the NOTICE file distributed with this work for
additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <!-- Parent POM of the Stanbol Enhancer modules, looked up locally at ../../parent -->
+ <parent>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <groupId>org.apache.stanbol</groupId>
+ <version>0.10.0-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <!-- Coordinates of this module. The 'bundle' packaging is contributed by the
+ maven-bundle-plugin declared in the build section (extensions=true). -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.sentence</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer Enhancement Engine: OpenNLP Sentence
Detection</name>
+ <description>A Stanbol engine adding Sentence annotations to the AnalyzedText
+ content part to a content item for further processing</description>
+
+ <inceptionYear>2012</inceptionYear>
+
+ <!-- NOTE(review): the URLs below point at trunk, while this revision adds the
+ module under branches/stanbol-nlp-processing; confirm that this is intended. -->
+ <scm>
+ <connection>
+
scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-sentence/
+ </connection>
+ <developerConnection>
+
scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-sentence/
+ </developerConnection>
+ <url>http://stanbol.apache.org/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <!-- Builds the OSGi bundle manifest from the instructions given below -->
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <!-- The two servicesapi packages are imported with the bnd directive
+ provide:=true (presumably because this bundle implements those APIs;
+ confirm against the bnd documentation). The trailing '*' imports all
+ other referenced packages. -->
+ <Import-Package>
+ org.apache.stanbol.enhancer.servicesapi; provide:=true,
+ org.apache.stanbol.enhancer.servicesapi.impl; provide:=true,
+ *
+ </Import-Package>
+ <!-- The engine implementation package is bundled but not exported.
+ NOTE(review): the version attribute presumably has no effect on a
+ private package; confirm before relying on it. -->
+ <Private-Package>
+
org.apache.stanbol.enhancer.engines.opennlp.sentence.impl;version=${project.version}
+ </Private-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <!-- License header audit; the listed file is excluded from the check -->
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <!-- AL20 License -->
+ <exclude>src/license/THIRD-PARTY.properties</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <!-- Processes the Felix SCR annotations used by the engine class. No version
+ or configuration is given here; presumably both are inherited from the
+ parent POM (TODO confirm). -->
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <!-- The Stanbol modules are pinned to 0.10.0-SNAPSHOT. The SCR annotations
+ dependency declares no version; presumably it is managed by the parent POM
+ (TODO confirm). -->
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.opennlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ </dependencies>
+
+</project>
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java?rev=1406165&view=auto
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java
(added)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java
Tue Nov 6 14:59:07 2012
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.opennlp.sentence.impl;
+
+import static
org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static
org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
+
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * EnhancementEngine that uses the OpenNLP {@link SentenceDetector} to
+ * add {@link Sentence} annotations to the {@link AnalysedText}
+ * content part of the parsed {@link ContentItem}.<p>
+ * While the opennlp-pos engine does also support adding of {@link Sentence}
+ * annotations this engine can be used in cases where no POS tagging is
+ * needed. In addition, this engine allows custom {@link SentenceModel}s
+ * to be configured by using the {@link #MODEL_NAME_PARAM} parameter
+ * in the language configuration
+ * <code><pre>
+ * {lang};model={model-name}
+ * </pre></code>
+ */
+@Component(immediate = true, metatype = true,
+ policy = ConfigurationPolicy.OPTIONAL) //create a default instance with
the default configuration
+@Service
+@Properties(value={
+ @Property(name=
EnhancementEngine.PROPERTY_NAME,value="opennlp-sentence"),
+ @Property(name=OpenNlpSentenceDetectionEngine.CONFIG_LANGUAGES, value
= {"*"},cardinality=Integer.MAX_VALUE),
+ @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the
default instance a ranking < 0
+})
+public class OpenNlpSentenceDetectionEngine extends
AbstractEnhancementEngine<RuntimeException,RuntimeException> implements
ServiceProperties {
+
+
+ /**
+  * Read-only service properties of this engine: the engine ordering and the
+  * NLP processing role. Created once when the class is loaded.
+  */
+ private static final Map<String,Object> SERVICE_PROPERTIES = createServiceProperties();
+
+ /**
+  * Builds the map assigned to {@link #SERVICE_PROPERTIES}.
+  * @return an unmodifiable map holding the ordering and the NLP role entry
+  */
+ private static Map<String,Object> createServiceProperties() {
+     Map<String,Object> serviceProps = new HashMap<String,Object>();
+     // same insertion order as before: ordering first, NLP role second
+     serviceProps.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
+         ServiceProperties.ORDERING_NLP_SENTENCE_DETECTION);
+     serviceProps.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
+         NlpProcessingRole.SentenceDetection);
+     return Collections.unmodifiableMap(serviceProps);
+ }
+
+ /**
+  * Name of the language configuration property. Takes a list of ISO language
+  * codes of supported languages. The default value used by this engine is
+  * <code>*</code>.
+  */
+ public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentence.languages";
+
+ /**
+  * The parameter name used to configure the name of the OpenNLP model used
+  * for sentence detection
+  */
+ private static final String MODEL_NAME_PARAM = "model";
+
+ /**
+  * Class logger; declared final as it is never reassigned.
+  */
+ private static final Logger log = LoggerFactory.getLogger(OpenNlpSentenceDetectionEngine.class);
+
+ /**
+  * Language configuration, bound to {@link #CONFIG_LANGUAGES} with
+  * <code>*</code> as its default value.
+  */
+ private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});
+
+ /**
+  * Injected service used to look up the sentence models.
+  */
+ @Reference
+ private OpenNLP openNLP;
+
+ /**
+  * Injected factory that is passed to
+  * {@link NlpEngineHelper#initAnalysedText} when computing enhancements.
+  */
+ @Reference
+ private AnalysedTextFactory analysedTextFactory;
+
+ /**
+  * Decides whether this engine can process the parsed ContentItem.
+  * <p/>
+  * {@link #ENHANCE_ASYNC} is returned if all of the following hold:
+  * <ul>
+  * <li>{@link NlpEngineHelper#getPlainText} provides a plain text Blob,
+  * <li>a language is known for the content item,
+  * <li>that language is enabled by the language configuration and
+  * <li>a sentence detector can be obtained for that language.
+  * </ul>
+  * In all other cases {@link #CANNOT_ENHANCE} is returned.
+  *
+  * @param ci the content item to check
+  * @return {@link #ENHANCE_ASYNC} or {@link #CANNOT_ENHANCE}
+  * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+  *             if the introspecting process of the content item
+  *             fails
+  */
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+     // a plain text Blob is required
+     final Map.Entry<UriRef,Blob> plainText = NlpEngineHelper.getPlainText(this, ci, false);
+     final boolean hasText = plainText != null && plainText.getValue() != null;
+     if(!hasText) {
+         return CANNOT_ENHANCE;
+     }
+
+     // the language needs to be known ...
+     final String lang = getLanguage(this,ci,false);
+     if(lang == null) {
+         return CANNOT_ENHANCE;
+     }
+     // ... enabled by the configuration and backed by a model
+     if(languageConfig.isLanguage(lang)){
+         if(getSentenceDetector(lang) != null){
+             log.trace(" > can enhance ContentItem {} with language {}",ci,lang);
+             return ENHANCE_ASYNC;
+         }
+         log.trace(" > can NOT enhance ContentItem {} because no sentence "
+             + "deteciton model for language {} is available.",ci,lang);
+     } else {
+         log.trace(" > can NOT enhance ContentItem {} because language {} is "
+             + "not enabled by this engines configuration",ci,lang);
+     }
+     return CANNOT_ENHANCE;
+ }
+
+ /**
+  * Detects the sentences of the parsed ContentItem and adds them as
+  * {@link Sentence} annotations to its {@link AnalysedText}.
+  * <p/>
+  * The {@link AnalysedText} is obtained via
+  * {@link NlpEngineHelper#initAnalysedText} and the language via
+  * {@link NlpEngineHelper#getLanguage}. For every span reported by the
+  * {@link SentenceDetector} a {@link Sentence} with the same start and end
+  * offset is added. This method does not itself write to the metadata of the
+  * content item.
+  * <p/>
+  * If no sentence detector is available for the language a warning that
+  * names the language is logged and nothing is added.
+  *
+  * @param ci the content item to process
+  * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+  *             if the underlying process failed to work as
+  *             expected
+  */
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+     AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
+     // NOTE(review): canEnhance(..) passes false as the last argument;
+     // presumably true makes the helper fail for a missing language - confirm
+     String language = getLanguage(this, ci, true);
+     SentenceDetector sentenceDetector = getSentenceDetector(language);
+     if(sentenceDetector == null){
+         // the language is passed as argument for the {} placeholder
+         log.warn("SentenceDetector model for language {} is no longer available. "
+             + "This might happen if the model becomes unavailable during enhancement. "
+             + "If this happens more often it might also indicate an bug in the used "
+             + "EnhancementJobManager implementation as the availability is also checked "
+             + "in the canEnhance(..) method of this Enhancement Engine.", language);
+         return;
+     }
+     //detect sentences and add them to the AnalysedText
+     for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
+         Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
+         log.trace(" > add {}",sentence);
+     }
+ }
+
+ /**
+  * Returns the service properties of this engine. The returned map is the
+  * shared {@link #SERVICE_PROPERTIES} instance, which is unmodifiable and
+  * holds the engine ordering and the NLP processing role.
+  *
+  * @return the unmodifiable service properties
+  */
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return SERVICE_PROPERTIES;
+ }
+
+ /**
+  * Activates the engine: delegates to the super class and then applies the
+  * component properties to the {@link #languageConfig}, which is bound to
+  * the {@link #CONFIG_LANGUAGES} property.
+  *
+  * @param ce the {@link org.osgi.service.component.ComponentContext}
+  * @throws ConfigurationException if the configuration is rejected
+  * (presumably by the super class or by the language configuration)
+  */
+ @Activate
+ protected void activate(ComponentContext ce) throws ConfigurationException {
+     // name the engine this class actually implements
+     log.info("activating OpenNLP sentence detection engine");
+     super.activate(ce);
+     @SuppressWarnings("unchecked")
+     Dictionary<String, Object> properties = ce.getProperties();
+
+     languageConfig.setConfiguration(properties);
+ }
+
+ /**
+  * Deactivates the engine: calls
+  * <code>setDefault()</code> on the {@link #languageConfig} (presumably
+  * restoring its default values) and then delegates to the super class.
+  *
+  * @param context the {@link org.osgi.service.component.ComponentContext}
+  */
+ @Deactivate
+ protected void deactivate(ComponentContext context) {
+ languageConfig.setDefault();
+ super.deactivate(context);
+ }
+
+ /**
+ * Obtains the {@link SentenceDetectorME} model for the given
+ * language form the {@link #openNLP} service. If a custom
+ * model is configured for the parsed language than it is
+ * loaded by using {@link OpenNLP#getModel(Class, String, Map)}
+ * otherwise the default model {@link OpenNLP#getSentenceDetector(String)}
+ * is retrieved
+ * @param language the language
+ * @return the model of <code>null</code> if non is available or
+ * an exception was encountered while loading
+ */
+ private SentenceDetector getSentenceDetector(String language) {
+ SentenceModel model;
+ String modelName = languageConfig.getParameter(language,
MODEL_NAME_PARAM);
+ if(modelName == null){
+ try {
+ model = openNLP.getSentenceModel(language);
+ } catch (Exception e) {
+ log.warn("Unable to load default Sentence Detection model for
language '"+language+"'!",e);
+ return null;
+ }
+ } else {
+ try {
+ model = openNLP.getModel(SentenceModel.class, modelName, null);
+ } catch (Exception e) {
+ log.warn("Unable to load Sentence Detection model for language
'"
+ +language+"' from the configured model
'"+modelName+"'!",e);
+ return null;
+ }
+ }
+ if(model != null) {
+ log.debug("Sentence Detection Model {} for lanugage '{}' version:
{}",
+ new Object[]{model.getClass().getSimpleName(),
+ model.getLanguage(),
+ model.getVersion() != null ? model.getVersion() :
"undefined"});
+ return new SentenceDetectorME(model);
+ }
+ log.debug("Sentence Detection Model for Language '{}' not available.",
language);
+ return null;
+ }
+
+}
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1406165&view=auto
==============================================================================
---
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/metatype.properties
(added)
+++
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-sentence/src/main/resources/OSGI-INF/metatype/metatype.properties
Tue Nov 6 14:59:07 2012
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Display name and description of the component. The key prefix is the fully
+# qualified class name of OpenNlpSentenceDetectionEngine.
+org.apache.stanbol.enhancer.engines.opennlp.sentence.impl.OpenNlpSentenceDetectionEngine.name=Apache
\
+Stanbol Enhancer Engine: OpenNLP Sentence Detection
+org.apache.stanbol.enhancer.engines.opennlp.sentence.impl.OpenNlpSentenceDetectionEngine.description=Enhancement
\
+Engine that provides Sentence annotations for parsed Texts based on the
configured OpenNLP \
+SentenceDetection models.


+# Labels for the engine name and the service ranking property.
+# NOTE(review): presumably these keys are the values of
+# EnhancementEngine.PROPERTY_NAME and Constants.SERVICE_RANKING - confirm.
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as
\
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are
active the \
+one with the higher ranking will be used to process parsed content items.

+# Labels for the language configuration property
+# (OpenNlpSentenceDetectionEngine.CONFIG_LANGUAGES).
+org.apache.stanbol.enhancer.sentence.languages.name=Language configuration
+org.apache.stanbol.enhancer.sentence.languages.description=Takes a list of ISO
\
+ language codes of supported languages. '*' is the Wildcard; '!{lang}' to
exclude \
+ a language; '{lang};model={sentence-detection-model-file-name}' to configure
a \
+ custom OpenNLP model for a language. Models are loaded via the Stanbol \
+ DataFileProvider service. So users can e.g. put models in the datafiles
directory \
+ (defaults to '{stanbol-working-dir}/stanbol/datafiles')