Author: colen
Date: Wed Apr 16 15:26:24 2014
New Revision: 1587944
URL: http://svn.apache.org/r1587944
Log:
OPENNLP-674 Added factory to Doccat
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
(with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
(with props)
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/DoccatSample.txt
(with props)
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
Wed Apr 16 15:26:24 2014
@@ -34,8 +34,10 @@ import opennlp.tools.cmdline.doccat.Docc
import opennlp.tools.cmdline.params.CVParams;
import opennlp.tools.doccat.DoccatCrossValidator;
import opennlp.tools.doccat.DoccatEvaluationMonitor;
+import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.eval.EvaluationMonitor;
import opennlp.tools.util.model.ModelUtil;
@@ -88,13 +90,18 @@ public final class DoccatCrossValidatorT
FeatureGenerator[] featureGenerators = DoccatTrainerTool
.createFeatureGenerators(params.getFeatureGenerators());
+ Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
+ .getTokenizer());
+
DoccatEvaluationMonitor[] listenersArr = listeners
.toArray(new DoccatEvaluationMonitor[listeners.size()]);
DoccatCrossValidator validator;
try {
+ DoccatFactory factory = DoccatFactory.create(params.getFactory(),
+ tokenizer, featureGenerators);
validator = new DoccatCrossValidator(params.getLang(), mlParams,
- featureGenerators, listenersArr);
+ factory, listenersArr);
validator.evaluate(sampleStream, params.getFolds());
} catch (IOException e) {
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
Wed Apr 16 15:26:24 2014
@@ -26,16 +26,19 @@ import opennlp.tools.cmdline.TerminateTo
import opennlp.tools.cmdline.doccat.DoccatTrainerTool.TrainerToolParams;
import opennlp.tools.cmdline.params.TrainingToolParams;
import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ext.ExtensionLoader;
import opennlp.tools.util.model.ModelUtil;
public class DoccatTrainerTool
extends AbstractTrainerTool<DocumentSample, TrainerToolParams> {
-
+
interface TrainerToolParams extends TrainingParams, TrainingToolParams {
}
@@ -47,7 +50,7 @@ public class DoccatTrainerTool
public String getShortDescription() {
return "trainer for the learnable document categorizer";
}
-
+
@Override
public void run(String format, String[] args) {
super.run(format, args);
@@ -64,10 +67,14 @@ public class DoccatTrainerTool
FeatureGenerator[] featureGenerators = createFeatureGenerators(params
.getFeatureGenerators());
+ Tokenizer tokenizer = createTokenizer(params.getTokenizer());
+
DoccatModel model;
try {
+ DoccatFactory factory = DoccatFactory.create(params.getFactory(),
+ tokenizer, featureGenerators);
model = DocumentCategorizerME.train(params.getLang(), sampleStream,
- mlParams, featureGenerators);
+ mlParams, factory);
} catch (IOException e) {
throw new TerminateToolException(-1, "IO error while reading training
data or indexing data: " +
e.getMessage(), e);
@@ -79,10 +86,17 @@ public class DoccatTrainerTool
// sorry that this can fail
}
}
-
+
CmdLineUtil.writeModel("document categorizer", modelOutFile, model);
}
+ static Tokenizer createTokenizer(String tokenizer) {
+ if(tokenizer != null) {
+ return ExtensionLoader.instantiateExtension(Tokenizer.class, tokenizer);
+ }
+ return WhitespaceTokenizer.INSTANCE;
+ }
+
static FeatureGenerator[] createFeatureGenerators(String
featureGeneratorsNames) {
if(featureGeneratorsNames == null) {
FeatureGenerator[] def = {new BagOfWordsFeatureGenerator()};
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
Wed Apr 16 15:26:24 2014
@@ -32,4 +32,12 @@ interface TrainingParams extends BasicTr
@OptionalParameter
String getFeatureGenerators();
+ @ParameterDescription(valueName = "tokenizer", description = "Tokenizer
implementation. WhitespaceTokenizer is used if not specified.")
+ @OptionalParameter
+ String getTokenizer();
+
+ @ParameterDescription(valueName = "factoryName", description = "A sub-class
of DoccatFactory where to get implementation and resources.")
+ @OptionalParameter
+ String getFactory();
+
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
Wed Apr 16 15:26:24 2014
@@ -34,18 +34,19 @@ public class DoccatCrossValidator {
private DoccatEvaluationMonitor[] listeners;
- private FeatureGenerator[] featureGenarators;
+ private DoccatFactory factory;
+
/**
* Creates a {@link DoccatCrossValidator} with the given
* {@link FeatureGenerator}s.
*/
public DoccatCrossValidator(String languageCode, TrainingParameters mlParams,
- FeatureGenerator[] featureGenerators, DoccatEvaluationMonitor[]
listeners) {
+ DoccatFactory factory, DoccatEvaluationMonitor ... listeners) {
this.languageCode = languageCode;
this.params = mlParams;
this.listeners = listeners;
- this.featureGenarators = featureGenerators;
+ this.factory = factory;
}
/**
@@ -70,7 +71,7 @@ public class DoccatCrossValidator {
.next();
DoccatModel model = DocumentCategorizerME.train(languageCode,
- trainingSampleStream, params, featureGenarators);
+ trainingSampleStream, params, factory);
DocumentCategorizerEvaluator evaluator = new
DocumentCategorizerEvaluator(
new DocumentCategorizerME(model), listeners);
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java?rev=1587944&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
Wed Apr 16 15:26:24 2014
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+/**
+ * The factory that provides Doccat default implementations and resources
+ */
+public class DoccatFactory extends BaseToolFactory {
+
+ private static final String FEATURE_GENERATORS = "doccat.featureGenerators";
+ private static final String TOKENIZER_NAME = "doccat.tokenizer";
+
+ private FeatureGenerator[] featureGenerators;
+ private Tokenizer tokenizer;
+
+ /**
+ * Creates a {@link DoccatFactory} that provides the default implementation
+ * of the resources.
+ */
+ public DoccatFactory() {
+ }
+
+ /**
+ * Creates a {@link DoccatFactory}. Use this constructor to programmatically
+ * create a factory.
+ *
+ * @param tokenizer
+ * @param featureGenerators
+ */
+ public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[]
featureGenerators) {
+ this.init(tokenizer, featureGenerators);
+ }
+
+ protected void init(Tokenizer tokenizer, FeatureGenerator[]
featureGenerators) {
+
+ this.featureGenerators = featureGenerators;
+ this.tokenizer = tokenizer;
+ }
+
+ @Override
+ public Map<String, String> createManifestEntries() {
+ Map<String, String> manifestEntries = super.createManifestEntries();
+
+ if (getTokenizer() != null) {
+ manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
+ .getCanonicalName());
+ }
+
+ if (getFeatureGenerators() != null) {
+ manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString());
+ }
+
+ return manifestEntries;
+ }
+
+ private String featureGeneratorsAsString() {
+ List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators());
+ Iterator<FeatureGenerator> iter = fgs.iterator();
+ StringBuilder sb = new StringBuilder();
+ if (iter.hasNext()) {
+ sb.append(iter.next().getClass().getCanonicalName());
+ while (iter.hasNext()) {
+ sb.append(',').append(iter.next().getClass().getCanonicalName());
+ }
+ }
+ return sb.toString();
+ }
+
+ @Override
+ public void validateArtifactMap() throws InvalidFormatException {
+ // nothing to validate
+ }
+
+ public static DoccatFactory create(String subclassName, Tokenizer tokenizer,
+ FeatureGenerator[] featureGenerators) throws InvalidFormatException {
+ if (subclassName == null) {
+ // will create the default factory
+ return new DoccatFactory(tokenizer, featureGenerators);
+ }
+ try {
+ DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
+ DoccatFactory.class, subclassName);
+ theFactory.init(tokenizer, featureGenerators);
+ return theFactory;
+ } catch (Exception e) {
+ String msg = "Could not instantiate the " + subclassName
+ + ". The initialization throw an exception.";
+ System.err.println(msg);
+ e.printStackTrace();
+ throw new InvalidFormatException(msg, e);
+ }
+
+ }
+
+ private FeatureGenerator[] loadFeatureGenerators(String classNames) {
+ String[] classes = classNames.split(",");
+ FeatureGenerator[] fgs = new FeatureGenerator[classes.length];
+
+ for (int i = 0; i < classes.length; i++) {
+ fgs[i] = ExtensionLoader.instantiateExtension(FeatureGenerator.class,
+ classes[i]);
+ }
+ return fgs;
+ }
+
+ public FeatureGenerator[] getFeatureGenerators() {
+ if (featureGenerators == null) {
+ if (artifactProvider != null) {
+ String classNames = artifactProvider
+ .getManifestProperty(FEATURE_GENERATORS);
+ if (classNames != null) {
+ this.featureGenerators = loadFeatureGenerators(classNames);
+ }
+ }
+ if (featureGenerators == null) { // could not load using artifact
provider
+ // load bag of words as default
+ FeatureGenerator[] bow = { new BagOfWordsFeatureGenerator() };
+ this.featureGenerators = bow;
+ }
+ }
+ return featureGenerators;
+ }
+
+ public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
+ this.featureGenerators = featureGenerators;
+ }
+
+ public Tokenizer getTokenizer() {
+ if (this.tokenizer == null) {
+ if (artifactProvider != null) {
+ String className =
artifactProvider.getManifestProperty(TOKENIZER_NAME);
+ if (className != null) {
+ this.tokenizer = ExtensionLoader.instantiateExtension(
+ Tokenizer.class, className);
+ }
+ }
+ if (this.tokenizer == null) { // could not load using artifact provider
+ this.tokenizer = WhitespaceTokenizer.INSTANCE;
+ }
+ }
+ return tokenizer;
+ }
+
+ public void setTokenizer(Tokenizer tokenizer) {
+ this.tokenizer = tokenizer;
+ }
+
+}
Propchange:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
Wed Apr 16 15:26:24 2014
@@ -25,34 +25,50 @@ import java.util.Map;
import opennlp.tools.ml.model.AbstractModel;
import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
public class DoccatModel extends BaseModel {
-
+
private static final String COMPONENT_NAME = "DocumentCategorizerME";
private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model";
-
- protected DoccatModel(String languageCode, MaxentModel doccatModel,
- Map<String, String> manifestInfoEntries) {
- super(COMPONENT_NAME, languageCode, manifestInfoEntries);
-
+
+ public DoccatModel(String languageCode, MaxentModel doccatModel,
+ Map<String, String> manifestInfoEntries, DoccatFactory factory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
artifactMap.put(DOCCAT_MODEL_ENTRY_NAME, doccatModel);
checkArtifactMap();
}
-
+
+ /**
+ * @deprecated Use
+ * {@link #DoccatModel(String, MaxentModel, Map, DoccatFactory)}
+ * instead and pass in a {@link DoccatFactory}
+ */
+ protected DoccatModel(String languageCode, MaxentModel doccatModel,
+ Map<String, String> manifestInfoEntries) {
+ this(languageCode, doccatModel, manifestInfoEntries, new DoccatFactory());
+ }
+
+ /**
+ * @deprecated Use
+ * {@link #DoccatModel(String, MaxentModel, Map, DoccatFactory)}
+ * instead and pass in a {@link DoccatFactory}
+ */
public DoccatModel(String languageCode, MaxentModel doccatModel) {
this(languageCode, doccatModel, null);
}
-
+
public DoccatModel(InputStream in) throws IOException,
InvalidFormatException {
super(COMPONENT_NAME, in);
}
-
+
public DoccatModel(File modelFile) throws IOException,
InvalidFormatException {
super(COMPONENT_NAME, modelFile);
}
-
+
public DoccatModel(URL modelURL) throws IOException, InvalidFormatException {
super(COMPONENT_NAME, modelURL);
}
@@ -66,7 +82,23 @@ public class DoccatModel extends BaseMod
}
}
+ public DoccatFactory getFactory() {
+ return (DoccatFactory) this.toolFactory;
+ }
+
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+ return DoccatFactory.class;
+ }
+
+ /**
+ * @deprecated Use {@link #getMaxentModel()} instead.
+ */
public MaxentModel getChunkerModel() {
return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
}
+
+ public MaxentModel getMaxentModel() {
+ return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
+ }
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
Wed Apr 16 15:26:24 2014
@@ -25,7 +25,6 @@ import java.util.Map;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.TrainUtil;
-import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
@@ -40,29 +39,35 @@ public class DocumentCategorizerME imple
* Shared default thread safe feature generator.
*/
private static FeatureGenerator defaultFeatureGenerator = new
BagOfWordsFeatureGenerator();
-
- private MaxentModel model;
+
+ private DoccatModel model;
private DocumentCategorizerContextGenerator mContextGenerator;
/**
- * Initializes a the current instance with a doccat model and custom feature
generation.
- * The feature generation must be identical to the configuration at training
time.
- *
+ * Initializes the current instance with a doccat model and custom feature
+ * generation. The feature generation must be identical to the configuration
+ * at training time.
+ *
* @param model
* @param featureGenerators
+ *
+ * @deprecated train a {@link DoccatModel} with a specific
+ * {@link DoccatFactory} to customize the {@link FeatureGenerator}s
*/
public DocumentCategorizerME(DoccatModel model, FeatureGenerator...
featureGenerators) {
- this.model = model.getChunkerModel();
+ this.model = model;
this.mContextGenerator = new
DocumentCategorizerContextGenerator(featureGenerators);
}
-
+
/**
* Initializes the current instance with a doccat model. Default feature
generation is used.
- *
+ *
* @param model
*/
public DocumentCategorizerME(DoccatModel model) {
- this(model, defaultFeatureGenerator);
+ this.model = model;
+ this.mContextGenerator = new DocumentCategorizerContextGenerator(this.model
+ .getFactory().getFeatureGenerators());
}
/**
@@ -71,7 +76,7 @@ public class DocumentCategorizerME imple
* @param text
*/
public double[] categorize(String text[]) {
- return model.eval(mContextGenerator.getContext(text));
+ return model.getMaxentModel().eval(mContextGenerator.getContext(text));
}
/**
@@ -79,57 +84,79 @@ public class DocumentCategorizerME imple
* is passed to the feature generation.
*/
public double[] categorize(String documentText) {
- Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ Tokenizer tokenizer = model.getFactory().getTokenizer();
return categorize(tokenizer.tokenize(documentText));
}
public String getBestCategory(double[] outcome) {
- return model.getBestOutcome(outcome);
+ return model.getMaxentModel().getBestOutcome(outcome);
}
public int getIndex(String category) {
- return model.getIndex(category);
+ return model.getMaxentModel().getIndex(category);
}
public String getCategory(int index) {
- return model.getOutcome(index);
+ return model.getMaxentModel().getOutcome(index);
}
public int getNumberOfCategories() {
- return model.getNumOutcomes();
+ return model.getMaxentModel().getNumOutcomes();
}
public String getAllResults(double results[]) {
- return model.getAllOutcomes(results);
+ return model.getMaxentModel().getAllOutcomes(results);
}
+ /**
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+ * instead.
+ */
public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
TrainingParameters mlParams, FeatureGenerator... featureGenerators)
throws IOException {
-
+
if (featureGenerators.length == 0) {
featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
}
-
+
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
+
MaxentModel model = TrainUtil.train(
new DocumentCategorizerEventStream(samples, featureGenerators),
mlParams.getSettings(), manifestInfoEntries);
-
+
return new DoccatModel(languageCode, model, manifestInfoEntries);
}
-
+
+ public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
+ TrainingParameters mlParams, DoccatFactory factory)
+ throws IOException {
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+ MaxentModel model = TrainUtil.train(
+ new DocumentCategorizerEventStream(samples,
factory.getFeatureGenerators()),
+ mlParams.getSettings(), manifestInfoEntries);
+
+ return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
+ }
+
/**
* Trains a doccat model with default feature generation.
- *
+ *
* @param languageCode
* @param samples
- *
+ *
* @return the trained doccat model
- *
+ *
* @throws IOException
- * @throws ObjectStreamException
+ * @throws ObjectStreamException
+ *
+ * @deprecated Use
+ * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+ * instead.
*/
public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples) throws IOException {
return train(languageCode, samples,
ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
Wed Apr 16 15:26:24 2014
@@ -52,7 +52,7 @@ public class SentenceDetectorFactory ext
/**
* Creates a {@link SentenceDetectorFactory}. Use this constructor to
* programmatically create a factory.
- *
+ *
* @param languageCode
* @param abbreviationDictionary
* @param eosCharacters
@@ -61,7 +61,7 @@ public class SentenceDetectorFactory ext
Dictionary abbreviationDictionary, char[] eosCharacters) {
this.init(languageCode, useTokenEnd, abbreviationDictionary,
eosCharacters);
}
-
+
protected void init(String languageCode, boolean useTokenEnd,
Dictionary abbreviationDictionary, char[] eosCharacters) {
this.languageCode = languageCode;
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java?rev=1587944&r1=1587943&r2=1587944&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
Wed Apr 16 15:26:24 2014
@@ -17,6 +17,8 @@
package opennlp.tools.util.ext;
+import java.lang.reflect.Field;
+
/**
* The {@link ExtensionLoader} is responsible to load extensions to the
OpenNLP library.
* <p>
@@ -64,6 +66,24 @@ public class ExtensionLoader {
} catch (InstantiationException e) {
throw new ExtensionNotLoadedException(e);
} catch (IllegalAccessException e) {
+ // constructor is private. Try to load using INSTANCE
+ Field instanceField;
+ try {
+ instanceField = extClazz.getDeclaredField("INSTANCE");
+ } catch (NoSuchFieldException e1) {
+ throw new ExtensionNotLoadedException(e1);
+ } catch (SecurityException e1) {
+ throw new ExtensionNotLoadedException(e1);
+ }
+ if(instanceField != null) {
+ try {
+ return (T) instanceField.get(null);
+ } catch (IllegalArgumentException e1) {
+ throw new ExtensionNotLoadedException(e1);
+ } catch (IllegalAccessException e1) {
+ throw new ExtensionNotLoadedException(e1);
+ }
+ }
throw new ExtensionNotLoadedException(e);
}
}
Added:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java?rev=1587944&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
(added)
+++
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
Wed Apr 16 15:26:24 2014
@@ -0,0 +1,100 @@
+package opennlp.tools.doccat;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link DoccatFactory} class.
+ */
+public class DoccatFactoryTest {
+
+ private static ObjectStream<DocumentSample> createSampleStream()
+ throws IOException {
+
+ InputStreamFactory isf = new ResourceAsStreamFactory(
+ DoccatFactoryTest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+ return new DocumentSampleStream(new PlainTextByLineStream(isf, "UTF-8"));
+ }
+
+ private static DoccatModel train() throws IOException {
+ return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
+ TrainingParameters.defaultParams());
+ }
+
+ private static DoccatModel train(DoccatFactory factory) throws IOException {
+ return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
+ TrainingParameters.defaultParams(), factory);
+ }
+
+ @Test
+ public void testDefault() throws IOException {
+ DoccatModel model = train();
+
+ assertNotNull(model);
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ DoccatModel fromSerialized = new DoccatModel(in);
+
+ DoccatFactory factory = fromSerialized.getFactory();
+
+ assertNotNull(factory);
+
+ assertEquals(1, factory.getFeatureGenerators().length);
+ assertEquals(BagOfWordsFeatureGenerator.class,
+ factory.getFeatureGenerators()[0].getClass());
+
+ assertEquals(WhitespaceTokenizer.INSTANCE, factory.getTokenizer());
+
+ }
+
+ @Test
+ public void testCustom() throws IOException {
+ FeatureGenerator[] featureGenerators = { new BagOfWordsFeatureGenerator(),
+ new NGramFeatureGenerator() };
+ DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
+ featureGenerators);
+
+ DoccatModel model = train(factory);
+
+ assertNotNull(model);
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ model.serialize(out);
+ ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+
+ DoccatModel fromSerialized = new DoccatModel(in);
+
+ factory = fromSerialized.getFactory();
+
+ assertNotNull(factory);
+
+ assertEquals(2, factory.getFeatureGenerators().length);
+ assertEquals(BagOfWordsFeatureGenerator.class,
+ factory.getFeatureGenerators()[0].getClass());
+ assertEquals(NGramFeatureGenerator.class,
+ factory.getFeatureGenerators()[1].getClass());
+
+ assertEquals(SimpleTokenizer.INSTANCE.getClass(), factory.getTokenizer()
+ .getClass());
+
+ }
+
+}
Propchange:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain