OPENNLP-994: Remove deprecated methods from the Document Categorizer, this closes apache/opennlp#133
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/76609f5c Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/76609f5c Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/76609f5c Branch: refs/heads/parser_regression Commit: 76609f5c105bcfc3abab6e2d19de283d945c96a6 Parents: 81acc6e Author: smarthi <smar...@apache.org> Authored: Mon Feb 27 17:23:40 2017 -0500 Committer: Jörn Kottmann <jo...@apache.org> Committed: Sun Apr 16 19:24:54 2017 +0200 ---------------------------------------------------------------------- .../doccat/DoccatCrossValidatorTool.java | 7 +- .../tools/cmdline/doccat/DoccatTool.java | 11 +- .../tools/cmdline/doccat/DoccatTrainerTool.java | 5 +- .../opennlp/tools/doccat/DoccatFactory.java | 93 +---------------- .../tools/doccat/DocumentCategorizer.java | 54 ++-------- .../doccat/DocumentCategorizerEvaluator.java | 2 +- .../tools/doccat/DocumentCategorizerME.java | 101 ++----------------- .../opennlp/tools/doccat/DocumentSample.java | 6 -- .../formats/LeipzigDoccatSampleStream.java | 19 ++-- .../tools/doccat/DocumentCategorizerMETest.java | 18 ++-- .../tools/doccat/DocumentCategorizerNBTest.java | 17 ++-- .../tools/doccat/DocumentSampleTest.java | 4 +- .../doccat/AbstractDocumentCategorizer.java | 29 +++--- .../java/opennlp/uima/util/AnnotatorUtil.java | 6 +- 14 files changed, 66 insertions(+), 306 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java index f0f1712..a73aba7 100644 --- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java @@ -36,7 +36,6 @@ import opennlp.tools.doccat.DoccatEvaluationMonitor; import opennlp.tools.doccat.DoccatFactory; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.doccat.FeatureGenerator; -import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.util.eval.EvaluationMonitor; import opennlp.tools.util.model.ModelUtil; @@ -84,16 +83,12 @@ public final class DoccatCrossValidatorTool extends FeatureGenerator[] featureGenerators = DoccatTrainerTool .createFeatureGenerators(params.getFeatureGenerators()); - Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params - .getTokenizer()); - DoccatEvaluationMonitor[] listenersArr = listeners .toArray(new DoccatEvaluationMonitor[listeners.size()]); DoccatCrossValidator validator; try { - DoccatFactory factory = DoccatFactory.create(params.getFactory(), - tokenizer, featureGenerators); + DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); validator = new DoccatCrossValidator(params.getLang(), mlParams, factory, listenersArr); http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java index a01d354..49a640c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java @@ -28,6 +28,7 @@ import opennlp.tools.cmdline.SystemInputStreamFactory; import opennlp.tools.doccat.DoccatModel; import opennlp.tools.doccat.DocumentCategorizerME; import opennlp.tools.doccat.DocumentSample; +import 
opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ParagraphStream; import opennlp.tools.util.PlainTextByLineStream; @@ -36,7 +37,7 @@ public class DoccatTool extends BasicCmdLineTool { @Override public String getShortDescription() { - return "learnable document categorizer"; + return "learned document categorizer"; } @Override @@ -53,7 +54,7 @@ public class DoccatTool extends BasicCmdLineTool { DoccatModel model = new DoccatModelLoader().load(new File(args[0])); - DocumentCategorizerME doccat = new DocumentCategorizerME(model); + DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model); /* * moved initialization to the try block to catch new IOException @@ -68,10 +69,10 @@ public class DoccatTool extends BasicCmdLineTool { new SystemInputStreamFactory(), SystemInputStreamFactory.encoding())); String document; while ((document = documentStream.read()) != null) { - String[] tokens = model.getFactory().getTokenizer().tokenize(document); + String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document); - double[] prob = doccat.categorize(tokens); - String category = doccat.getBestCategory(prob); + double[] prob = documentCategorizerME.categorize(tokens); + String category = documentCategorizerME.getBestCategory(prob); DocumentSample sample = new DocumentSample(category, tokens); System.out.println(sample.toString()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java index 6ef5d88..8ebb5a8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java +++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java @@ -66,12 +66,9 @@ public class DoccatTrainerTool FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); - Tokenizer tokenizer = createTokenizer(params.getTokenizer()); - DoccatModel model; try { - DoccatFactory factory = DoccatFactory.create(params.getFactory(), - tokenizer, featureGenerators); + DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java index a6c815b..babab7c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java @@ -22,8 +22,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.BaseToolFactory; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.ext.ExtensionLoader; @@ -34,47 +32,17 @@ import opennlp.tools.util.ext.ExtensionLoader; public class DoccatFactory extends BaseToolFactory { private static final String FEATURE_GENERATORS = "doccat.featureGenerators"; - private static final String TOKENIZER_NAME = "doccat.tokenizer"; private FeatureGenerator[] featureGenerators; - private Tokenizer tokenizer; /** * Creates a {@link DoccatFactory} that provides the default implementation of * the resources. 
*/ - public DoccatFactory() { - this.tokenizer = WhitespaceTokenizer.INSTANCE; - } + public DoccatFactory() {} public DoccatFactory(final FeatureGenerator[] featureGenerators) { - this.tokenizer = WhitespaceTokenizer.INSTANCE; - this.featureGenerators = featureGenerators; - } - - /** - * Creates a {@link DoccatFactory}. Use this constructor to programmatically - * create a factory. - * - * @deprecated will be removed after 1.7.1 release. Don't use it. - * @param tokenizer the tokenizer - * @param featureGenerators the feature generators - */ - @Deprecated - public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) { - this.init(tokenizer, featureGenerators); - } - - /** - * @deprecated will be removed after 1.7.1 release. Don't use it. - * @param tokenizer the tokenizer - * @param featureGenerators feature generators - */ - @Deprecated - protected void init(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) { - this.featureGenerators = featureGenerators; - this.tokenizer = tokenizer; } protected void init(FeatureGenerator[] featureGenerators) { @@ -85,11 +53,6 @@ public class DoccatFactory extends BaseToolFactory { public Map<String, String> createManifestEntries() { Map<String, String> manifestEntries = super.createManifestEntries(); - if (getTokenizer() != null) { - manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass() - .getCanonicalName()); - } - if (getFeatureGenerators() != null) { manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString()); } @@ -115,31 +78,6 @@ public class DoccatFactory extends BaseToolFactory { // nothing to validate } - /** - * @deprecated will be removed after 1.7.1 release. Don't use it. 
- */ - @Deprecated - public static DoccatFactory create(String subclassName, Tokenizer tokenizer, - FeatureGenerator[] featureGenerators) throws InvalidFormatException { - if (subclassName == null) { - // will create the default factory - return new DoccatFactory(tokenizer, featureGenerators); - } - try { - DoccatFactory theFactory = ExtensionLoader.instantiateExtension( - DoccatFactory.class, subclassName); - theFactory.init(tokenizer, featureGenerators); - return theFactory; - } catch (Exception e) { - String msg = "Could not instantiate the " + subclassName - + ". The initialization throw an exception."; - System.err.println(msg); - e.printStackTrace(); - throw new InvalidFormatException(msg, e); - } - - } - public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators) throws InvalidFormatException { if (subclassName == null) { @@ -192,33 +130,4 @@ public class DoccatFactory extends BaseToolFactory { this.featureGenerators = featureGenerators; } - /** - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - public Tokenizer getTokenizer() { - if (this.tokenizer == null) { - if (artifactProvider != null) { - String className = artifactProvider.getManifestProperty(TOKENIZER_NAME); - if (className != null) { - this.tokenizer = ExtensionLoader.instantiateExtension( - Tokenizer.class, className); - } - } - if (this.tokenizer == null) { // could not load using artifact provider - this.tokenizer = WhitespaceTokenizer.INSTANCE; - } - } - return tokenizer; - } - - /** - * @deprecated will be removed after 1.7.1 release. Don't use it. 
- * @param tokenizer tokenizer - */ - @Deprecated - public void setTokenizer(Tokenizer tokenizer) { - this.tokenizer = tokenizer; - } - } http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java index 88bf8f9..b180549 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java @@ -27,23 +27,21 @@ import java.util.SortedMap; public interface DocumentCategorizer { /** - * Categorizes the given text, provided in separate tokens. + * Categorize the given text provided as tokens along with + * the provided extra information * * @param text the tokens of text to categorize + * @param extraInformation extra information * @return per category probabilities */ - double[] categorize(String[] text); + double[] categorize(String[] text, Map<String, Object> extraInformation); /** * Categorizes the given text, provided in separate tokens. - * - * @param text the tokens of text to categorize - * @param extraInformation optional extra information to pass for evaluation + * @param text the tokens of text to categorize * @return per category probabilities - * @deprecated will be removed after 1.7.1 release. Don't use it. 
*/ - @Deprecated - double[] categorize(String[] text, Map<String, Object> extraInformation); + double[] categorize(String[] text); /** * get the best category from previously generated outcome probabilities @@ -77,25 +75,6 @@ public interface DocumentCategorizer { int getNumberOfCategories(); /** - * categorize a piece of text - * - * @param documentText the text to categorize - * @return the probabilities of each category (sum up to 1) - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - double[] categorize(String documentText); - - /** - * categorize a piece of text, providing extra metadata. - * - * @param documentText the text to categorize - * @param extraInformation extra metadata - * @return the probabilities of each category (sum up to 1) - */ - double[] categorize(String documentText, Map<String, Object> extraInformation); - - /** * get the name of the category associated with the given probabilties * * @param results the probabilities of each category @@ -108,16 +87,6 @@ public interface DocumentCategorizer { * * @param text the input text to classify * @return a map with the score as a key. The value is a Set of categories with the score. - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - Map<String, Double> scoreMap(String text); - - /** - * Returns a map in which the key is the category name and the value is the score - * - * @param text the input text to classify - * @return a map with the score as a key. The value is a Set of categories with the score. */ Map<String, Double> scoreMap(String[] text); @@ -127,17 +96,6 @@ public interface DocumentCategorizer { * * @param text the input text to classify * @return a map with the score as a key. The value is a Set of categories with the score. - * @deprecated will be removed after 1.7.1 release. Don't use it. 
- */ - @Deprecated - SortedMap<Double, Set<String>> sortedScoreMap(String text); - - /** - * Get a map of the scores sorted in ascending aorder together with their associated categories. - * Many categories can have the same score, hence the Set as value - * - * @param text the input text to classify - * @return a map with the score as a key. The value is a Set of categories with the score. */ SortedMap<Double, Set<String>> sortedScoreMap(String[] text); http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java index 63e0768..c501280 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java @@ -59,7 +59,7 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> { String[] document = sample.getText(); - double[] probs = categorizer.categorize(document, sample.getExtraInformation()); + double[] probs = categorizer.categorize(document); String cat = categorizer.getBestCategory(probs); http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java index e743b9d..9dc41d7 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java @@ -29,8 +29,6 @@ import java.util.TreeMap; 
import opennlp.tools.ml.EventTrainer; import opennlp.tools.ml.TrainerFactory; import opennlp.tools.ml.model.MaxentModel; -import opennlp.tools.tokenize.SimpleTokenizer; -import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.TrainingParameters; @@ -48,22 +46,6 @@ public class DocumentCategorizerME implements DocumentCategorizer { private DocumentCategorizerContextGenerator mContextGenerator; /** - * Initializes the current instance with a doccat model and custom feature - * generation. The feature generation must be identical to the configuration - * at training time. - * - * @param model the doccat model - * @param featureGenerators the feature generators - * @deprecated train a {@link DoccatModel} with a specific - * {@link DoccatFactory} to customize the {@link FeatureGenerator}s - */ - @Deprecated - public DocumentCategorizerME(DoccatModel model, FeatureGenerator... featureGenerators) { - this.model = model; - this.mContextGenerator = new DocumentCategorizerContextGenerator(featureGenerators); - } - - /** * Initializes the current instance with a doccat model. Default feature * generation is used. * @@ -75,6 +57,13 @@ public class DocumentCategorizerME implements DocumentCategorizer { .getFactory().getFeatureGenerators()); } + /** + * Categorize the given text provided as tokens along with + * the provided extra information + * + * @param text text tokens to categorize + * @param extraInformation additional information + */ @Override public double[] categorize(String[] text, Map<String, Object> extraInformation) { return model.getMaxentModel().eval( @@ -83,58 +72,15 @@ public class DocumentCategorizerME implements DocumentCategorizer { /** * Categorizes the given text. + * * @param text the text to categorize */ + @Override public double[] categorize(String[] text) { return this.categorize(text, Collections.emptyMap()); } /** - * Categorizes the given text. 
The Tokenizer is obtained from - * {@link DoccatFactory#getTokenizer()} and defaults to - * {@link SimpleTokenizer}. - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - @Override - public double[] categorize(String documentText, - Map<String, Object> extraInformation) { - Tokenizer tokenizer = model.getFactory().getTokenizer(); - return categorize(tokenizer.tokenize(documentText), extraInformation); - } - - /** - * Categorizes the given text. The text is tokenized with the SimpleTokenizer - * before it is passed to the feature generation. - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - public double[] categorize(String documentText) { - Tokenizer tokenizer = model.getFactory().getTokenizer(); - return categorize(tokenizer.tokenize(documentText), Collections.emptyMap()); - } - - /** - * Returns a map in which the key is the category name and the value is the score - * - * @param text the input text to classify - * @return the score map - * @deprecated will be removed after 1.7.1 release. Don't use it. - */ - @Deprecated - public Map<String, Double> scoreMap(String text) { - Map<String, Double> probDist = new HashMap<>(); - - double[] categorize = categorize(text); - int catSize = getNumberOfCategories(); - for (int i = 0; i < catSize; i++) { - String category = getCategory(i); - probDist.put(category, categorize[getIndex(category)]); - } - return probDist; - } - - /** * Returns a map in which the key is the category name and the value is the score * * @param text the input text to classify @@ -160,35 +106,6 @@ public class DocumentCategorizerME implements DocumentCategorizer { * * @param text the input text to classify * @return the sorted score map - * @deprecated will be removed after 1.7.1 release. Don't use it. 
- */ - @Deprecated - @Override - public SortedMap<Double, Set<String>> sortedScoreMap(String text) { - SortedMap<Double, Set<String>> descendingMap = new TreeMap<>(); - double[] categorize = categorize(text); - int catSize = getNumberOfCategories(); - for (int i = 0; i < catSize; i++) { - String category = getCategory(i); - double score = categorize[getIndex(category)]; - if (descendingMap.containsKey(score)) { - descendingMap.get(score).add(category); - } else { - Set<String> newset = new HashSet<>(); - newset.add(category); - descendingMap.put(score, newset); - } - } - return descendingMap; - } - - /** - * Returns a map with the score as a key in ascending order. - * The value is a Set of categories with the score. - * Many categories can have the same score, hence the Set as value - * - * @param text the input text to classify - * @return the sorted score map */ @Override public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java index 3d107fa..adddc27 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java @@ -24,8 +24,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import opennlp.tools.tokenize.WhitespaceTokenizer; - /** * Class which holds a classified document and its category. 
*/ @@ -35,10 +33,6 @@ public class DocumentSample { private final List<String> text; private final Map<String, Object> extraInformation; - public DocumentSample(String category, String text) { - this(category, WhitespaceTokenizer.INSTANCE.tokenize(text)); - } - public DocumentSample(String category, String[] text) { this(category, text, null); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java index 1ca0484..8ed0036 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java @@ -20,6 +20,9 @@ package opennlp.tools.formats; import java.io.IOException; import java.io.PrintStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.tokenize.SimpleTokenizer; @@ -36,7 +39,7 @@ import opennlp.tools.util.PlainTextByLineStream; * <p> * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce - * exactly the same tokenization during testing and training. 
+ * exactly the same tokenization during testing and training. */ public class LeipzigDoccatSampleStream extends FilterObjectStream<String, DocumentSample> { @@ -79,10 +82,8 @@ public class LeipzigDoccatSampleStream extends } public DocumentSample read() throws IOException { - int count = 0; - - StringBuilder sampleText = new StringBuilder(); + List<String> tokensList = new ArrayList<>(); String line; while (count < sentencesPerDocument && (line = samples.read()) != null) { @@ -94,17 +95,13 @@ public class LeipzigDoccatSampleStream extends } // Always skip first token, that is the sentence number! - for (int i = 1; i < tokens.length; i++) { - sampleText.append(tokens[i]); - sampleText.append(' '); - } + tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length)); count++; } - - if (sampleText.length() > 0) { - return new DocumentSample(language, sampleText.toString()); + if (tokensList.size() > 0) { + return new DocumentSample(language, tokensList.toArray(new String[tokensList.size()])); } return null; http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java index 6389530..220df87 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java @@ -42,27 +42,23 @@ public class DocumentCategorizerMETest { new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0)); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); +
params.put(TrainingParameters.CUTOFF_PARAM, "0"); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); - double[] aProbs = doccat.categorize("a"); + double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); - double[] bProbs = doccat.categorize("x"); + double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. - SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a"); - for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) { - Assert.assertEquals("1", cat); - break; - } - System.out.println(""); - + SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); + Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); + Assert.assertEquals(1, cat.size()); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java index de3f098..0847690 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java @@ -44,8 +44,8 @@ public class DocumentCategorizerNBTest { new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0)); + 
params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "0"); params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, @@ -53,19 +53,16 @@ public class DocumentCategorizerNBTest { DocumentCategorizer doccat = new DocumentCategorizerME(model); - double[] aProbs = doccat.categorize("a"); + double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); - double[] bProbs = doccat.categorize("x"); + double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. - SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a"); - for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) { - Assert.assertEquals("1", cat); - break; - } - System.out.println(""); + SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); + Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); + Assert.assertEquals(1, cat.size()); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java index 232158b..8cf8fef 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java @@ -31,11 +31,11 @@ public class DocumentSampleTest { } public static DocumentSample createGoldSample() { - return new DocumentSample("aCategory", "a small text"); + return new DocumentSample("aCategory", new String[]{"a", "small", 
"text"}); } public static DocumentSample createPredSample() { - return new DocumentSample("anotherCategory", "a small text"); + return new DocumentSample("anotherCategory", new String[]{"a", "small", "text"}); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java index db9c075..4b49dca 100644 --- a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java +++ b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java @@ -17,12 +17,17 @@ package opennlp.uima.doccat; +import java.util.ArrayList; +import java.util.List; + import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.CasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.resource.ResourceAccessException; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; @@ -72,29 +77,25 @@ abstract class AbstractDocumentCategorizer extends CasAnnotator_ImplBase { mCategorizer = new DocumentCategorizerME(model); } - public void typeSystemInit(TypeSystem typeSystem) - throws AnalysisEngineProcessException { + public void typeSystemInit(TypeSystem typeSystem) throws AnalysisEngineProcessException { mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem, - UimaUtil.SENTENCE_TYPE_PARAMETER); + UimaUtil.TOKEN_TYPE_PARAMETER); } protected abstract void setBestCategory(CAS cas, String bestCategory); public void process(CAS 
cas) { - double[] result; - - if (mTokenType != null) { - // TODO: - // count tokens - // create token array - // pass array to doccat - // create result annotation - result = mCategorizer.categorize(cas.getDocumentText()); - } else { - result = mCategorizer.categorize(cas.getDocumentText()); + FSIterator<AnnotationFS> tokenAnnotations = cas.getAnnotationIndex(mTokenType).iterator(); + List<String> tokensList = new ArrayList<>(); + + while (tokenAnnotations.hasNext()) { + tokensList.add(tokenAnnotations.next().getCoveredText()); } + double[] result = + mCategorizer.categorize(tokensList.toArray(new String[tokensList.size()])); + String bestCategory = mCategorizer.getBestCategory(result); setBestCategory(cas, bestCategory); http://git-wip-us.apache.org/repos/asf/opennlp/blob/76609f5c/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java index 8847107..730d6be 100644 --- a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java +++ b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java @@ -329,8 +329,7 @@ public final class AnnotatorUtil { } else { throw new ResourceInitializationException( ExceptionMessages.MESSAGE_CATALOG, - ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter, - "String array"}); + ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter, "String array"}); } } @@ -443,8 +442,7 @@ public final class AnnotatorUtil { if (inResource == null) { throw new ResourceInitializationException( ExceptionMessages.MESSAGE_CATALOG, - ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name - + " could not be found!"}); + ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name + " could not be found!"}); } return inResource;