OPENNLP-1029: Add tests for InsufficientTrainingDataException, closes apache/opennlp#167
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d447459a Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d447459a Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d447459a Branch: refs/heads/LangDetect Commit: d447459a682cdc7e06b3980a59100ea94a6a180a Parents: 45ea3f7 Author: jzonthemtn <jeff...@mtnfog.com> Authored: Wed Apr 19 14:11:20 2017 -0400 Committer: smarthi <smar...@apache.org> Committed: Wed Apr 19 14:11:20 2017 -0400 ---------------------------------------------------------------------- .../opennlp/tools/chunker/ChunkerMETest.java | 18 +++++++++++ .../tools/doccat/DocumentCategorizerMETest.java | 17 ++++++++++ .../tools/lemmatizer/LemmatizerMETest.java | 21 +++++++++++-- .../TokenNameFinderCrossValidatorTest.java | 33 +++++++++++++++++--- .../opennlp/tools/postag/POSTaggerMETest.java | 24 ++++++++++++-- .../sentdetect/SentenceDetectorMETest.java | 28 +++++++++++++++-- .../opennlp/tools/tokenize/TokenizerMETest.java | 26 +++++++++++++++ .../opennlp/tools/chunker/test-insufficient.txt | 1 + .../tools/lemmatizer/trial.old-insufficient.tsv | 1 + .../namefind/AnnotatedSentencesInsufficient.txt | 5 +++ .../postag/AnnotatedSentencesInsufficient.txt | 1 + .../tools/sentdetect/SentencesInsufficient.txt | 1 + .../tools/tokenize/token-insufficient.train | 1 + 13 files changed, 166 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java index 51112df..facb408 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java @@ -27,6 +27,7 @@ import org.junit.Test; import opennlp.tools.formats.ResourceAsStreamFactory; import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.Sequence; @@ -128,5 +129,22 @@ public class ChunkerMETest { Assert.assertEquals(Arrays.asList(expect1), preds[0].getOutcomes()); Assert.assertNotSame(Arrays.asList(expect1), preds[1].getOutcomes()); } + + @Test(expected = InsufficientTrainingDataException.class) + public void testInsufficientData() throws IOException { + + ResourceAsStreamFactory in = new ResourceAsStreamFactory(getClass(), + "/opennlp/tools/chunker/test-insufficient.txt"); + + ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream( + new PlainTextByLineStream(in, StandardCharsets.UTF_8)); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "70"); + params.put(TrainingParameters.CUTOFF_PARAM, "1"); + + ChunkerME.train("en", sampleStream, params, new ChunkerFactory()); + + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java index 220df87..391125e 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java @@ -24,6 +24,7 @@ import java.util.SortedMap; import org.junit.Assert; import org.junit.Test; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ObjectStreamUtils; import opennlp.tools.util.TrainingParameters; @@ -61,4 +62,20 @@ public class DocumentCategorizerMETest { Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); } + + @Test(expected = InsufficientTrainingDataException.class) + public void insufficientTestData() throws IOException { + + ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( + new DocumentSample("1", new String[]{"a", "b", "c"})); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "0"); + + DocumentCategorizerME.train("x-unspecified", samples, + params, new DoccatFactory()); + + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java index 4631763..f00f2b4 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java @@ -24,6 +24,7 @@ import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.MockInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; @@ -68,8 +69,8 @@ public class LemmatizerMETest { new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8")); TrainingParameters params = new TrainingParameters(); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5)); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "5"); LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory()); @@ -84,5 +85,21 @@ public class LemmatizerMETest { Assert.assertArrayEquals(expect, lemmas); } + + @Test(expected = InsufficientTrainingDataException.class) + public void testInsufficientData() throws IOException { + + ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream( + new PlainTextByLineStream(new MockInputStreamFactory( + new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")), + "UTF-8")); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "5"); + + LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory()); + + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java index 679726d..9e31987 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java @@ -28,6 +28,7 @@ import org.junit.Test; import opennlp.tools.cmdline.namefind.NameEvaluationErrorListener; import opennlp.tools.formats.ResourceAsStreamFactory; import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; @@ -50,8 +51,8 @@ public class TokenNameFinderCrossValidatorTest { new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1)); TrainingParameters mlParams = new TrainingParameters(); - mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70)); - mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1)); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "1"); mlParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.toString()); @@ -77,8 +78,8 @@ public class TokenNameFinderCrossValidatorTest { new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1)); TrainingParameters mlParams = new TrainingParameters(); - mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70)); - mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1)); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "1"); mlParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.toString()); @@ -95,4 +96,28 @@ public class TokenNameFinderCrossValidatorTest { Assert.assertTrue(out.size() > 0); Assert.assertNotNull(cv.getFMeasure()); } + + @Test(expected = InsufficientTrainingDataException.class) + public void testWithInsufficientData() throws Exception { + + InputStreamFactory in = new ResourceAsStreamFactory(getClass(), + "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt"); + + ObjectStream<NameSample> sampleStream = new NameSampleDataStream( + new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1)); + + TrainingParameters mlParams = new TrainingParameters(); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "1"); + + mlParams.put(TrainingParameters.ALGORITHM_PARAM, + ModelType.MAXENT.toString()); + + TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", + TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null); + + cv.evaluate(sampleStream, 2); + + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java index 51cae2c..e2bca48 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java @@ -25,6 +25,7 @@ import org.junit.Test; import opennlp.tools.formats.ResourceAsStreamFactory; import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; @@ -50,8 +51,8 @@ public class POSTaggerMETest { static POSModel trainPOSModel(ModelType type) throws IOException { TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, type.toString()); - params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5)); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "5"); return POSTaggerME.train("en", createSampleStream(), params, new POSTaggerFactory()); @@ -85,4 +86,23 @@ public class POSTaggerMETest { ObjectStream<POSSample> samples = createSampleStream(); POSTaggerME.buildNGramDictionary(samples, 0); } + + @Test(expected = InsufficientTrainingDataException.class) + public void insufficientTestData() throws IOException { + + InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class, + "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt"); + + ObjectStream<POSSample> stream = new WordTagSampleStream( + new PlainTextByLineStream(in, StandardCharsets.UTF_8)); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name()); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "5"); + + POSTaggerME.train("en", stream, params, new POSTaggerFactory()); + + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java index 43d5829..220650d 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java @@ -26,6 +26,7 @@ import org.junit.Test; import opennlp.tools.formats.ResourceAsStreamFactory; import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InsufficientTrainingDataException; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.Span; import opennlp.tools.util.TrainingParameters; @@ -42,12 +43,14 @@ public class SentenceDetectorMETest { "/opennlp/tools/sentdetect/Sentences.txt"); TrainingParameters mlParams = new TrainingParameters(); - mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100)); - mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0)); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "0"); + + SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null); SentenceModel sentdetectModel = SentenceDetectorME.train( "en", new SentenceSampleStream(new PlainTextByLineStream(in, - StandardCharsets.UTF_8)), true, null, mlParams); + StandardCharsets.UTF_8)), factory, mlParams); Assert.assertEquals("en", sentdetectModel.getLanguage()); @@ -132,4 +135,23 @@ public class SentenceDetectorMETest { Assert.assertEquals(new Span(16, 56), pos[1]); } + + @Test(expected = InsufficientTrainingDataException.class) + public void testInsufficientData() throws IOException { + + InputStreamFactory in = new ResourceAsStreamFactory(getClass(), + "/opennlp/tools/sentdetect/SentencesInsufficient.txt"); + + TrainingParameters mlParams = new TrainingParameters(); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "0"); + + SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null); + + SentenceDetectorME.train("en", + new SentenceSampleStream( + new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams); + + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java index 5a7a811..14b9185 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java @@ -18,10 +18,18 @@ package opennlp.tools.tokenize; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InsufficientTrainingDataException; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; + /** * Tests for the {@link TokenizerME} class. * @@ -65,4 +73,22 @@ public class TokenizerMETest { Assert.assertEquals("through", tokens[7]); Assert.assertEquals("!", tokens[8]); } + + @Test(expected = InsufficientTrainingDataException.class) + public void testInsufficientData() throws IOException { + + InputStreamFactory trainDataIn = new ResourceAsStreamFactory( + TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train"); + + ObjectStream<TokenSample> samples = new TokenSampleStream( + new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8)); + + TrainingParameters mlParams = new TrainingParameters(); + mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100"); + mlParams.put(TrainingParameters.CUTOFF_PARAM, "5"); + + TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams); + + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt new file mode 100644 index 0000000..a578590 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt @@ -0,0 +1 @@ +Rockwell NNP B-NP \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv new file mode 100644 index 0000000..89c2aee --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv @@ -0,0 +1 @@ +The DT the \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt new file mode 100644 index 0000000..c70ec6d --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt @@ -0,0 +1,5 @@ +Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years. +I just knew his name , <START> Alan McKennedy <END> , and I'd heard the rumour that he'd moved to Scotland, the country of his ancestors. +So I called <START> Julie <END> , a friend who's still in contact with him. +She told me that he lived in 23213 Edinburgh, Worcesterstreet 12. +I wrote him a letter right away and he answered soon, sounding very happy and delighted. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt new file mode 100644 index 0000000..786f182 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt @@ -0,0 +1 @@ +Find_VB out_RP. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt new file mode 100644 index 0000000..0465ce2 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt @@ -0,0 +1 @@ +Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years. http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train new file mode 100644 index 0000000..db4a49d --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train @@ -0,0 +1 @@ +I tried to find out the address of an old school . \ No newline at end of file