OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/580e0d1e Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/580e0d1e Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/580e0d1e Branch: refs/heads/LangDetect Commit: 580e0d1e26ab2a9275f01506f3af56fe8fc32988 Parents: e220a72 Author: koji <k...@apache.org> Authored: Wed Apr 19 10:14:47 2017 +0900 Committer: koji <k...@apache.org> Committed: Wed Apr 19 10:14:47 2017 +0900 ---------------------------------------------------------------------- .../doccat/BagOfWordsFeatureGenerator.java | 6 +- .../tools/doccat/NGramFeatureGenerator.java | 11 +- .../doccat/BagOfWordsFeatureGeneratorTest.java | 62 +++++++++ .../tools/doccat/NGramFeatureGeneratorTest.java | 129 +++++++++++++++++++ 4 files changed, 201 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java index ac39afc..51a3277 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java @@ -21,6 +21,7 @@ package opennlp.tools.doccat; import java.util.ArrayList; import java.util.Collection; import java.util.Map; +import java.util.Objects; import opennlp.tools.util.featuregen.StringPattern; @@ -29,9 +30,10 @@ import opennlp.tools.util.featuregen.StringPattern; */ public class BagOfWordsFeatureGenerator implements FeatureGenerator { - private boolean useOnlyAllLetterTokens = false; + private final boolean useOnlyAllLetterTokens; public BagOfWordsFeatureGenerator() { + this(false); } BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) { @@ -40,7 +42,7 @@ public class BagOfWordsFeatureGenerator implements FeatureGenerator { @Override public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation) { - + Objects.requireNonNull(text, "text must not be null"); Collection<String> bagOfWords = new ArrayList<>(text.length); for (String word : text) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java index 967b105..6e1786f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Objects; import opennlp.tools.util.InvalidFormatException; @@ -30,9 +31,8 @@ import opennlp.tools.util.InvalidFormatException; */ public class NGramFeatureGenerator implements FeatureGenerator { - //default values for bigrams - private int minGram = 2; - private int maxGram = 2; + private final int minGram; + private final int maxGram; /** * Constructor for ngrams. @@ -59,7 +59,8 @@ public class NGramFeatureGenerator implements FeatureGenerator { /** * Default constructor for Bi grams */ - public NGramFeatureGenerator() { + public NGramFeatureGenerator() throws InvalidFormatException { + this(2, 2); } /** @@ -70,7 +71,7 @@ public class NGramFeatureGenerator implements FeatureGenerator { * @return a collection of n gram features */ public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) { - + Objects.requireNonNull(text, "text must not be null"); List<String> features = new ArrayList<>(); for (int i = 0; i <= text.length - minGram; i++) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java new file mode 100644 index 0000000..2b128d9 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.doccat; + +import java.util.Collections; + +import org.junit.Assert; +import org.junit.Test; + +public class BagOfWordsFeatureGeneratorTest { + + @Test + public void testNull() { + BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(); + try { + generator.extractFeatures(null, Collections.emptyMap()); + Assert.fail("NullPointerException must be thrown"); + } + catch (NullPointerException expected) { + } + } + + @Test + public void testEmpty() { + BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(); + + Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size()); + } + + @Test + public void testUseAllTokens() { + BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(); + + Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=12.345", "bow=feet", "bow=long"}, + generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"}, + Collections.emptyMap()).toArray()); + } + + @Test + public void testOnlyLetterTokens() { + BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(true); + + Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=feet", "bow=long"}, + generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"}, + Collections.emptyMap()).toArray()); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java new file mode 100644 index 0000000..0aef3ea --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.doccat; + +import java.util.Collections; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.util.InvalidFormatException; + +public class NGramFeatureGeneratorTest { + + static final String[] TOKENS = new String[]{"a", "b", "c", "d", "e", "f", "g"}; + + @Test + public void testNull() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(); + try { + generator.extractFeatures(null, Collections.emptyMap()); + Assert.fail("NullPointerException must be thrown"); + } + catch (NullPointerException expected) { + } + } + + @Test + public void testEmpty() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(); + + Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size()); + } + + @Test + public void testInvalidGramSize1() { + try { + new NGramFeatureGenerator(0, 1); + Assert.fail("InvalidFormatException must be thrown"); + } + catch (InvalidFormatException expected) { + } + } + + @Test + public void testInvalidGramSize2() { + try { + new NGramFeatureGenerator(2, 1); + Assert.fail("InvalidFormatException must be thrown"); + } + catch (InvalidFormatException expected) { + } + } + + @Test + public void testUnigram() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 1); + + Assert.assertArrayEquals( + new String[]{"ng=:a", "ng=:b", "ng=:c", "ng=:d", "ng=:e", "ng=:f", "ng=:g"}, + generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray()); + } + + @Test + public void testBigram() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(2, 2); + + Assert.assertArrayEquals( + new String[]{"ng=:a:b", "ng=:b:c", "ng=:c:d", "ng=:d:e", "ng=:e:f", "ng=:f:g"}, + generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray()); + } + + @Test + public void testTrigram() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(3, 3); + + Assert.assertArrayEquals( + new String[]{"ng=:a:b:c", "ng=:b:c:d", "ng=:c:d:e", "ng=:d:e:f", "ng=:e:f:g"}, + generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray()); + } + + @Test + public void test12gram() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 2); + + Assert.assertArrayEquals( + new String[]{ + "ng=:a", "ng=:a:b", + "ng=:b", "ng=:b:c", + "ng=:c", "ng=:c:d", + "ng=:d", "ng=:d:e", + "ng=:e", "ng=:e:f", + "ng=:f", "ng=:f:g", + "ng=:g" + }, + generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray()); + } + + @Test + public void test13gram() throws Exception { + NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 3); + + Assert.assertArrayEquals( + new String[]{ + "ng=:a", "ng=:a:b", "ng=:a:b:c", + "ng=:b", "ng=:b:c", "ng=:b:c:d", + "ng=:c", "ng=:c:d", "ng=:c:d:e", + "ng=:d", "ng=:d:e", "ng=:d:e:f", + "ng=:e", "ng=:e:f", "ng=:e:f:g", + "ng=:f", "ng=:f:g", + "ng=:g" + }, + generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray()); + } +}