Repository: opennlp Updated Branches: refs/heads/LangDetect 21a1f84b4 -> 4a3a707b8
Repair test cases Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/4a3a707b Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/4a3a707b Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/4a3a707b Branch: refs/heads/LangDetect Commit: 4a3a707b8a84861ffda47b992ebb3ac026d3cd90 Parents: 21a1f84 Author: Jörn Kottmann <[email protected]> Authored: Tue Jun 6 11:49:29 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Tue Jun 6 11:49:29 2017 +0200 ---------------------------------------------------------------------- .../LanguageDetectorContextGenerator.java | 22 +++++++++++++------- .../LanguageDetectorContextGeneratorTest.java | 17 +++++---------- .../LanguageDetectorCrossValidatorTest.java | 4 ++-- 3 files changed, 21 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a3a707b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java index dcfe0e9..b28c601 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -25,6 +25,12 @@ import opennlp.tools.util.StringList; import opennlp.tools.util.StringUtil; import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; import opennlp.tools.util.normalizer.CharSequenceNormalizer; +import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; +import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; +import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; +import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; +import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer; +import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; /** * Context generator for document categorizer @@ -40,12 +46,12 @@ class LanguageDetectorContextGenerator { this.maxLength = maxLength; this.normalizer = new AggregateCharSequenceNormalizer( - // EmojiCharSequenceNormalizer.getInstance(), - //UrlCharSequenceNormalizer.getInstance(), - //TwitterCharSequenceNormalizer.getInstance(), - //NumberCharSequenceNormalizer.getInstance(), - //UnicodeCharSequenceNormalizer.getInstance(), - //ShrinkCharSequenceNormalizer.getInstance()); + EmojiCharSequenceNormalizer.getInstance(), + UrlCharSequenceNormalizer.getInstance(), + TwitterCharSequenceNormalizer.getInstance(), + NumberCharSequenceNormalizer.getInstance(), + UnicodeCharSequenceNormalizer.getInstance(), + ShrinkCharSequenceNormalizer.getInstance() ); } @@ -53,7 +59,7 @@ class LanguageDetectorContextGenerator { * Initializes the current instance with min 2 length and max 5 length of ngrams. */ LanguageDetectorContextGenerator() { - this(3, 3); + this(2, 3); } public String[] getContext(String document) { @@ -66,7 +72,7 @@ class LanguageDetectorContextGenerator { for (StringList tokenList : model) { if (tokenList.size() > 0) { - context.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0))); + context.add(StringUtil.toLowerCase(tokenList.getToken(0))); } } return context.toArray(new String[context.size()]); http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a3a707b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java index 787dc1e..f6c8b18 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java @@ -34,17 +34,10 @@ public class LanguageDetectorContextGeneratorTest { Collection<String> features = Arrays.asList(cg.getContext(doc)); - Assert.assertEquals(38, features.size()); - Assert.assertTrue(features.contains("ng=ab")); - Assert.assertTrue(features.contains("ng=abc")); - Assert.assertTrue(features.contains("ng=abcd")); - Assert.assertTrue(features.contains("ng=abcde")); - Assert.assertTrue(features.contains("ng=abcde")); - - Assert.assertTrue(features.contains("ng= f")); - Assert.assertTrue(features.contains("ng= fg")); - Assert.assertTrue(features.contains("ng= fgh")); - Assert.assertTrue(features.contains("ng= fghi")); - + Assert.assertEquals(21, features.size()); + Assert.assertTrue(features.contains("ab")); + Assert.assertTrue(features.contains("abc")); + Assert.assertTrue(features.contains("e f")); + Assert.assertTrue(features.contains(" fg")); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a3a707b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java index 8e814e8..cb38b27 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java @@ -56,8 +56,8 @@ public class LanguageDetectorCrossValidatorTest { cv.evaluate(sampleStream, 2); - Assert.assertEquals(58, cv.getDocumentCount()); - Assert.assertEquals(0.83, cv.getDocumentAccuracy(), 0.01); + Assert.assertEquals(99, cv.getDocumentCount()); + Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01); } }
