Repository: opennlp Updated Branches: refs/heads/LangDetect 6e9da1c00 -> 0b5e4a491
Generate 1 - 3 char grams, disable unicode normalizer Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/0b5e4a49 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/0b5e4a49 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/0b5e4a49 Branch: refs/heads/LangDetect Commit: 0b5e4a491c5480ce53ebb48632a3c12afcdc8b29 Parents: 6e9da1c Author: Jörn Kottmann <[email protected]> Authored: Wed Jun 14 12:23:12 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Wed Jun 14 12:23:12 2017 +0200 ---------------------------------------------------------------------- .../langdetect/LanguageDetectorContextGenerator.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/0b5e4a49/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java index b28c601..699d2eb 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -17,19 +17,17 @@ package opennlp.tools.langdetect; +import java.util.ArrayList; import java.util.Collection; -import java.util.LinkedList; import opennlp.tools.ngram.NGramModel; import opennlp.tools.util.StringList; -import opennlp.tools.util.StringUtil; import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; import opennlp.tools.util.normalizer.CharSequenceNormalizer; import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; -import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer; import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; /** @@ -50,21 +48,21 @@ class LanguageDetectorContextGenerator { UrlCharSequenceNormalizer.getInstance(), TwitterCharSequenceNormalizer.getInstance(), NumberCharSequenceNormalizer.getInstance(), - UnicodeCharSequenceNormalizer.getInstance(), + // UnicodeCharSequenceNormalizer.getInstance(), ShrinkCharSequenceNormalizer.getInstance() ); } /** - * Initializes the current instance with min 2 length and max 5 length of ngrams. + * Initializes the current instance with min 1 length and max 3 length of ngrams. */ LanguageDetectorContextGenerator() { - this(2, 3); + this(1, 3); } public String[] getContext(String document) { - Collection<String> context = new LinkedList<>(); + Collection<String> context = new ArrayList<>(); NGramModel model = new NGramModel(); String normalized = normalizer.normalize(document).toString(); @@ -72,7 +70,7 @@ class LanguageDetectorContextGenerator { for (StringList tokenList : model) { if (tokenList.size() > 0) { - context.add(StringUtil.toLowerCase(tokenList.getToken(0))); + context.add(tokenList.getToken(0)); } } return context.toArray(new String[context.size()]);
