Repository: opennlp Updated Branches: refs/heads/LangDetect 260f52f06 -> 8d731904b
Changed the LD factory to allow customization Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/8d731904 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/8d731904 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/8d731904 Branch: refs/heads/LangDetect Commit: 8d731904bdf3a6742fdea49e0070083e2cc8bf24 Parents: 260f52f Author: William D C M SILVA <[email protected]> Authored: Wed Jun 14 10:55:15 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Wed Jun 14 10:55:15 2017 -0300 ---------------------------------------------------------------------- .../LanguageDetectorContextGenerator.java | 51 ++++++++++--------- .../langdetect/LanguageDetectorEventStream.java | 6 +-- .../langdetect/LanguageDetectorFactory.java | 14 ++++++ .../tools/langdetect/LanguageDetectorME.java | 4 +- .../opennlp/tools/langdetect/DummyFactory.java | 53 ++++++++++++++++++++ .../LanguageDetectorContextGeneratorTest.java | 2 +- .../langdetect/LanguageDetectorFactoryTest.java | 30 ++++++++--- .../langdetect/LanguageDetectorMETest.java | 1 + 8 files changed, 125 insertions(+), 36 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java index d1f1f4b..a467521 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -24,48 +24,53 @@ import opennlp.tools.ngram.NGramModel; import opennlp.tools.util.StringList; import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; import opennlp.tools.util.normalizer.CharSequenceNormalizer; -import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; -import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; -import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; -import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; -import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; /** - * Context generator for document categorizer + * A context generator for language detector. */ class LanguageDetectorContextGenerator { - private final int minLength; - private final int maxLength; - private final CharSequenceNormalizer normalizer; + protected final int minLength; + protected final int maxLength; + protected final CharSequenceNormalizer normalizer; - LanguageDetectorContextGenerator(int minLength, int maxLength) { + /** + * Creates a customizable @{@link LanguageDetectorContextGenerator} that computes ngrams from text + * @param minLength min ngrams chars + * @param maxLength max ngrams chars + * @param normalizers zero or more normalizers to + * be applied in to the text before extracting ngrams + */ + public LanguageDetectorContextGenerator(int minLength, int maxLength, + CharSequenceNormalizer... normalizers) { this.minLength = minLength; this.maxLength = maxLength; - this.normalizer = new AggregateCharSequenceNormalizer( - EmojiCharSequenceNormalizer.getInstance(), - UrlCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance() - ); + this.normalizer = new AggregateCharSequenceNormalizer(normalizers); } /** - * Initializes the current instance with min 1 length and max 3 length of ngrams. + * Generates the context for a document. It normalizers the text using normalizers before. + * Classes that extends @{@link LanguageDetectorContextGenerator} should not extend this method, + * but {@link #getContextNormalized(String)}. + * @param document document to extract context from + * @return the generated context */ - LanguageDetectorContextGenerator() { - this(1, 3); + public final String[] getContext(String document) { + return getContextNormalized(this.normalizer.normalize(document).toString()); } - public String[] getContext(String document) { + /** + * Extension point of the {@link LanguageDetectorContextGenerator}. + * @param document document to extract context from + * @return the generated context + */ + protected String[] getContextNormalized(String document) { Collection<String> context = new ArrayList<>(); NGramModel model = new NGramModel(); - String normalized = normalizer.normalize(document).toString(); - model.add(normalized, minLength, maxLength); + model.add(document, minLength, maxLength); for (StringList tokenList : model) { if (tokenList.size() > 0) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java index b556a4d..19e6d46 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java @@ -35,11 +35,11 @@ public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSam * * @param data {@link ObjectStream} of {@link LanguageSample}s */ - public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) { + public LanguageDetectorEventStream(ObjectStream<LanguageSample> data, + LanguageDetectorContextGenerator contextGenerator) { super(data); - mContextGenerator = - new LanguageDetectorContextGenerator(); + mContextGenerator = contextGenerator; } @Override http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java index 5cebbba..11357ec 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java @@ -20,10 +20,24 @@ package opennlp.tools.langdetect; import opennlp.tools.util.BaseToolFactory; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.ext.ExtensionLoader; +import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; +import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; +import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; +import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; +import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; public class LanguageDetectorFactory extends BaseToolFactory { + public LanguageDetectorContextGenerator getContextGenerator() { + return new LanguageDetectorContextGenerator(1, 3, + EmojiCharSequenceNormalizer.getInstance(), + UrlCharSequenceNormalizer.getInstance(), + TwitterCharSequenceNormalizer.getInstance(), + NumberCharSequenceNormalizer.getInstance(), + ShrinkCharSequenceNormalizer.getInstance()); + } + public static LanguageDetectorFactory create(String subclassName) throws InvalidFormatException { if (subclassName == null) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java index 74a1cea..3af6afd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java @@ -45,7 +45,7 @@ public class LanguageDetectorME implements LanguageDetector { */ public LanguageDetectorME(LanguageDetectorModel model) { this.model = model; - this.mContextGenerator = new LanguageDetectorContextGenerator(); + this.mContextGenerator = model.getFactory().getContextGenerator(); } @Override @@ -90,7 +90,7 @@ public class LanguageDetectorME implements LanguageDetector { mlParams, manifestInfoEntries); MaxentModel model = trainer.train( - new LanguageDetectorEventStream(samples)); + new LanguageDetectorEventStream(samples, factory.getContextGenerator())); return new LanguageDetectorModel(model, manifestInfoEntries, factory); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java index cbe7d1a..f3c7dd8 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java @@ -18,6 +18,16 @@ package opennlp.tools.langdetect; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import opennlp.tools.ngram.NGramModel; +import opennlp.tools.tokenize.SimpleTokenizer; +import opennlp.tools.util.StringList; +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + public class DummyFactory extends LanguageDetectorFactory { @@ -30,4 +40,47 @@ public class DummyFactory extends LanguageDetectorFactory { super.init(); } + @Override + public LanguageDetectorContextGenerator getContextGenerator() { + return new DummyFactory.MyContectGenerator(1, 5, + new DummyFactory.UpperCaseNormalizer()); + } + + public class UpperCaseNormalizer implements CharSequenceNormalizer { + @Override + public CharSequence normalize(CharSequence text) { + return text.toString().toUpperCase(); + } + } + + public class MyContectGenerator extends LanguageDetectorContextGenerator { + + public MyContectGenerator(int min, int max, CharSequenceNormalizer ... normalizers) { + super(min, max, normalizers); + } + + @Override + public String[] getContextNormalized(String document) { + String[] superContext = super.getContextNormalized(document); + + List<String> context = new ArrayList(Arrays.asList(superContext)); + + SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; + String[] words = tokenizer.tokenize(document); + NGramModel tokenNgramModel = new NGramModel(); + if(words.length > 0) { + tokenNgramModel.add(new StringList(words), 1, 3); + Iterator tokenNgramIterator = tokenNgramModel.iterator(); + + while(tokenNgramIterator.hasNext()) { + StringList tokenList = (StringList)tokenNgramIterator.next(); + if(tokenList.size() > 0) { + context.add("tg=" + tokenList.toString()); + } + } + } + + return context.toArray(new String[context.size()]); + } + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java index c800688..dc6ca26 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java @@ -30,7 +30,7 @@ public class LanguageDetectorContextGeneratorTest { public void extractContext() throws Exception { String doc = "abcde fghijk"; - LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(); + LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3); Collection<String> features = Arrays.asList(cg.getContext(doc)); http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java index 2a6c0ce..781326b 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java @@ -20,9 +20,12 @@ package opennlp.tools.langdetect; import java.io.ByteArrayInputStream; import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; import org.junit.Assert; -import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import opennlp.tools.formats.ResourceAsStreamFactory; @@ -32,10 +35,10 @@ import opennlp.tools.util.TrainingParameters; public class LanguageDetectorFactoryTest { - private LanguageDetectorModel model; + private static LanguageDetectorModel model; - @Before - public void train() throws Exception { + @BeforeClass + public static void train() throws Exception { ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory( LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"); @@ -47,8 +50,9 @@ public class LanguageDetectorFactoryTest { TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, "100"); params.put(TrainingParameters.CUTOFF_PARAM, "0"); + params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES"); - this.model = LanguageDetectorME.train(sampleStream, params, new DummyFactory()); + model = LanguageDetectorME.train(sampleStream, params, new DummyFactory()); } @Test @@ -63,13 +67,25 @@ public class LanguageDetectorFactoryTest { @Test public void testDummyFactory() throws Exception { - byte[] serialized = LanguageDetectorMETest.serializeModel( - LanguageDetectorMETest.trainModel(new DummyFactory())); + byte[] serialized = LanguageDetectorMETest.serializeModel(model); LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized)); Assert.assertTrue(myModel.getFactory() instanceof DummyFactory); + } + + @Test + public void testDummyFactoryContextGenerator() throws Exception { + LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator(); + String[] context = cg.getContext( + "a dummy text phrase to test if the context generator works!!!!!!!!!!!!"); + + Set<String> set = new HashSet(Arrays.asList(context)); + + Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated ! + Assert.assertTrue(set.contains("a dum")); + Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]")); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/8d731904/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java index 8caca1d..beb7589 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java @@ -98,6 +98,7 @@ public class LanguageDetectorMETest { TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, "100"); params.put(TrainingParameters.CUTOFF_PARAM, "2"); + params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES"); return LanguageDetectorME.train(sampleStream, params, factory); }
