Repository: opennlp Updated Branches: refs/heads/OPENNLP-778 f0dcf22d4 -> 3d7a20708 (forced update)
OPENNLP-778: Add LanguageDetector infrastructure classes Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3d7a2070 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3d7a2070 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3d7a2070 Branch: refs/heads/OPENNLP-778 Commit: 3d7a207087db4503d5111b67604603f13956dd8c Parents: 11d7581 Author: William D C M SILVA <[email protected]> Authored: Mon Mar 13 13:48:33 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Tue Mar 14 13:34:19 2017 -0300 ---------------------------------------------------------------------- .../java/opennlp/tools/langdetect/Language.java | 34 +++++ .../tools/langdetect/LanguageDetector.java | 10 +- .../LanguageDetectorContextGenerator.java | 63 +++++++++ .../langdetect/LanguageDetectorEventStream.java | 69 ++++++++++ .../langdetect/LanguageDetectorFactory.java | 53 ++++++++ .../tools/langdetect/LanguageDetectorME.java | 99 ++++++++++++++ .../tools/langdetect/LanguageDetectorModel.java | 82 ++++++++++++ .../langdetect/LanguageDetectorSample.java | 75 +++++++++++ .../LanguageDetectorSampleStream.java | 58 ++++++++ .../opennlp/tools/langdetect/DummyFactory.java | 33 +++++ .../LanguageDetectorContextGeneratorTest.java | 50 +++++++ .../langdetect/LanguageDetectorFactoryTest.java | 64 +++++++++ .../langdetect/LanguageDetectorMETest.java | 134 +++++++++++++++++++ .../langdetect/LanguageDetectorSampleTest.java | 89 ++++++++++++ .../opennlp/tools/langdetect/LanguageTest.java | 97 ++++++++++++++ 15 files changed, 1004 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java index 773201f..57655b4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java @@ -17,6 +17,8 @@ package opennlp.tools.langdetect; +import java.util.Objects; + /** * Class for holding the document language and its confidence */ @@ -24,7 +26,12 @@ public class Language { private final String lang; private final double confidence; + public Language(String lang) { + this(lang, 0); + } + public Language(String lang, double confidence) { + Objects.requireNonNull(lang, "lang must not be null"); this.lang = lang; this.confidence = confidence; } @@ -36,4 +43,31 @@ public class Language { public double getConfidence() { return confidence; } + + @Override + public String toString() { + + return getLang(); + } + + @Override + public int hashCode() { + return Objects.hash(getLang(), getConfidence()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof Language) { + Language a = (Language) obj; + + return getLang().equals(a.getLang()) + && getConfidence() == a.getConfidence(); + } + + return false; + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java index ca897fd..0004494 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java @@ -17,17 +17,15 @@ package opennlp.tools.langdetect; -import java.util.Set; - /** - * The interface for name finders which provide name tags for a sequence of tokens. 
+ * The interface for language detectors, which provide the {@link Language} according to the context. */ public interface LanguageDetector { - Language[] detectLanguage(CharSequence content); + Language[] predictLanguages(CharSequence content); - Set<String> getSupportedLanguages(); + Language predictLanguage(CharSequence content); - String getLanguageCoding(); + String[] getSupportedLanguages(); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java new file mode 100644 index 0000000..b3caeea --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.util.Collection; +import java.util.LinkedList; + +import opennlp.tools.ngram.NGramModel; +import opennlp.tools.util.StringList; +import opennlp.tools.util.StringUtil; + +/** + * Context generator for the language detector + */ +class LanguageDetectorContextGenerator { + + private final int minLength; + private final int maxLength; + + LanguageDetectorContextGenerator(int minLength, int maxLength) { + this.minLength = minLength; + this.maxLength = maxLength; + } + + /** + * Initializes the current instance with a minimum n-gram length of 2 and a maximum of 5. + */ + LanguageDetectorContextGenerator() { + this(2, 5); + } + + public String[] getContext(String document) { + + Collection<String> context = new LinkedList<>(); + + NGramModel model = new NGramModel(); + model.add(document, minLength, maxLength); + + for (StringList tokenList : model) { + if (tokenList.size() > 0) { + context.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0))); + } + } + + + return context.toArray(new String[context.size()]); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java new file mode 100644 index 0000000..cfe5f7c --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Iterator; + +import opennlp.tools.ml.model.Event; +import opennlp.tools.util.AbstractEventStream; +import opennlp.tools.util.ObjectStream; + +/** + * Iterator-like class for modeling language detector events. + */ +public class LanguageDetectorEventStream extends AbstractEventStream<LanguageDetectorSample> { + + private LanguageDetectorContextGenerator mContextGenerator; + + /** + * Initializes the current instance via samples and feature generators. 
+ * + * @param data {@link ObjectStream} of {@link LanguageDetectorSample}s + */ + public LanguageDetectorEventStream(ObjectStream<LanguageDetectorSample> data) { + super(data); + + mContextGenerator = + new LanguageDetectorContextGenerator(); + } + + @Override + protected Iterator<Event> createEvents(final LanguageDetectorSample sample) { + + return new Iterator<Event>() { + + private boolean isVirgin = true; + + public boolean hasNext() { + return isVirgin; + } + + public Event next() { + + isVirgin = false; + + return new Event(sample.getLanguage().getLang(), + mContextGenerator.getContext(sample.getContext().toString())); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java new file mode 100644 index 0000000..5cebbba --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ext.ExtensionLoader; + + +public class LanguageDetectorFactory extends BaseToolFactory { + + public static LanguageDetectorFactory create(String subclassName) + throws InvalidFormatException { + if (subclassName == null) { + // will create the default factory + return new LanguageDetectorFactory(); + } + try { + LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension( + LanguageDetectorFactory.class, subclassName); + theFactory.init(); + return theFactory; + } catch (Exception e) { + String msg = "Could not instantiate the " + subclassName + + ". The initialization throw an exception."; + throw new InvalidFormatException(msg, e); + } + } + + public void init() { + // nothing to do + } + + @Override + public void validateArtifactMap() throws InvalidFormatException { + // nothing to validate + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java new file mode 100644 index 0000000..29c7f15 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import opennlp.tools.ml.EventTrainer; +import opennlp.tools.ml.TrainerFactory; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; + +/** + * Implements learnable Language Detector + */ +public class LanguageDetectorME implements LanguageDetector { + + private LanguageDetectorModel model; + private LanguageDetectorContextGenerator mContextGenerator; + + /** + * Initializes the current instance with a language detector model. Default feature + * generation is used. 
+ * + * @param model the language detector model + */ + public LanguageDetectorME(LanguageDetectorModel model) { + this.model = model; + this.mContextGenerator = new LanguageDetectorContextGenerator(); + } + + @Override + public Language[] predictLanguages(CharSequence content) { + double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString())); + Language[] arr = new Language[eval.length]; + for (int i = 0; i < eval.length; i++) { + arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]); + } + + Arrays.sort(arr, new Comparator<Language>() { + @Override + public int compare(Language o1, Language o2) { + return Double.compare(o2.getConfidence(), o1.getConfidence()); + } + }); + return arr; + } + + @Override + public Language predictLanguage(CharSequence content) { + return predictLanguages(content)[0]; + } + + @Override + public String[] getSupportedLanguages() { + int numberLanguages = model.getMaxentModel().getNumOutcomes(); + String[] languages = new String[numberLanguages]; + for (int i = 0; i < numberLanguages; i++) { + languages[i] = model.getMaxentModel().getOutcome(i); + } + return languages; + } + + + public static LanguageDetectorModel train(ObjectStream<LanguageDetectorSample> samples, + TrainingParameters mlParams, + LanguageDetectorFactory factory) + throws IOException { + + Map<String, String> manifestInfoEntries = new HashMap<>(); + + EventTrainer trainer = TrainerFactory.getEventTrainer( + mlParams, manifestInfoEntries); + + MaxentModel model = trainer.train( + new LanguageDetectorEventStream(samples)); + + return new LanguageDetectorModel(model, manifestInfoEntries, factory); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java new file mode 100644 index 0000000..c0d9703 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Map; + +import opennlp.tools.ml.model.AbstractModel; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.BaseModel; + +/** + * A model for language detection + */ +public class LanguageDetectorModel extends BaseModel { + + private static final String COMPONENT_NAME = "LanguageDetectorME"; + private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model"; + + public LanguageDetectorModel(MaxentModel langdetectModel, + Map<String, String> manifestInfoEntries, + LanguageDetectorFactory factory) { + super(COMPONENT_NAME, "und", manifestInfoEntries, factory); + + artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel); + checkArtifactMap(); + } + + public LanguageDetectorModel(InputStream in) throws IOException { + super(COMPONENT_NAME, in); + } + + public LanguageDetectorModel(File modelFile) throws IOException { + super(COMPONENT_NAME, modelFile); + } + + public LanguageDetectorModel(URL modelURL) throws IOException { + super(COMPONENT_NAME, modelURL); + } + + @Override + protected void validateArtifactMap() throws InvalidFormatException { + super.validateArtifactMap(); + + if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) { + throw new InvalidFormatException("Language detector model is incomplete!"); + } + } + + public LanguageDetectorFactory getFactory() { + return (LanguageDetectorFactory) this.toolFactory; + } + + @Override + protected Class<? 
extends BaseToolFactory> getDefaultFactory() { + return LanguageDetectorFactory.class; + } + + public MaxentModel getMaxentModel() { + return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java new file mode 100644 index 0000000..2c30044 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Objects; + +/** + * Class which holds a classified document and its {@link Language}. 
+ */ +public class LanguageDetectorSample { + + private final Language language; + private final CharSequence context; + + public LanguageDetectorSample(Language language, CharSequence context) { + Objects.requireNonNull(context, "context must not be null"); + Objects.requireNonNull(language, "language must not be null"); + this.language = language; + this.context = context; + } + + public Language getLanguage() { + return language; + } + + public CharSequence getContext() { + return context; + } + + @Override + public String toString() { + + StringBuilder sampleString = new StringBuilder(); + + sampleString.append(language.getLang()).append('\t').append(context); + + return sampleString.toString(); + } + + @Override + public int hashCode() { + return Objects.hash(getContext(), getLanguage()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof LanguageDetectorSample) { + LanguageDetectorSample a = (LanguageDetectorSample) obj; + + return getLanguage().equals(a.getLanguage()) + && getContext().equals(a.getContext()); + } + + return false; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java new file mode 100644 index 0000000..b8be3df --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.io.IOException; + +import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.ObjectStream; + +/** + * This class reads in string encoded training samples, parses them and + * outputs {@link LanguageDetectorSample} objects. + * <p> + * Format:<br> + * Each line contains one sample document.<br> + * The language is the first string in the line followed by a tab and the document content.<br> + * Sample line: category-string tab-char document line-break-char(s)<br> + */ +public class LanguageDetectorSampleStream + extends FilterObjectStream<String, LanguageDetectorSample> { + + public LanguageDetectorSampleStream(ObjectStream<String> samples) { + super(samples); + } + + public LanguageDetectorSample read() throws IOException { + String sampleString = samples.read(); + + if (sampleString != null) { + + int tabIndex = sampleString.indexOf("\t"); + if (tabIndex > 0) { + String lang = sampleString.substring(0, tabIndex); + String context = sampleString.substring(tabIndex + 1); + + return new LanguageDetectorSample(new Language(lang), context); + } + } else { + throw new IOException("Empty lines, or lines with only a category string are not allowed!"); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java 
---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java new file mode 100644 index 0000000..21efd1b --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + + +public class DummyFactory extends LanguageDetectorFactory { + + + public DummyFactory() { + + } + + @Override + public void init() { + super.init(); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java new file mode 100644 index 0000000..787dc1e --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageDetectorContextGeneratorTest { + + @Test + public void extractContext() throws Exception { + String doc = "abcde fghijk"; + + LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(); + + Collection<String> features = Arrays.asList(cg.getContext(doc)); + + Assert.assertEquals(38, features.size()); + Assert.assertTrue(features.contains("ng=ab")); + Assert.assertTrue(features.contains("ng=abc")); + Assert.assertTrue(features.contains("ng=abcd")); + Assert.assertTrue(features.contains("ng=abcde")); + Assert.assertTrue(features.contains("ng=abcde")); + + Assert.assertTrue(features.contains("ng= f")); + Assert.assertTrue(features.contains("ng= fg")); + Assert.assertTrue(features.contains("ng= fgh")); + Assert.assertTrue(features.contains("ng= fghi")); + + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java new file mode 100644 index 0000000..45cec76 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + + +import java.io.File; +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; + +public class LanguageDetectorFactoryTest { + + + private LanguageDetectorModel model; + + @Before + public void train() throws Exception { + + ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory( + LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"); + + PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8"); + + LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "0"); + + this.model = LanguageDetectorME.train(sampleStream, params, new DummyFactory()); + } + + @Test + public void testCorrectFactory() throws IOException { + File tempFile = LanguageDetectorMETest.serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile); + + Assert.assertTrue(myModel.getFactory() instanceof DummyFactory); + + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java ---------------------------------------------------------------------- diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java new file mode 100644 index 0000000..1e232a2 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.langdetect; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; + + +public class LanguageDetectorMETest { + + private LanguageDetectorModel model; + + @Before + public void train() throws Exception { + + ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory( + LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"); + + PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8"); + + LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, "100"); + params.put(TrainingParameters.CUTOFF_PARAM, "0"); + + this.model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory()); + } + + @Test + public void testPredictLanguages() { + LanguageDetector ld = new LanguageDetectorME(this.model); + Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno"); + + Assert.assertEquals(4, languages.length); + Assert.assertEquals("pob", languages[0].getLang()); + Assert.assertEquals("ita", languages[1].getLang()); + Assert.assertEquals("spa", languages[2].getLang()); + Assert.assertEquals("fra", languages[3].getLang()); + } + + @Test + public void testPredictLanguage() { + LanguageDetector ld = new LanguageDetectorME(this.model); + Language language = ld.predictLanguage("se lever mais il n ' a pas insisté"); + + Assert.assertEquals("fra", language.getLang()); + } + + @Test + public void testSupportedLanguages() { + + LanguageDetector ld = new LanguageDetectorME(this.model); + String[] supportedLanguages = 
ld.getSupportedLanguages(); + + Assert.assertEquals(4, supportedLanguages.length); + } + + @Test + public void testLoadFromFile() throws IOException { + File tempFile = serializeModel(model); + + Assert.assertTrue(tempFile.exists()); + + LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile); + + Assert.assertNotNull(myModel); + + } + + @Test + public void testLoadFromURL() throws IOException { + File tempFile = serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile.toURI().toURL()); + + Assert.assertNotNull(myModel); + + } + + @Test + public void testLoadFromStream() throws IOException { + File tempFile = serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(new FileInputStream(tempFile)); + + Assert.assertNotNull(myModel); + + } + + @Test + public void testCorrectFactory() throws IOException { + File tempFile = serializeModel(model); + + LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile); + + Assert.assertTrue(myModel.getFactory() instanceof LanguageDetectorFactory); + + } + + protected static File serializeModel(LanguageDetectorModel model) throws IOException { + File tempFile = File.createTempFile("langdetect", "model"); + + FileOutputStream fos = new FileOutputStream(tempFile); + + model.serialize(fos); + + return tempFile; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java new file mode 100644 index 0000000..5e52b24 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageDetectorSampleTest { + + @Test + public void testConstructor() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageDetectorSample sample = new LanguageDetectorSample(lang, context); + + Assert.assertEquals(lang, sample.getLanguage()); + Assert.assertEquals(context, sample.getContext()); + } + + @Test(expected = NullPointerException.class) + public void testNullLang() throws Exception { + CharSequence context = "aContext"; + + new LanguageDetectorSample(null, context); + } + + @Test(expected = NullPointerException.class) + public void testNullContext() { + Language lang = new Language("aLang"); + + new LanguageDetectorSample(lang, null); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageDetectorSample sample = new LanguageDetectorSample(lang, context); + + Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString()); + } + + @Test + public void testHash() { + + int hashA = new LanguageDetectorSample(new Language("aLang"), "aContext").hashCode(); + int hashB = new LanguageDetectorSample(new 
Language("bLang"), "aContext").hashCode(); + int hashC = new LanguageDetectorSample(new Language("aLang"), "bContext").hashCode(); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashC); + Assert.assertNotEquals(hashB, hashC); + } + + @Test + public void testEquals() throws Exception { + + LanguageDetectorSample sampleA = new LanguageDetectorSample(new Language("aLang"), "aContext"); + LanguageDetectorSample sampleA1 = new LanguageDetectorSample(new Language("aLang"), "aContext"); + LanguageDetectorSample sampleB = new LanguageDetectorSample(new Language("bLang"), "aContext"); + LanguageDetectorSample sampleC = new LanguageDetectorSample(new Language("aLang"), "bContext"); + + Assert.assertEquals(sampleA, sampleA); + Assert.assertEquals(sampleA, sampleA1); + Assert.assertNotEquals(sampleA, sampleB); + Assert.assertNotEquals(sampleA, sampleC); + Assert.assertNotEquals(sampleB, sampleC); + Assert.assertFalse(sampleA.equals("something else")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java new file mode 100644 index 0000000..dd373a9 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageTest { + + + @Test + public void emptyConfidence() throws Exception { + String languageCode = "aLanguage"; + Language lang = new Language(languageCode); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(0, lang.getConfidence(), 0); + } + + @Test + public void nonEmptyConfidence() throws Exception { + String languageCode = "aLanguage"; + double confidence = 0.05; + Language lang = new Language(languageCode, confidence); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(confidence, lang.getConfidence(), 0); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguage() throws Exception { + new Language(null); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguageConfidence() throws Exception { + new Language(null, 0.05); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + + Assert.assertEquals(lang.getLang(), lang.toString()); + } + + + @Test + public void testHash() { + int hashA = new Language("aLang").hashCode(); + int hashAA = new Language("aLang").hashCode(); + int hashB = new Language("BLang").hashCode(); + int hashA5 = new Language("aLang", 5.0).hashCode(); + int hashA6 = new Language("BLang", 6.0).hashCode(); + + Assert.assertEquals(hashA, hashAA); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashA5); + Assert.assertNotEquals(hashB, hashA5); + 
Assert.assertNotEquals(hashA5, hashA6); + } + + @Test + public void testEquals() { + Language langA = new Language("langA"); + Language langB = new Language("langB"); + Language langA5 = new Language("langA5", 5.0); + Language langA6 = new Language("langA5", 6.0); + + Assert.assertEquals(langA, langA); + Assert.assertEquals(langA5, langA5); + + Assert.assertNotEquals(langA, langA5); + Assert.assertNotEquals(langA, langB); + + Assert.assertNotEquals(langA6, langA5); + + Assert.assertNotEquals(langA, "something else"); + } +}
