Repository: opennlp Updated Branches: refs/heads/OPENNLP-778 d9a9c7dbb -> 64949f179 (forced update)
OPENNLP-778: Add LanguageDetector infrastructure classes Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/64949f17 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/64949f17 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/64949f17 Branch: refs/heads/OPENNLP-778 Commit: 64949f1798ddf0adfceed772a5a49c1ad451167d Parents: 11d7581 Author: William D C M SILVA <[email protected]> Authored: Mon Mar 13 13:48:33 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Mon Mar 13 23:40:12 2017 -0300 ---------------------------------------------------------------------- .../java/opennlp/tools/langdetect/Language.java | 34 +++++++ .../tools/langdetect/LanguageDetector.java | 10 +- .../langdetect/LanguageDetectorFactory.java | 55 +++++++++++ .../tools/langdetect/LanguageDetectorME.java | 38 ++++++++ .../tools/langdetect/LanguageDetectorModel.java | 82 ++++++++++++++++ .../langdetect/LanguageDetectorSample.java | 75 +++++++++++++++ .../tools/langdetect/LanguageSampleTest.java | 88 ++++++++++++++++++ .../opennlp/tools/langdetect/LanguageTest.java | 98 ++++++++++++++++++++ 8 files changed, 474 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java index 773201f..57655b4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java @@ -17,6 +17,8 @@ package opennlp.tools.langdetect; +import java.util.Objects; + /** * Class for holding the document language and its confidence */ @@ -24,7 +26,12 @@ 
public class Language { private final String lang; private final double confidence; + public Language(String lang) { + this(lang, 0); + } + public Language(String lang, double confidence) { + Objects.requireNonNull(lang, "lang must not be null"); this.lang = lang; this.confidence = confidence; } @@ -36,4 +43,31 @@ public class Language { public double getConfidence() { return confidence; } + + @Override + public String toString() { + + return getLang(); + } + + @Override + public int hashCode() { + return Objects.hash(getLang(), getConfidence()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof Language) { + Language a = (Language) obj; + + return getLang().equals(a.getLang()) + && getConfidence() == a.getConfidence(); + } + + return false; + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java index ca897fd..5e9833a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java @@ -17,17 +17,15 @@ package opennlp.tools.langdetect; -import java.util.Set; - /** - * The interface for name finders which provide name tags for a sequence of tokens. + * The interface for LanguageDetector which provides the {@link Language} according to the context. 
*/ public interface LanguageDetector { - Language[] detectLanguage(CharSequence content); + Language[] predictLanguages(CharSequence content); - Set<String> getSupportedLanguages(); + Language predictLanguage(CharSequence content); - String getLanguageCoding(); + Language[] getSupportedLanguages(); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java new file mode 100644 index 0000000..a0fb84e --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.langdetect; + +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ext.ExtensionLoader; + + +public class LanguageDetectorFactory extends BaseToolFactory { + + public static LanguageDetectorFactory create(String subclassName) + throws InvalidFormatException { + if (subclassName == null) { + // will create the default factory + return new LanguageDetectorFactory(); + } + try { + LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension( + LanguageDetectorFactory.class, subclassName); + theFactory.init(); + return theFactory; + } catch (Exception e) { + String msg = "Could not instantiate the " + subclassName + + ". The initialization throw an exception."; + System.err.println(msg); + e.printStackTrace(); + throw new InvalidFormatException(msg, e); + } + } + + public void init() { + // nothing to do + } + + @Override + public void validateArtifactMap() throws InvalidFormatException { + // nothing to validate + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java new file mode 100644 index 0000000..c88ec33 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +/** + * Implements learnable Language Detector + */ +public class LanguageDetectorME implements LanguageDetector { + @Override + public Language[] predictLanguages(CharSequence content) { + return new Language[0]; + } + + @Override + public Language predictLanguage(CharSequence content) { + return null; + } + + @Override + public Language[] getSupportedLanguages() { + return new Language[0]; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java new file mode 100644 index 0000000..eb38847 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Map; + +import opennlp.tools.ml.model.AbstractModel; +import opennlp.tools.ml.model.MaxentModel; +import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.model.BaseModel; + +/** + * A model for language detection + */ +public class LanguageDetectorModel extends BaseModel { + + private static final String COMPONENT_NAME = "LanguageDetectorME"; + private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model"; + + public LanguageDetectorModel(String languageCode, MaxentModel langdetectModel, + Map<String, String> manifestInfoEntries, + LanguageDetectorFactory factory) { + super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory); + + artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel); + checkArtifactMap(); + } + + public LanguageDetectorModel(InputStream in) throws IOException { + super(COMPONENT_NAME, in); + } + + public LanguageDetectorModel(File modelFile) throws IOException { + super(COMPONENT_NAME, modelFile); + } + + public LanguageDetectorModel(URL modelURL) throws IOException { + super(COMPONENT_NAME, modelURL); + } + + @Override + protected void validateArtifactMap() throws InvalidFormatException { + super.validateArtifactMap(); + + if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) { + throw new InvalidFormatException("Language detector model is incomplete!"); + 
} + } + + public LanguageDetectorFactory getFactory() { + return (LanguageDetectorFactory) this.toolFactory; + } + + @Override + protected Class<? extends BaseToolFactory> getDefaultFactory() { + return LanguageDetectorFactory.class; + } + + public MaxentModel getMaxentModel() { + return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java new file mode 100644 index 0000000..2c30044 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import java.util.Objects; + +/** + * Class which holds a classified document and its {@link Language}. 
+ */ +public class LanguageDetectorSample { + + private final Language language; + private final CharSequence context; + + public LanguageDetectorSample(Language language, CharSequence context) { + Objects.requireNonNull(context, "context must not be null"); + Objects.requireNonNull(language, "language must not be null"); + this.language = language; + this.context = context; + } + + public Language getLanguage() { + return language; + } + + public CharSequence getContext() { + return context; + } + + @Override + public String toString() { + + StringBuilder sampleString = new StringBuilder(); + + sampleString.append(language.getLang()).append('\t').append(context); + + return sampleString.toString(); + } + + @Override + public int hashCode() { + return Objects.hash(getContext(), getLanguage()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj instanceof LanguageDetectorSample) { + LanguageDetectorSample a = (LanguageDetectorSample) obj; + + return getLanguage().equals(a.getLanguage()) + && getContext().equals(a.getContext()); + } + + return false; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java new file mode 100644 index 0000000..31a5727 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageSampleTest { + + @Test + public void testConstructor() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageDetectorSample sample = new LanguageDetectorSample(lang, context); + + Assert.assertEquals(lang, sample.getLanguage()); + Assert.assertEquals(context, sample.getContext()); + } + + @Test(expected = NullPointerException.class) + public void testNullLang() throws Exception { + CharSequence context = "aContext"; + + new LanguageDetectorSample(null, context); + } + + @Test(expected = NullPointerException.class) + public void testNullContext() { + Language lang = new Language("aLang"); + + new LanguageDetectorSample(lang, null); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + CharSequence context = "aContext"; + + LanguageDetectorSample sample = new LanguageDetectorSample(lang, context); + + Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString()); + } + + @Test + public void testHash() { + + int hashA = new LanguageDetectorSample(new Language("aLang"), "aContext").hashCode(); + int hashB = new LanguageDetectorSample(new Language("bLang"), "aContext").hashCode(); + int hashC = new LanguageDetectorSample(new Language("aLang"), "bContext").hashCode(); + + 
Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashC); + Assert.assertNotEquals(hashB, hashC); + } + + @Test + public void testEquals() throws Exception { + + LanguageDetectorSample sampleA = new LanguageDetectorSample(new Language("aLang"), "aContext"); + LanguageDetectorSample sampleA1 = new LanguageDetectorSample(new Language("aLang"), "aContext"); + LanguageDetectorSample sampleB = new LanguageDetectorSample(new Language("bLang"), "aContext"); + LanguageDetectorSample sampleC = new LanguageDetectorSample(new Language("aLang"), "bContext"); + + Assert.assertEquals(sampleA, sampleA); + Assert.assertEquals(sampleA, sampleA1); + Assert.assertNotEquals(sampleA, sampleB); + Assert.assertNotEquals(sampleA, sampleC); + Assert.assertNotEquals(sampleB, sampleC); + Assert.assertFalse(sampleA.equals("something else")); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java new file mode 100644 index 0000000..56c5b80 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.langdetect; + +import org.junit.Assert; +import org.junit.Test; + + +public class LanguageTest { + + + @Test + public void emptyConfidence() throws Exception { + String languageCode = "aLanguage"; + Language lang = new Language(languageCode); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(0, lang.getConfidence(), 0); + } + + @Test + public void nonEmptyConfidence() throws Exception { + String languageCode = "aLanguage"; + double confidence = 0.05; + Language lang = new Language(languageCode, confidence); + + Assert.assertEquals(languageCode, lang.getLang()); + Assert.assertEquals(confidence, lang.getConfidence(), 0); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguage() throws Exception { + new Language(null); + } + + @Test(expected = NullPointerException.class) + public void emptyLanguageConfidence() throws Exception { + new Language(null, 0.05); + } + + @Test + public void testToString() { + Language lang = new Language("aLang"); + + Assert.assertEquals(lang.getLang(), lang.toString()); + } + + + + @Test + public void testHash() { + int hashA = new Language("aLang").hashCode(); + int hashAA = new Language("aLang").hashCode(); + int hashB = new Language("BLang").hashCode(); + int hashA5 = new Language("aLang", 5.0).hashCode(); + int hashA6 = new Language("BLang", 6.0).hashCode(); + + Assert.assertEquals(hashA, hashAA); + + Assert.assertNotEquals(hashA, hashB); + Assert.assertNotEquals(hashA, hashA5); + Assert.assertNotEquals(hashB, hashA5); + 
Assert.assertNotEquals(hashA5, hashA6); + } + + @Test + public void testEquals() { + Language langA = new Language("langA"); + Language langB = new Language("langB"); + Language langA5 = new Language("langA5", 5.0); + Language langA6 = new Language("langA5", 6.0); + + Assert.assertEquals(langA, langA); + Assert.assertEquals(langA5, langA5); + + Assert.assertNotEquals(langA, langA5); + Assert.assertNotEquals(langA, langB); + + Assert.assertNotEquals(langA6, langA5); + + Assert.assertNotEquals(langA, "something else"); + } +}
