Repository: opennlp
Updated Branches:
  refs/heads/OPENNLP-778 f0dcf22d4 -> 3d7a20708 (forced update)


OPENNLP-778: Add LanguageDetector infrastructure classes


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3d7a2070
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3d7a2070
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3d7a2070

Branch: refs/heads/OPENNLP-778
Commit: 3d7a207087db4503d5111b67604603f13956dd8c
Parents: 11d7581
Author: William D C M SILVA <[email protected]>
Authored: Mon Mar 13 13:48:33 2017 -0300
Committer: William D C M SILVA <[email protected]>
Committed: Tue Mar 14 13:34:19 2017 -0300

----------------------------------------------------------------------
 .../java/opennlp/tools/langdetect/Language.java |  34 +++++
 .../tools/langdetect/LanguageDetector.java      |  10 +-
 .../LanguageDetectorContextGenerator.java       |  63 +++++++++
 .../langdetect/LanguageDetectorEventStream.java |  69 ++++++++++
 .../langdetect/LanguageDetectorFactory.java     |  53 ++++++++
 .../tools/langdetect/LanguageDetectorME.java    |  99 ++++++++++++++
 .../tools/langdetect/LanguageDetectorModel.java |  82 ++++++++++++
 .../langdetect/LanguageDetectorSample.java      |  75 +++++++++++
 .../LanguageDetectorSampleStream.java           |  58 ++++++++
 .../opennlp/tools/langdetect/DummyFactory.java  |  33 +++++
 .../LanguageDetectorContextGeneratorTest.java   |  50 +++++++
 .../langdetect/LanguageDetectorFactoryTest.java |  64 +++++++++
 .../langdetect/LanguageDetectorMETest.java      | 134 +++++++++++++++++++
 .../langdetect/LanguageDetectorSampleTest.java  |  89 ++++++++++++
 .../opennlp/tools/langdetect/LanguageTest.java  |  97 ++++++++++++++
 15 files changed, 1004 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
index 773201f..57655b4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -17,6 +17,8 @@
 
 package opennlp.tools.langdetect;
 
+import java.util.Objects;
+
 /**
  * Class for holding the document language and its confidence
  */
@@ -24,7 +26,12 @@ public class Language {
   private final String lang;
   private final double confidence;
 
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
   public Language(String lang, double confidence) {
+    Objects.requireNonNull(lang, "lang must not be null");
     this.lang = lang;
     this.confidence = confidence;
   }
@@ -36,4 +43,31 @@ public class Language {
   public double getConfidence() {
     return confidence;
   }
+
+  @Override
+  public String toString() {
+
+    return getLang();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getLang(), getConfidence());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Language) {
+      Language a = (Language) obj;
+
+      return getLang().equals(a.getLang())
+          && getConfidence() == a.getConfidence();
+    }
+
+    return false;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
index ca897fd..0004494 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -17,17 +17,15 @@
 
 package opennlp.tools.langdetect;
 
-import java.util.Set;
-
 /**
- * The interface for name finders which provide name tags for a sequence of 
tokens.
+ * The interface for a LanguageDetector, which provides the {@link Language} 
according to the context.
  */
 public interface LanguageDetector {
 
-  Language[] detectLanguage(CharSequence content);
+  Language[] predictLanguages(CharSequence content);
 
-  Set<String> getSupportedLanguages();
+  Language predictLanguage(CharSequence content);
 
-  String getLanguageCoding();
+  String[] getSupportedLanguages();
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..b3caeea
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Context generator for the language detector
+ */
+class LanguageDetectorContextGenerator {
+
+  private final int minLength;
+  private final int maxLength;
+
+  LanguageDetectorContextGenerator(int minLength, int maxLength) {
+    this.minLength = minLength;
+    this.maxLength = maxLength;
+  }
+
+  /**
+   * Initializes the current instance with a minimum n-gram length of 2 and a 
maximum n-gram length of 5.
+   */
+  LanguageDetectorContextGenerator() {
+    this(2, 5);
+  }
+
+  public String[] getContext(String document) {
+
+    Collection<String> context = new LinkedList<>();
+
+    NGramModel model = new NGramModel();
+    model.add(document, minLength, maxLength);
+
+    for (StringList tokenList : model) {
+      if (tokenList.size() > 0) {
+        context.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0)));
+      }
+    }
+
+
+    return context.toArray(new String[context.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..cfe5f7c
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends 
AbstractEventStream<LanguageDetectorSample> {
+
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with the samples; the default context generator is used.
+   *
+   * @param data {@link ObjectStream} of {@link LanguageDetectorSample}s
+   */
+  public LanguageDetectorEventStream(ObjectStream<LanguageDetectorSample> 
data) {
+    super(data);
+
+    mContextGenerator =
+        new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  protected Iterator<Event> createEvents(final LanguageDetectorSample sample) {
+
+    return new Iterator<Event>() {
+
+      private boolean isVirgin = true;
+
+      public boolean hasNext() {
+        return isVirgin;
+      }
+
+      public Event next() {
+
+        isVirgin = false;
+
+        return new Event(sample.getLanguage().getLang(),
+            mContextGenerator.getContext(sample.getContext().toString()));
+      }
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..5cebbba
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LanguageDetectorFactory();
+    }
+    try {
+      LanguageDetectorFactory theFactory = 
ExtensionLoader.instantiateExtension(
+          LanguageDetectorFactory.class, subclassName);
+      theFactory.init();
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  public void init() {
+    // nothing to do
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..29c7f15
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Implements learnable Language Detector
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+  private LanguageDetectorModel model;
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with a language detector model. Default 
feature
+   * generation is used.
+   *
+   * @param model the language detector model
+   */
+  public LanguageDetectorME(LanguageDetectorModel model) {
+    this.model = model;
+    this.mContextGenerator = new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    double[] eval = 
model.getMaxentModel().eval(mContextGenerator.getContext(content.toString()));
+    Language[] arr = new Language[eval.length];
+    for (int i = 0; i < eval.length; i++) {
+      arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]);
+    }
+
+    Arrays.sort(arr, new Comparator<Language>() {
+      @Override
+      public int compare(Language o1, Language o2) {
+        return Double.compare(o2.getConfidence(), o1.getConfidence());
+      }
+    });
+    return arr;
+  }
+
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    return predictLanguages(content)[0];
+  }
+
+  @Override
+  public String[] getSupportedLanguages() {
+    int numberLanguages = model.getMaxentModel().getNumOutcomes();
+    String[] languages = new String[numberLanguages];
+    for (int i = 0; i < numberLanguages; i++) {
+      languages[i] = model.getMaxentModel().getOutcome(i);
+    }
+    return languages;
+  }
+
+
+  public static LanguageDetectorModel 
train(ObjectStream<LanguageDetectorSample> samples,
+                                            TrainingParameters mlParams,
+                                            LanguageDetectorFactory factory)
+      throws IOException {
+
+    Map<String, String> manifestInfoEntries = new HashMap<>();
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams, manifestInfoEntries);
+
+    MaxentModel model = trainer.train(
+        new LanguageDetectorEventStream(samples));
+
+    return new LanguageDetectorModel(model, manifestInfoEntries, factory);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof 
AbstractModel)) {
+      throw new InvalidFormatException("Language detector model is 
incomplete!");
+    }
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
new file mode 100644
index 0000000..2c30044
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageDetectorSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageDetectorSample(Language language, CharSequence context) {
+    Objects.requireNonNull(context, "context must not be null");
+    Objects.requireNonNull(language, "language must not be null");
+    this.language = language;
+    this.context = context;
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+
+    StringBuilder sampleString = new StringBuilder();
+
+    sampleString.append(language.getLang()).append('\t').append(context);
+
+    return sampleString.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getContext(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof LanguageDetectorSample) {
+      LanguageDetectorSample a = (LanguageDetectorSample) obj;
+
+      return getLanguage().equals(a.getLanguage())
+          && getContext().equals(a.getContext());
+    }
+
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..b8be3df
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * This class reads in string encoded training samples, parses them and
+ * outputs {@link LanguageDetectorSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line contains one sample document.<br>
+ * The language is the first string in the line followed by a tab and the 
document content.<br>
+ * Sample line: language-string tab-char document line-break-char(s)<br>
+ */
+public class LanguageDetectorSampleStream
+    extends FilterObjectStream<String, LanguageDetectorSample> {
+
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  public LanguageDetectorSample read() throws IOException {
+    String sampleString = samples.read();
+
+    if (sampleString != null) {
+
+      int tabIndex = sampleString.indexOf("\t");
+      if (tabIndex > 0) {
+        String lang = sampleString.substring(0, tabIndex);
+        String context = sampleString.substring(tabIndex + 1);
+
+        return new LanguageDetectorSample(new Language(lang), context);
+      }
+    } else {
+      throw new IOException("Empty lines, or lines with only a category string 
are not allowed!");
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..21efd1b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+public class DummyFactory extends LanguageDetectorFactory {
+
+
+  public DummyFactory() {
+
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..787dc1e
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorContextGeneratorTest {
+
+  @Test
+  public void extractContext() throws Exception {
+    String doc = "abcde fghijk";
+
+    LanguageDetectorContextGenerator cg = new 
LanguageDetectorContextGenerator();
+
+    Collection<String> features = Arrays.asList(cg.getContext(doc));
+
+    Assert.assertEquals(38, features.size());
+    Assert.assertTrue(features.contains("ng=ab"));
+    Assert.assertTrue(features.contains("ng=abc"));
+    Assert.assertTrue(features.contains("ng=abcd"));
+    Assert.assertTrue(features.contains("ng=abcde"));
+    Assert.assertTrue(features.contains("ng=abcde"));
+
+    Assert.assertTrue(features.contains("ng= f"));
+    Assert.assertTrue(features.contains("ng= fg"));
+    Assert.assertTrue(features.contains("ng= fgh"));
+    Assert.assertTrue(features.contains("ng= fghi"));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..45cec76
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+public class LanguageDetectorFactoryTest {
+
+
+  private LanguageDetectorModel model;
+
+  @Before
+  public void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, 
"/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new 
PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new 
LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    this.model = LanguageDetectorME.train(sampleStream, params, new 
DummyFactory());
+  }
+
+  @Test
+  public void testCorrectFactory() throws IOException {
+    File tempFile = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile);
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..1e232a2
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+/**
+ * Tests for {@link LanguageDetectorME}: prediction, supported languages, and
+ * loading a serialized {@link LanguageDetectorModel} from a file, URL and stream.
+ */
+public class LanguageDetectorMETest {
+
+  private LanguageDetectorModel model;
+
+  /**
+   * Trains the model shared by the tests from the doccat sample resource,
+   * using the default {@link LanguageDetectorFactory}.
+   *
+   * @throws Exception propagated from stream construction or training
+   */
+  @Before
+  public void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    this.model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
+  }
+
+  /**
+   * Expects four predictions for a Portuguese sentence, in the order
+   * pob, ita, spa, fra.
+   */
+  @Test
+  public void testPredictLanguages() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
+
+    Assert.assertEquals(4, languages.length);
+    Assert.assertEquals("pob", languages[0].getLang());
+    Assert.assertEquals("ita", languages[1].getLang());
+    Assert.assertEquals("spa", languages[2].getLang());
+    Assert.assertEquals("fra", languages[3].getLang());
+  }
+
+  /** Expects the single best prediction for a French sentence to be "fra". */
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language language = ld.predictLanguage("se lever mais il n ' a pas insisté");
+
+    Assert.assertEquals("fra", language.getLang());
+  }
+
+  /** Expects the trained model to report four supported languages. */
+  @Test
+  public void testSupportedLanguages() {
+
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    String[] supportedLanguages = ld.getSupportedLanguages();
+
+    Assert.assertEquals(4, supportedLanguages.length);
+  }
+
+  /**
+   * Loads the serialized model through the {@link File} constructor.
+   *
+   * @throws IOException if writing or reading the temporary model file fails
+   */
+  @Test
+  public void testLoadFromFile() throws IOException {
+    File tempFile = serializeModel(model);
+
+    Assert.assertTrue(tempFile.exists());
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile);
+
+    Assert.assertNotNull(myModel);
+
+  }
+
+  /**
+   * Loads the serialized model through the URL constructor.
+   *
+   * @throws IOException if writing or reading the temporary model file fails
+   */
+  @Test
+  public void testLoadFromURL() throws IOException {
+    File tempFile = serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile.toURI().toURL());
+
+    Assert.assertNotNull(myModel);
+
+  }
+
+  /**
+   * Loads the serialized model through the input stream constructor.
+   *
+   * @throws IOException if writing or reading the temporary model file fails
+   */
+  @Test
+  public void testLoadFromStream() throws IOException {
+    File tempFile = serializeModel(model);
+
+    // The test owns this stream, so it releases the file handle itself
+    // instead of leaving it open until garbage collection.
+    try (FileInputStream in = new FileInputStream(tempFile)) {
+      LanguageDetectorModel myModel = new LanguageDetectorModel(in);
+
+      Assert.assertNotNull(myModel);
+    }
+
+  }
+
+  /**
+   * Asserts that a model trained with the default factory still reports a
+   * {@link LanguageDetectorFactory} after a serialize/load round trip.
+   *
+   * @throws IOException if writing or reading the temporary model file fails
+   */
+  @Test
+  public void testCorrectFactory() throws IOException {
+    File tempFile = serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(tempFile);
+
+    Assert.assertTrue(myModel.getFactory() instanceof LanguageDetectorFactory);
+
+  }
+
+  /**
+   * Writes the given model to a newly created temporary file.
+   *
+   * <p>The file is registered for deletion when the JVM exits, so callers do
+   * not need to clean it up.
+   *
+   * @param model the model to serialize
+   * @return the temporary file containing the serialized model
+   * @throws IOException if the file cannot be created or written
+   */
+  protected static File serializeModel(LanguageDetectorModel model) throws IOException {
+    File tempFile = File.createTempFile("langdetect", "model");
+    tempFile.deleteOnExit();
+
+    // Close the stream here rather than relying on serialize() to do it;
+    // closing an already-closed FileOutputStream has no effect.
+    try (FileOutputStream fos = new FileOutputStream(tempFile)) {
+      model.serialize(fos);
+    }
+
+    return tempFile;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java
new file mode 100644
index 0000000..5e52b24
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Unit tests for {@link LanguageDetectorSample}: construction, null
+ * arguments, string form, hash codes and equality.
+ */
+public class LanguageDetectorSampleTest {
+
+  /** The getters must return the language and context given to the constructor. */
+  @Test
+  public void testConstructor() {
+    CharSequence text = "aContext";
+    Language language = new Language("aLang");
+
+    LanguageDetectorSample sample = new LanguageDetectorSample(language, text);
+
+    Assert.assertEquals(language, sample.getLanguage());
+    Assert.assertEquals(text, sample.getContext());
+  }
+
+  /** A null language must be rejected with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence text = "aContext";
+
+    new LanguageDetectorSample(null, text);
+  }
+
+  /** A null context must be rejected with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    Language language = new Language("aLang");
+
+    new LanguageDetectorSample(language, null);
+  }
+
+  /** The string form is the language code, a tab, then the context. */
+  @Test
+  public void testToString() {
+    CharSequence text = "aContext";
+    Language language = new Language("aLang");
+
+    LanguageDetectorSample sample = new LanguageDetectorSample(language, text);
+    String expected = language.getLang() + "\t" + text;
+
+    Assert.assertEquals(expected, sample.toString());
+  }
+
+  /** Samples differing in language or in context must produce different hash codes. */
+  @Test
+  public void testHash() {
+
+    int baseHash = new LanguageDetectorSample(new Language("aLang"), "aContext").hashCode();
+    int otherLangHash = new LanguageDetectorSample(new Language("bLang"), "aContext").hashCode();
+    int otherTextHash = new LanguageDetectorSample(new Language("aLang"), "bContext").hashCode();
+
+    Assert.assertNotEquals(baseHash, otherLangHash);
+    Assert.assertNotEquals(baseHash, otherTextHash);
+    Assert.assertNotEquals(otherLangHash, otherTextHash);
+  }
+
+  /**
+   * Equality holds for the same instance and for a sample built from the same
+   * values; it fails when language or context differs, or for another type.
+   */
+  @Test
+  public void testEquals() throws Exception {
+
+    LanguageDetectorSample base = new LanguageDetectorSample(new Language("aLang"), "aContext");
+    LanguageDetectorSample twin = new LanguageDetectorSample(new Language("aLang"), "aContext");
+    LanguageDetectorSample otherLang = new LanguageDetectorSample(new Language("bLang"), "aContext");
+    LanguageDetectorSample otherText = new LanguageDetectorSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(base, base);
+    Assert.assertEquals(base, twin);
+    Assert.assertNotEquals(base, otherLang);
+    Assert.assertNotEquals(base, otherText);
+    Assert.assertNotEquals(otherLang, otherText);
+    Assert.assertFalse(base.equals("something else"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3d7a2070/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dd373a9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Unit tests for {@link Language}: constructors, null handling, string form,
+ * hash codes and equality.
+ */
+public class LanguageTest {
+
+  /** A language built from a code alone reports that code and a confidence of 0. */
+  @Test
+  public void emptyConfidence() throws Exception {
+    String code = "aLanguage";
+    Language language = new Language(code);
+
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(0, language.getConfidence(), 0);
+  }
+
+  /** A language built with a confidence reports both the code and that confidence. */
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    double score = 0.05;
+    String code = "aLanguage";
+    Language language = new Language(code, score);
+
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(score, language.getConfidence(), 0);
+  }
+
+  /** A null code must be rejected by the single-argument constructor. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  /** A null code must be rejected by the two-argument constructor. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  /** The string form equals the language code. */
+  @Test
+  public void testToString() {
+    Language language = new Language("aLang");
+    String code = language.getLang();
+
+    Assert.assertEquals(code, language.toString());
+  }
+
+
+  /**
+   * Equal inputs must hash alike; a different code or a different confidence
+   * must change the hash.
+   */
+  @Test
+  public void testHash() {
+    int plainHash = new Language("aLang").hashCode();
+    int plainHashAgain = new Language("aLang").hashCode();
+    int otherCodeHash = new Language("BLang").hashCode();
+    int scoredHash = new Language("aLang", 5.0).hashCode();
+    int otherCodeScoredHash = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(plainHash, plainHashAgain);
+
+    Assert.assertNotEquals(plainHash, otherCodeHash);
+    Assert.assertNotEquals(plainHash, scoredHash);
+    Assert.assertNotEquals(otherCodeHash, scoredHash);
+    Assert.assertNotEquals(scoredHash, otherCodeScoredHash);
+  }
+
+  /**
+   * An instance equals itself; instances differing in code or in confidence
+   * are not equal, and a language never equals an object of another type.
+   */
+  @Test
+  public void testEquals() {
+    Language plain = new Language("langA");
+    Language otherPlain = new Language("langB");
+    Language scoredFive = new Language("langA5", 5.0);
+    Language scoredSix = new Language("langA5", 6.0);
+
+    Assert.assertEquals(plain, plain);
+    Assert.assertEquals(scoredFive, scoredFive);
+
+    Assert.assertNotEquals(plain, scoredFive);
+    Assert.assertNotEquals(plain, otherPlain);
+
+    Assert.assertNotEquals(scoredSix, scoredFive);
+
+    Assert.assertNotEquals(plain, "something else");
+  }
+}

Reply via email to