opennlp git commit: OPENNLP-904 insert decoding in statistical lemmatize method

ragerri Fri, 03 Feb 2017 07:01:52 -0800

Repository: opennlp
Updated Branches:
  refs/heads/904 [created] 060a4d0f8


OPENNLP-904 insert decoding in statistical lemmatize method


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/060a4d0f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/060a4d0f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/060a4d0f

Branch: refs/heads/904
Commit: 060a4d0f81bfc2d2f2d3e8d80284d64dcbf9d2a7
Parents: 46fbcbf
Author: Rodrigo Agerri <[email protected]>
Authored: Fri Feb 3 16:00:38 2017 +0100
Committer: Rodrigo Agerri <[email protected]>
Committed: Fri Feb 3 16:00:38 2017 +0100

----------------------------------------------------------------------
 .../tools/lemmatizer/DictionaryLemmatizer.java  | 68 ++++++++++++++++----
 .../opennlp/tools/lemmatizer/Lemmatizer.java    | 16 ++++-
 .../opennlp/tools/lemmatizer/LemmatizerME.java  | 21 +++++-
 .../tools/lemmatizer/DummyLemmatizer.java       |  6 ++
 4 files changed, 95 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index b1b04a1..260f98a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -37,7 +37,7 @@ public class DictionaryLemmatizer implements Lemmatizer {
   /**
    * The hashmap containing the dictionary.
    */
-  private final Map<List<String>, String> dictMap;
+  private final Map<List<String>, List<String>> dictMap;
 
   /**
    * Construct a hashmap from the input tab separated dictionary.
@@ -47,26 +47,34 @@ public class DictionaryLemmatizer implements Lemmatizer {
    * @param dictionary
    *          the input dictionary via inputstream
    */
+  // To have duplicate keys we need to have a collection for values too,
+  // this way:
+  // 1. We could get every lemma for a word,pos pair in the key
+  // 2. We could get every pos,lemma for a word in the key
+  // Crucially, both keys and values need to be collections, probably lists
   public DictionaryLemmatizer(final InputStream dictionary) {
     this.dictMap = new HashMap<>();
-    final BufferedReader breader = new BufferedReader(new InputStreamReader(dictionary));
+    final BufferedReader breader = new BufferedReader(
+        new InputStreamReader(dictionary));
     String line;
     try {
       while ((line = breader.readLine()) != null) {
         final String[] elems = line.split("\t");
-        this.dictMap.put(Arrays.asList(elems[0], elems[1]), elems[2]);
+        this.dictMap.put(Arrays.asList(elems[0], elems[1]),
+            Arrays.asList(elems[2]));
       }
     } catch (final IOException e) {
       e.printStackTrace();
     }
   }
 
+
   /**
    * Get the Map containing the dictionary.
    *
    * @return dictMap the Map
    */
-  public Map<List<String>, String> getDictMap() {
+  public Map<List<String>, List<String>> getDictMap() {
     return this.dictMap;
   }
 
@@ -85,31 +93,67 @@ public class DictionaryLemmatizer implements Lemmatizer {
     return keys;
   }
 
+
   public String[] lemmatize(final String[] tokens, final String[] postags) {
     List<String> lemmas = new ArrayList<>();
     for (int i = 0; i < tokens.length; i++) {
-      lemmas.add(this.apply(tokens[i], postags[i]));
+      lemmas.add(this.lemmatize(tokens[i], postags[i]));
     }
     return lemmas.toArray(new String[lemmas.size()]);
   }
 
+  public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) {
+    List<List<String>> allLemmas = new ArrayList<List<String>>();
+    for (int i = 0; i < tokens.size(); i++) {
+      allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i)));
+    }
+    return allLemmas;
+  }
+
   /**
    * Lookup lemma in a dictionary. Outputs "O" if not found.
-   * @param word the token
-   * @param postag the postag
+   *
+   * @param word
+   *          the token
+   * @param postag
+   *          the postag
    * @return the lemma
    */
-  public String apply(final String word, final String postag) {
+  private String lemmatize(final String word, final String postag) {
     String lemma;
     final List<String> keys = this.getDictKeys(word, postag);
     // lookup lemma as value of the map
-    final String keyValue = this.dictMap.get(keys);
-    if (keyValue != null) {
-      lemma = keyValue;
+    final List<String> keyValues = this.dictMap.get(keys);
+    if (!keyValues.isEmpty()) {
+      lemma = keyValues.get(0);
     } else {
       lemma = "O";
     }
     return lemma;
   }
-}
 
+  /**
+   * Lookup every lemma for a word,pos tag in a dictionary. Outputs "O" if not
+   * found.
+   *
+   * @param word
+   *          the token
+   * @param postag
+   *          the postag
+   * @return every lemma
+   */
+  private List<String> getAllLemmas(final String word, final String postag) {
+    List<String> lemmasList = new ArrayList<>();
+    final List<String> keys = this.getDictKeys(word, postag);
+    // lookup lemma as value of the map
+    final List<String> keyValues = this.dictMap.get(keys);
+    if (!keyValues.isEmpty()) {
+      for (String keyValue : keyValues) {
+        lemmasList.add(keyValue);
+      }
+    } else {
+      lemmasList.add("O");
+    }
+    return lemmasList;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
index ddcaa6a..f5cf688 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
@@ -17,19 +17,31 @@
 
 package opennlp.tools.lemmatizer;
 
+import java.util.List;
+
 /**
  * The interface for lemmatizers.
  */
 public interface Lemmatizer {
 
   /**
-   * Generates lemma tags for the word and postag returning the result in an array.
+   * Generates lemmas for the word and postag returning the result in an array.
    *
    * @param toks an array of the tokens
    * @param tags an array of the pos tags
    *
-   * @return an array of lemma classes for each token in the sequence.
+   * @return an array of possible lemmas for each token in the sequence.
    */
   String[] lemmatize(String[] toks, String tags[]);
 
+  /**
+   * Generates a lemma tags for the word and postag returning the result in a list
+   * of every possible lemma for each token and postag.
+   *
+   * @param toks an array of the tokens
+   * @param tags an array of the pos tags
+   * @return a list of every possible lemma for each token in the sequence.
+   */
+  List<List<String>> lemmatize(List<String> toks, List<String> tags);
+
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index 98a19f5..04f5415 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -86,9 +86,26 @@ public class LemmatizerME implements Lemmatizer {
   }
 
   public String[] lemmatize(String[] toks, String[] tags) {
+    String[] ses = predictSES(toks, tags);
+    String[] lemmas = decodeLemmas(toks, ses);
+    return lemmas;
+  }
+
+  @Override public List<List<String>> lemmatize(List<String> toks,
+      List<String> tags) {
+    return null;
+  }
+
+  /**
+   * Predict Short Edit Script (automatically induced lemma class).
+   * @param toks the array of tokens
+   * @param tags the array of pos tags
+   * @return an array containing the lemma classes
+   */
+  public String[] predictSES(String[] toks, String[] tags) {
     bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator);
-    List<String> c = bestSequence.getOutcomes();
-    return c.toArray(new String[c.size()]);
+    List<String> ses = bestSequence.getOutcomes();
+    return ses.toArray(new String[ses.size()]);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
index 489ba38..9ce2822 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java
@@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
 
 /**
  * This dummy lemmatizer implementation simulates a LemmatizerME. The file has
@@ -56,4 +57,9 @@ public class DummyLemmatizer implements Lemmatizer {
     }
   }
 
+  @Override public List<List<String>> lemmatize(List<String> toks,
+      List<String> tags) {
+    return null;
+  }
+
 }

opennlp git commit: OPENNLP-904 insert decoding in statistical lemmatize method

Reply via email to