Repository: opennlp Updated Branches: refs/heads/904 [created] 060a4d0f8
OPENNLP-904 insert decoding in statistical lemmatize method Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/060a4d0f Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/060a4d0f Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/060a4d0f Branch: refs/heads/904 Commit: 060a4d0f81bfc2d2f2d3e8d80284d64dcbf9d2a7 Parents: 46fbcbf Author: Rodrigo Agerri <[email protected]> Authored: Fri Feb 3 16:00:38 2017 +0100 Committer: Rodrigo Agerri <[email protected]> Committed: Fri Feb 3 16:00:38 2017 +0100 ---------------------------------------------------------------------- .../tools/lemmatizer/DictionaryLemmatizer.java | 68 ++++++++++++++++---- .../opennlp/tools/lemmatizer/Lemmatizer.java | 16 ++++- .../opennlp/tools/lemmatizer/LemmatizerME.java | 21 +++++- .../tools/lemmatizer/DummyLemmatizer.java | 6 ++ 4 files changed, 95 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java index b1b04a1..260f98a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java @@ -37,7 +37,7 @@ public class DictionaryLemmatizer implements Lemmatizer { /** * The hashmap containing the dictionary. */ - private final Map<List<String>, String> dictMap; + private final Map<List<String>, List<String>> dictMap; /** * Construct a hashmap from the input tab separated dictionary. @@ -47,26 +47,34 @@ public class DictionaryLemmatizer implements Lemmatizer { * @param dictionary * the input dictionary via inputstream */ + // To have duplicate keys we need to have a collection for values too, + // this way: + // 1. We could get every lemma for a word,pos pair in the key + // 2. We could get every pos,lemma for a word in the key + // Crucially, both keys and values need to be collections, probably lists public DictionaryLemmatizer(final InputStream dictionary) { this.dictMap = new HashMap<>(); - final BufferedReader breader = new BufferedReader(new InputStreamReader(dictionary)); + final BufferedReader breader = new BufferedReader( + new InputStreamReader(dictionary)); String line; try { while ((line = breader.readLine()) != null) { final String[] elems = line.split("\t"); - this.dictMap.put(Arrays.asList(elems[0], elems[1]), elems[2]); + this.dictMap.put(Arrays.asList(elems[0], elems[1]), + Arrays.asList(elems[2])); } } catch (final IOException e) { e.printStackTrace(); } } + /** * Get the Map containing the dictionary. * * @return dictMap the Map */ - public Map<List<String>, String> getDictMap() { + public Map<List<String>, List<String>> getDictMap() { return this.dictMap; } @@ -85,31 +93,67 @@ public class DictionaryLemmatizer implements Lemmatizer { return keys; } + public String[] lemmatize(final String[] tokens, final String[] postags) { List<String> lemmas = new ArrayList<>(); for (int i = 0; i < tokens.length; i++) { - lemmas.add(this.apply(tokens[i], postags[i])); + lemmas.add(this.lemmatize(tokens[i], postags[i])); } return lemmas.toArray(new String[lemmas.size()]); } + public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) { + List<List<String>> allLemmas = new ArrayList<List<String>>(); + for (int i = 0; i < tokens.size(); i++) { + allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i))); + } + return allLemmas; + } + /** * Lookup lemma in a dictionary. Outputs "O" if not found. - * @param word the token - * @param postag the postag + * + * @param word + * the token + * @param postag + * the postag * @return the lemma */ - public String apply(final String word, final String postag) { + private String lemmatize(final String word, final String postag) { String lemma; final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map - final String keyValue = this.dictMap.get(keys); - if (keyValue != null) { - lemma = keyValue; + final List<String> keyValues = this.dictMap.get(keys); + if (!keyValues.isEmpty()) { + lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; } -} + /** + * Lookup every lemma for a word,pos tag in a dictionary. Outputs "O" if not + * found. + * + * @param word + * the token + * @param postag + * the postag + * @return every lemma + */ + private List<String> getAllLemmas(final String word, final String postag) { + List<String> lemmasList = new ArrayList<>(); + final List<String> keys = this.getDictKeys(word, postag); + // lookup lemma as value of the map + final List<String> keyValues = this.dictMap.get(keys); + if (!keyValues.isEmpty()) { + for (String keyValue : keyValues) { + lemmasList.add(keyValue); + } + } else { + lemmasList.add("O"); + } + return lemmasList; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java index ddcaa6a..f5cf688 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java @@ -17,19 +17,31 @@ package opennlp.tools.lemmatizer; +import java.util.List; + /** * The interface for lemmatizers. */ public interface Lemmatizer { /** - * Generates lemma tags for the word and postag returning the result in an array. + * Generates lemmas for the word and postag returning the result in an array. * * @param toks an array of the tokens * @param tags an array of the pos tags * - * @return an array of lemma classes for each token in the sequence. + * @return an array of possible lemmas for each token in the sequence. */ String[] lemmatize(String[] toks, String tags[]); + /** + * Generates a lemma tags for the word and postag returning the result in a list + * of every possible lemma for each token and postag. + * + * @param toks an array of the tokens + * @param tags an array of the pos tags + * @return a list of every possible lemma for each token in the sequence. + */ + List<List<String>> lemmatize(List<String> toks, List<String> tags); + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java index 98a19f5..04f5415 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java @@ -86,9 +86,26 @@ public class LemmatizerME implements Lemmatizer { } public String[] lemmatize(String[] toks, String[] tags) { + String[] ses = predictSES(toks, tags); + String[] lemmas = decodeLemmas(toks, ses); + return lemmas; + } + + @Override public List<List<String>> lemmatize(List<String> toks, + List<String> tags) { + return null; + } + + /** + * Predict Short Edit Script (automatically induced lemma class). + * @param toks the array of tokens + * @param tags the array of pos tags + * @return an array containing the lemma classes + */ + public String[] predictSES(String[] toks, String[] tags) { bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator); - List<String> c = bestSequence.getOutcomes(); - return c.toArray(new String[c.size()]); + List<String> ses = bestSequence.getOutcomes(); + return ses.toArray(new String[ses.size()]); } /** http://git-wip-us.apache.org/repos/asf/opennlp/blob/060a4d0f/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java index 489ba38..9ce2822 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java @@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer; import java.io.IOException; import java.util.Arrays; +import java.util.List; /** * This dummy lemmatizer implementation simulates a LemmatizerME. The file has @@ -56,4 +57,9 @@ public class DummyLemmatizer implements Lemmatizer { } } + @Override public List<List<String>> lemmatize(List<String> toks, + List<String> tags) { + return null; + } + }
