Repository: opennlp Updated Branches: refs/heads/904 f65d4d3ef -> 0e7c49aeb
OPENNLP-904 add function to LemmatizerME to get every lemma for a token and pos tag combination Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/53cd0ddf Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/53cd0ddf Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/53cd0ddf Branch: refs/heads/904 Commit: 53cd0ddf5e3d38247ecce6585a3fc53bea74463a Parents: f65d4d3 Author: Rodrigo Agerri <[email protected]> Authored: Mon Feb 6 23:42:29 2017 +0100 Committer: Rodrigo Agerri <[email protected]> Committed: Mon Feb 6 23:42:29 2017 +0100 ---------------------------------------------------------------------- .../cmdline/lemmatizer/LemmatizerMETool.java | 4 +-- .../tools/lemmatizer/LemmaSampleStream.java | 1 - .../opennlp/tools/lemmatizer/LemmatizerME.java | 35 ++++++++++++++++---- 3 files changed, 30 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java index 13f28b2..9390376 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java @@ -72,10 +72,8 @@ public class LemmatizerMETool extends BasicCmdLineTool { continue; } - String[] preds = lemmatizer.lemmatize(posSample.getSentence(), + String[] lemmas = lemmatizer.lemmatize(posSample.getSentence(), posSample.getTags()); - String[] lemmas = lemmatizer.decodeLemmas(posSample.getSentence(), - preds); System.out.println(new LemmaSample(posSample.getSentence(), posSample.getTags(), 
lemmas).toString()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java index 0704026..9c661a5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java @@ -50,7 +50,6 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> { else { toks.add(parts[0]); tags.add(parts[1]); - //String ses = StringUtil.getShortestEditScript(parts[0], parts[2]); preds.add(parts[2]); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/53cd0ddf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java index bb6a0b6..4ee924b 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java @@ -18,10 +18,7 @@ package opennlp.tools.lemmatizer; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import opennlp.tools.ml.BeamSearch; import opennlp.tools.ml.EventModelSequenceTrainer; @@ -47,6 +44,7 @@ import opennlp.tools.util.TrainingParameters; */ public class LemmatizerME implements Lemmatizer { + public static final int LEMMA_NUMBER = 29; public static final int DEFAULT_BEAM_SIZE = 3; protected int beamSize; private Sequence bestSequence; @@ -93,7 +91,14 @@ public class LemmatizerME implements Lemmatizer 
{ @Override public List<List<String>> lemmatize(List<String> toks, List<String> tags) { - return null; + String[] tokens = toks.toArray(new String[toks.size()]); + String[] posTags = tags.toArray(new String[tags.size()]); + String[][] allLemmas = predictLemmas(LEMMA_NUMBER, tokens, posTags); + List<List<String>> predictedLemmas = new ArrayList<>(); + for (int i = 0; i < allLemmas.length; i++) { + predictedLemmas.add(Arrays.asList(allLemmas[i])); + } + return predictedLemmas; } /** @@ -109,6 +114,25 @@ public class LemmatizerME implements Lemmatizer { } /** + * Predict all possible lemmas (using a default upper bound). + * @param numLemmas the default number of lemmas + * @param toks the tokens + * @param tags the postags + * @return a two-dimensional array containing all possible lemmas for each token and postag pair + */ + public String[][] predictLemmas(int numLemmas, String[] toks, String[] tags) { + Sequence[] bestSequences = model.bestSequences(numLemmas, toks, new Object[] {tags}, + contextGenerator, sequenceValidator); + String[][] allLemmas = new String[bestSequences.length][]; + for (int i = 0; i < allLemmas.length; i++) { + List<String> ses = bestSequences[i].getOutcomes(); + String[] sesArray = ses.toArray(new String[ses.size()]); + allLemmas[i] = decodeLemmas(toks,sesArray); + } + return allLemmas; + } + + /** * Decodes the lemma from the word and the induced lemma class. * @param toks the array of tokens * @param preds the predicted lemma classes @@ -118,7 +142,6 @@ public class LemmatizerME implements Lemmatizer { List<String> lemmas = new ArrayList<>(); for (int i = 0; i < toks.length; i++) { String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]); - //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " + preds[i] + " " + lemma); if (lemma.length() == 0) { lemma = "_"; }
