Repository: opennlp Updated Branches: refs/heads/904 060a4d0f8 -> f65d4d3ef
OPENNLP-904 move encoding of lemmas to training logic Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f65d4d3e Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f65d4d3e Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f65d4d3e Branch: refs/heads/904 Commit: f65d4d3efdeefa1f757eaabb508b17e60d2ae847 Parents: 060a4d0 Author: Rodrigo Agerri <[email protected]> Authored: Sat Feb 4 17:58:55 2017 +0100 Committer: Rodrigo Agerri <[email protected]> Committed: Sat Feb 4 17:58:55 2017 +0100 ---------------------------------------------------------------------- .../tools/lemmatizer/LemmaSampleEventStream.java | 2 +- .../tools/lemmatizer/LemmaSampleSequenceStream.java | 12 ++++++------ .../opennlp/tools/lemmatizer/LemmaSampleStream.java | 5 ++--- .../java/opennlp/tools/lemmatizer/LemmatizerME.java | 14 +++++++++++++- .../opennlp/tools/lemmatizer/LemmatizerMETest.java | 3 +-- 5 files changed, 23 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java index fc1a558..a8d71e8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java @@ -49,7 +49,7 @@ public class LemmaSampleEventStream extends AbstractEventStream<LemmaSample> { List<Event> events = new ArrayList<>(); String[] toksArray = sample.getTokens(); String[] tagsArray = sample.getTags(); - String[] lemmasArray = sample.getLemmas(); + String[] lemmasArray = LemmatizerME.encodeLemmas(toksArray,sample.getLemmas()); for (int ei = 0, el = sample.getTokens().length; ei < el; ei++) { events.add(new Event(lemmasArray[ei], contextGenerator.getContext(ei,toksArray,tagsArray,lemmasArray))); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java index 7056538..d61daec 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java @@ -41,15 +41,15 @@ public class LemmaSampleSequenceStream implements SequenceStream { LemmaSample sample = samples.read(); if (sample != null) { - String sentence[] = sample.getTokens(); - String tags[] = sample.getTags(); - String preds[] = sample.getLemmas(); - Event[] events = new Event[sentence.length]; + String[] tokens = sample.getTokens(); + String[] tags = sample.getTags(); + String[] lemmas = LemmatizerME.encodeLemmas(tokens, sample.getLemmas()); + Event[] events = new Event[tokens.length]; - for (int i = 0; i < sentence.length; i++) { + for (int i = 0; i < tokens.length; i++) { // it is safe to pass the tags as previous tags because // the context generator does not look for non predicted tags - String[] context = contextGenerator.getContext(i, sentence, tags, preds); + String[] context = contextGenerator.getContext(i, tokens, tags, lemmas); events[i] = new Event(tags[i], context); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java index 0a133c3..0704026 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java @@ -23,7 +23,6 @@ import java.util.List; import opennlp.tools.util.FilterObjectStream; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.StringUtil; /** @@ -51,8 +50,8 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> { else { toks.add(parts[0]); tags.add(parts[1]); - String ses = StringUtil.getShortestEditScript(parts[0], parts[2]); - preds.add(ses); + //String ses = StringUtil.getShortestEditScript(parts[0], parts[2]); + preds.add(parts[2]); } } if (toks.size() > 0) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java index 04f5415..bb6a0b6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java @@ -114,7 +114,7 @@ public class LemmatizerME implements Lemmatizer { * @param preds the predicted lemma classes * @return the array of decoded lemmas */ - public String[] decodeLemmas(String[] toks, String[] preds) { + public static String[] decodeLemmas(String[] toks, String[] preds) { List<String> lemmas = new ArrayList<>(); for (int i = 0; i < toks.length; i++) { String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]); @@ -127,6 +127,18 @@ public class LemmatizerME implements Lemmatizer { return lemmas.toArray(new String[lemmas.size()]); } + public static String[] encodeLemmas(String[] toks, String[] lemmas) { + List<String> sesList = new ArrayList<>(); + for (int i = 0; i < toks.length; i++) { + String ses = StringUtil.getShortestEditScript(toks[i], lemmas[i]); + if (ses.length() == 0) { + ses = "_"; + } + sesList.add(ses); + } + return sesList.toArray(new String[sesList.size()]); + } + public Sequence[] topKSequences(String[] sentence, String[] tags) { return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { tags }, contextGenerator, sequenceValidator); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java index 76b4cd5..97dcc3c 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java @@ -82,8 +82,7 @@ public class LemmatizerMETest { @Test public void testLemmasAsArray() throws Exception { - String[] preds = lemmatizer.lemmatize(tokens, postags); - String[] lemmas = lemmatizer.decodeLemmas(tokens, preds); + String[] lemmas = lemmatizer.lemmatize(tokens, postags); Assert.assertArrayEquals(expect, lemmas); }
