Repository: opennlp
Updated Branches:
  refs/heads/904 060a4d0f8 -> f65d4d3ef


OPENNLP-904 move encoding of lemmas to training logic


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f65d4d3e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f65d4d3e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f65d4d3e

Branch: refs/heads/904
Commit: f65d4d3efdeefa1f757eaabb508b17e60d2ae847
Parents: 060a4d0
Author: Rodrigo Agerri <[email protected]>
Authored: Sat Feb 4 17:58:55 2017 +0100
Committer: Rodrigo Agerri <[email protected]>
Committed: Sat Feb 4 17:58:55 2017 +0100

----------------------------------------------------------------------
 .../tools/lemmatizer/LemmaSampleEventStream.java      |  2 +-
 .../tools/lemmatizer/LemmaSampleSequenceStream.java   | 12 ++++++------
 .../opennlp/tools/lemmatizer/LemmaSampleStream.java   |  5 ++---
 .../java/opennlp/tools/lemmatizer/LemmatizerME.java   | 14 +++++++++++++-
 .../opennlp/tools/lemmatizer/LemmatizerMETest.java    |  3 +--
 5 files changed, 23 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
index fc1a558..a8d71e8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java
@@ -49,7 +49,7 @@ public class LemmaSampleEventStream extends AbstractEventStream<LemmaSample> {
       List<Event> events = new ArrayList<>();
       String[] toksArray = sample.getTokens();
       String[] tagsArray = sample.getTags();
-      String[] lemmasArray = sample.getLemmas();
+      String[] lemmasArray = LemmatizerME.encodeLemmas(toksArray,sample.getLemmas());
       for (int ei = 0, el = sample.getTokens().length; ei < el; ei++) {
         events.add(new Event(lemmasArray[ei],
             contextGenerator.getContext(ei,toksArray,tagsArray,lemmasArray)));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
index 7056538..d61daec 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleSequenceStream.java
@@ -41,15 +41,15 @@ public class LemmaSampleSequenceStream implements SequenceStream {
     LemmaSample sample = samples.read();
 
     if (sample != null) {
-      String sentence[] = sample.getTokens();
-      String tags[] = sample.getTags();
-      String preds[] = sample.getLemmas();
-      Event[] events = new Event[sentence.length];
+      String[] tokens = sample.getTokens();
+      String[] tags = sample.getTags();
+      String[] lemmas = LemmatizerME.encodeLemmas(tokens, sample.getLemmas());
+      Event[] events = new Event[tokens.length];
 
-      for (int i = 0; i < sentence.length; i++) {
+      for (int i = 0; i < tokens.length; i++) {
         // it is safe to pass the tags as previous tags because
         // the context generator does not look for non predicted tags
-        String[] context = contextGenerator.getContext(i, sentence, tags, preds);
+        String[] context = contextGenerator.getContext(i, tokens, tags, lemmas);
 
         events[i] = new Event(tags[i], context);
       }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
index 0a133c3..0704026 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
@@ -23,7 +23,6 @@ import java.util.List;
 
 import opennlp.tools.util.FilterObjectStream;
 import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.StringUtil;
 
 
 /**
@@ -51,8 +50,8 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> {
       else {
         toks.add(parts[0]);
         tags.add(parts[1]);
-        String ses = StringUtil.getShortestEditScript(parts[0], parts[2]);
-        preds.add(ses);
+        //String ses = StringUtil.getShortestEditScript(parts[0], parts[2]);
+        preds.add(parts[2]);
       }
     }
     if (toks.size() > 0) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index 04f5415..bb6a0b6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -114,7 +114,7 @@ public class LemmatizerME implements Lemmatizer {
    * @param preds the predicted lemma classes
    * @return the array of decoded lemmas
    */
-  public String[] decodeLemmas(String[] toks, String[] preds) {
+  public static String[] decodeLemmas(String[] toks, String[] preds) {
     List<String> lemmas = new ArrayList<>();
     for (int i = 0; i < toks.length; i++) {
       String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]);
@@ -127,6 +127,18 @@ public class LemmatizerME implements Lemmatizer {
     return lemmas.toArray(new String[lemmas.size()]);
   }
 
+  public static String[] encodeLemmas(String[] toks, String[] lemmas) {
+    List<String> sesList = new ArrayList<>();
+    for (int i = 0; i < toks.length; i++) {
+      String ses = StringUtil.getShortestEditScript(toks[i], lemmas[i]);
+      if (ses.length() == 0) {
+        ses = "_";
+      }
+      sesList.add(ses);
+    }
+    return sesList.toArray(new String[sesList.size()]);
+  }
+
   public Sequence[] topKSequences(String[] sentence, String[] tags) {
     return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
         new Object[] { tags }, contextGenerator, sequenceValidator);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f65d4d3e/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index 76b4cd5..97dcc3c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -82,8 +82,7 @@ public class LemmatizerMETest {
   @Test
   public void testLemmasAsArray() throws Exception {
 
-    String[] preds = lemmatizer.lemmatize(tokens, postags);
-    String[] lemmas = lemmatizer.decodeLemmas(tokens, preds);
+    String[] lemmas = lemmatizer.lemmatize(tokens, postags);
 
     Assert.assertArrayEquals(expect, lemmas);
   }

Reply via email to