opennlp git commit: Updates Morfologik add-on with 1.7.0 interfaces

colen Tue, 27 Dec 2016 19:17:54 -0800

Repository: opennlp
Updated Branches:
  refs/heads/902 486b88079 -> 001b97068



Updates Morfologik add-on with 1.7.0 interfaces

The Morfologik add-on was not compatible with the latest OpenNLP code. This 
commit also simplifies the implementation of the wrapper; the previous code 
was somewhat language-specific.

See issue OPENNLP-902


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/001b9706
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/001b9706
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/001b9706

Branch: refs/heads/902
Commit: 001b970685ef0cb3904d2d8b0b2dfc2462eed870
Parents: 486b880
Author: William Colen <[email protected]>
Authored: Wed Dec 28 01:17:13 2016 -0200
Committer: William Colen <[email protected]>
Committed: Wed Dec 28 01:17:13 2016 -0200

----------------------------------------------------------------------
 .../builder/XMLDictionaryToTableTool.java       |   2 +-
 .../lemmatizer/MorfologikLemmatizer.java        |  86 +++++++++----------
 .../builder/POSDictionayBuilderTest.java        |  30 ++++++-
 .../lemmatizer/MorfologikLemmatizerTest.java    |  42 +++++++--
 .../tagdict/POSTaggerFactoryTest.java           |  28 ++++--
 .../src/test/resources/dictionaryWithLemma.dict | Bin 0 -> 223 bytes
 .../src/test/resources/dictionaryWithLemma.txt  |  10 ++-
 7 files changed, 129 insertions(+), 69 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
 
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index ef6668e..f3108a4 100644
--- 
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ 
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -41,7 +41,7 @@ public class XMLDictionaryToTableTool extends 
BasicCmdLineTool {
   private String SEPARATOR;
 
   public String getShortDescription() {
-    return "reads an OpenNLP XML tag dictionary and outputs it in a tab 
separated file";
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tabular 
file";
   }
 
   public String getHelp() {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
 
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 2798e42..489b6fc 100644
--- 
a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ 
b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -20,11 +20,9 @@ package opennlp.morfologik.lemmatizer;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 
@@ -32,66 +30,62 @@ import morfologik.stemming.Dictionary;
 import morfologik.stemming.DictionaryLookup;
 import morfologik.stemming.IStemmer;
 import morfologik.stemming.WordData;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import opennlp.tools.lemmatizer.Lemmatizer;
 
-public class MorfologikLemmatizer implements DictionaryLemmatizer {
+public class MorfologikLemmatizer implements Lemmatizer {
 
   private IStemmer dictLookup;
-  public final Set<String> constantTags = new HashSet<>(Arrays.asList("NNP", 
"NP00000"));
 
   public MorfologikLemmatizer(Path dictionaryPath) throws 
IllegalArgumentException,
       IOException {
     dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
   }
 
-  private Map<List<String>, String> getLemmaTagsDict(String word) {
-    List<WordData> wdList = dictLookup.lookup(word);
-    Map<List<String>, String> dictMap = new HashMap<>();
-    for (WordData wd : wdList) {
-      List<String> wordLemmaTags = new ArrayList<>();
-      wordLemmaTags.add(word);
-      wordLemmaTags.add(wd.getTag().toString());
-      dictMap.put(wordLemmaTags, wd.getStem().toString());
+  private List<String> lemmatize(String word, String postag) {
+    List<WordData> dictMap = dictLookup.lookup(word.toLowerCase());
+    Set<String> lemmas = new HashSet<>();
+    for (WordData wordData : dictMap) {
+      if(Objects.equals(postag, asString(wordData.getTag()))) {
+        lemmas.add(asString(wordData.getStem()));
+      }
     }
-    return dictMap;
+    return Collections.unmodifiableList(new ArrayList<>(lemmas));
   }
 
-  private List<String> getDictKeys(String word, String postag) {
-    List<String> keys = new ArrayList<>();
-    if (constantTags.contains(postag)) {
-      keys.addAll(Arrays.asList(word, postag));
-    } else {
-      keys.addAll(Arrays.asList(word.toLowerCase(), postag));
-    }
-    return keys;
+  private String asString(CharSequence tag) {
+    if(tag == null)
+      return null;
+    return tag.toString();
   }
 
-  private Map<List<String>, String> getDictMap(String word, String postag) {
-    Map<List<String>, String> dictMap;
-
-    if (constantTags.contains(postag)) {
-      dictMap = this.getLemmaTagsDict(word);
-    } else {
-      dictMap = this.getLemmaTagsDict(word.toLowerCase());
+  @Override
+  public String[] lemmatize(String[] toks, String[] tags) {
+    String[] lemmas = new String[toks.length];
+    for (int i = 0; i < toks.length; i++) {
+       List<String> l = lemmatize(toks[i],tags[i]);
+      if(l.size() > 0) {
+        lemmas[i] = l.get(0);
+      } else {
+        lemmas[i] = null;
+      }
     }
-    return dictMap;
+    return lemmas;
   }
+  
 
-  public String lemmatize(String word, String postag) {
-    String lemma;
-    List<String> keys = this.getDictKeys(word, postag);
-    Map<List<String>, String> dictMap = this.getDictMap(word, postag);
-    // lookup lemma as value of the map
-    String keyValue = dictMap.get(keys);
-    if (keyValue != null) {
-      lemma = keyValue;
-    } else if (constantTags.contains(postag)) {
-      lemma = word;
-    } else if (Objects.equals(word.toUpperCase(), word)) {
-      lemma = word;
-    } else {
-      lemma = word.toLowerCase();
+  /**
+   * Generates the lemmas for each token and its postag, returning the result as a 
list of possible lemmas per token.
+   *
+   * @param toks an array of the tokens
+   * @param tags an array of the pos tags
+   *
+   * @return a list of possible lemmas for each token in the sequence.
+   */
+  public List<List<String>> lemmatize(List<String> toks, List<String> tags) {
+    List<List<String>> lemmas = new ArrayList<>();
+    for (int i = 0; i < toks.size(); i++) {
+      lemmas.add(lemmatize(toks.get(i),tags.get(i)));
     }
-    return lemma;
+    return lemmas;
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 0a7ba48..4d450ba 100644
--- 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -20,14 +20,16 @@ package opennlp.morfologik.builder;
 import java.io.File;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+
+import org.junit.Test;
 
 import junit.framework.TestCase;
 import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
 
-import org.junit.Test;
-
 public class POSDictionayBuilderTest extends TestCase {
 
   @Test
@@ -54,5 +56,29 @@ public class POSDictionayBuilderTest extends TestCase {
     
     return builder.build(tabFilePath);
   }
+  
+  
+  public static void main(String[] args) throws Exception {
+
+    // Part 1: compile a FSA lemma dictionary 
+    
+    // we need the tabular dictionary. It is mandatory to have an info 
+    //  file with the same name, but with the .info extension
+    Path textLemmaDictionary = 
Paths.get("/Users/wcolen/git/opennlp/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt");
+    
+    // this will build a binary dictionary located in compiledLemmaDictionary
+    Path compiledLemmaDictionary = new MorfologikDictionayBuilder()
+        .build(textLemmaDictionary);
+    
+    // Part 2: load a MorfologikLemmatizer and use it
+    MorfologikLemmatizer lemmatizer = new 
MorfologikLemmatizer(compiledLemmaDictionary);
+    
+    String[] toks = {"casa", "casa"};
+    String[] tags = {"NOUN", "V"};
+    
+    String[] lemmas = lemmatizer.lemmatize(toks, tags);
+    System.out.println(Arrays.toString(lemmas)); // outputs [casa, casar]
+    
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 6b7525e..35757be 100644
--- 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -1,24 +1,50 @@
 package opennlp.morfologik.lemmatizer;
 
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.*;
 
 import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import java.util.Arrays;
+import java.util.List;
 
 import org.junit.Test;
 
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.Lemmatizer;
+
 public class MorfologikLemmatizerTest {
 
   @Test
   public void testLemmatizeInsensitive() throws Exception {
-    DictionaryLemmatizer dict = createDictionary(false);
+    Lemmatizer dict = createDictionary(false);
+    
+    
+    String[] toks = {"casa", "casa", "Casa"};
+    String[] tags = {"V", "NOUN", "PROP"};
+    
+    String[] lemmas = dict.lemmatize(toks, tags);
 
-    assertEquals("casar", dict.lemmatize("casa", "V"));
-    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+    assertEquals("casar", lemmas[0]);
+    assertEquals("casa", lemmas[1]);
 
-    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+    // lookup is case insensitive. There is no entry casa - prop
+    assertNull(lemmas[2]);
+
+  }
+  
+  @Test
+  public void testLemmatizeMultiLemma() throws Exception {
+    MorfologikLemmatizer dict = createDictionary(false);
+    
+    
+    String[] toks = {"foi"};
+    String[] tags = {"V"};
+    
+    List<List<String>> lemmas = dict.lemmatize(Arrays.asList(toks), 
Arrays.asList(tags));
+
+    
+    assertTrue(lemmas.get(0).contains("ir"));
+    assertTrue(lemmas.get(0).contains("ser"));
+    
 
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 7341a02..354b34c 100644
--- 
a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ 
b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,28 +17,31 @@
 
 package opennlp.morfologik.tagdict;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.nio.file.Path;
 
+import org.junit.Test;
+
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.postag.TagDictionary;
 import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelType;
 
-import org.junit.Test;
-
 /**
  * Tests for the {@link POSTaggerFactory} class.
  */
@@ -46,10 +49,19 @@ public class POSTaggerFactoryTest {
 
   private static ObjectStream<POSSample> createSampleStream()
       throws IOException {
-    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
-        .getResourceAsStream("AnnotatedSentences.txt");
+    MarkableFileInputStreamFactory sampleDataIn = new 
MarkableFileInputStreamFactory(
+        new File(POSTaggerFactory.class.getResource("/AnnotatedSentences.txt")
+            .getFile()));
+    
+
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, "UTF-8");
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
 
-    return new WordTagSampleStream((new InputStreamReader(in)));
+    return new WordTagSampleStream(lineStream);
   }
 
   static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict 
b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict
new file mode 100644
index 0000000..66288b0
Binary files /dev/null and 
b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict differ

http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git 
a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt 
b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
index 09d39e3..3e27a3c 100644
--- a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
@@ -1,11 +1,13 @@
+carro,carro,NOUN
 casa,casa,NOUN
-casar,casa,V
-casar,casar,V-INF
 Casa,Casa,PROP
 casa,casinha,NOUN
 casa,casona,NOUN
+casar,casa,V
+casar,casar,V-INF
+ir,foi,V
 menino,menina,NOUN
+menino,menininho,NOUN
 menino,menino,NOUN
 menino,meninão,NOUN
-menino,menininho,NOUN
-carro,carro,NOUN
\ No newline at end of file
+ser,foi,V

opennlp git commit: Updates Morfologik add-on with 1.7.0 interfaces

Reply via email to