Repository: opennlp Updated Branches: refs/heads/902 486b88079 -> 001b97068
Updates Morfologik add-on with 1.7.0 interfaces The Morfologik add-on was not compatible with the latest OpenNLP code. This also simplifies the implementation of the wrapper. Previous code was a little language specific. See issue OPENNLP-902 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/001b9706 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/001b9706 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/001b9706 Branch: refs/heads/902 Commit: 001b970685ef0cb3904d2d8b0b2dfc2462eed870 Parents: 486b880 Author: William Colen <[email protected]> Authored: Wed Dec 28 01:17:13 2016 -0200 Committer: William Colen <[email protected]> Committed: Wed Dec 28 01:17:13 2016 -0200 ---------------------------------------------------------------------- .../builder/XMLDictionaryToTableTool.java | 2 +- .../lemmatizer/MorfologikLemmatizer.java | 86 +++++++++---------- .../builder/POSDictionayBuilderTest.java | 30 ++++++- .../lemmatizer/MorfologikLemmatizerTest.java | 42 +++++++-- .../tagdict/POSTaggerFactoryTest.java | 28 ++++-- .../src/test/resources/dictionaryWithLemma.dict | Bin 0 -> 223 bytes .../src/test/resources/dictionaryWithLemma.txt | 10 ++- 7 files changed, 129 insertions(+), 69 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java index ef6668e..f3108a4 100644 --- a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java @@ -41,7 +41,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool { private String SEPARATOR; public String getShortDescription() { - return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file"; + return "reads an OpenNLP XML tag dictionary and outputs it in a tabular file"; } public String getHelp() { http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java index 2798e42..489b6fc 100644 --- a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java +++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java @@ -20,11 +20,9 @@ package opennlp.morfologik.lemmatizer; import java.io.IOException; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; +import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Objects; import java.util.Set; @@ -32,66 +30,62 @@ import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.IStemmer; import morfologik.stemming.WordData; -import opennlp.tools.lemmatizer.DictionaryLemmatizer; +import opennlp.tools.lemmatizer.Lemmatizer; -public class MorfologikLemmatizer implements DictionaryLemmatizer { +public class MorfologikLemmatizer implements Lemmatizer { private IStemmer dictLookup; - public final Set<String> constantTags = new HashSet<>(Arrays.asList("NNP", "NP00000")); public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException, IOException { dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath)); } - private Map<List<String>, String> getLemmaTagsDict(String word) { - List<WordData> wdList = dictLookup.lookup(word); - Map<List<String>, String> dictMap = new HashMap<>(); - for (WordData wd : wdList) { - List<String> wordLemmaTags = new ArrayList<>(); - wordLemmaTags.add(word); - wordLemmaTags.add(wd.getTag().toString()); - dictMap.put(wordLemmaTags, wd.getStem().toString()); + private List<String> lemmatize(String word, String postag) { + List<WordData> dictMap = dictLookup.lookup(word.toLowerCase()); + Set<String> lemmas = new HashSet<>(); + for (WordData wordData : dictMap) { + if(Objects.equals(postag, asString(wordData.getTag()))) { + lemmas.add(asString(wordData.getStem())); + } } - return dictMap; + return Collections.unmodifiableList(new ArrayList<>(lemmas)); } - private List<String> getDictKeys(String word, String postag) { - List<String> keys = new ArrayList<>(); - if (constantTags.contains(postag)) { - keys.addAll(Arrays.asList(word, postag)); - } else { - keys.addAll(Arrays.asList(word.toLowerCase(), postag)); - } - return keys; + private String asString(CharSequence tag) { + if(tag == null) + return null; + return tag.toString(); } - private Map<List<String>, String> getDictMap(String word, String postag) { - Map<List<String>, String> dictMap; - - if (constantTags.contains(postag)) { - dictMap = this.getLemmaTagsDict(word); - } else { - dictMap = this.getLemmaTagsDict(word.toLowerCase()); + @Override + public String[] lemmatize(String[] toks, String[] tags) { + String[] lemmas = new String[toks.length]; + for (int i = 0; i < toks.length; i++) { + List<String> l = lemmatize(toks[i],tags[i]); + if(l.size() > 0) { + lemmas[i] = l.get(0); + } else { + lemmas[i] = null; + } } - return dictMap; + return lemmas; } + - public String lemmatize(String word, String postag) { - String lemma; - List<String> keys = this.getDictKeys(word, postag); - Map<List<String>, String> dictMap = this.getDictMap(word, postag); - // lookup lemma as value of the map - String keyValue = dictMap.get(keys); - if (keyValue != null) { - lemma = keyValue; - } else if (constantTags.contains(postag)) { - lemma = word; - } else if (Objects.equals(word.toUpperCase(), word)) { - lemma = word; - } else { - lemma = word.toLowerCase(); + /** + * Generates a lemma tags for the word and postag returning the result in list of possible lemmas. + * + * @param toks an array of the tokens + * @param tags an array of the pos tags + * + * @return an list of possible lemmas for each token in the sequence. + */ + public List<List<String>> lemmatize(List<String> toks, List<String> tags) { + List<List<String>> lemmas = new ArrayList<>(); + for (int i = 0; i < toks.size(); i++) { + lemmas.add(lemmatize(toks.get(i),tags.get(i))); } - return lemma; + return lemmas; } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java index 0a7ba48..4d450ba 100644 --- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java @@ -20,14 +20,16 @@ package opennlp.morfologik.builder; import java.io.File; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.Arrays; + +import org.junit.Test; import junit.framework.TestCase; import morfologik.stemming.DictionaryMetadata; import opennlp.morfologik.lemmatizer.MorfologikLemmatizer; -import org.junit.Test; - public class POSDictionayBuilderTest extends TestCase { @Test @@ -54,5 +56,29 @@ public class POSDictionayBuilderTest extends TestCase { return builder.build(tabFilePath); } + + + public static void main(String[] args) throws Exception { + + // Part 1: compile a FSA lemma dictionary + + // we need the tabular dictionary. It is mandatory to have info + // file with same name, but .info extension + Path textLemmaDictionary = Paths.get("/Users/wcolen/git/opennlp/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt"); + + // this will build a binary dictionary located in compiledLemmaDictionary + Path compiledLemmaDictionary = new MorfologikDictionayBuilder() + .build(textLemmaDictionary); + + // Part 2: load a MorfologikLemmatizer and use it + MorfologikLemmatizer lemmatizer = new MorfologikLemmatizer(compiledLemmaDictionary); + + String[] toks = {"casa", "casa"}; + String[] tags = {"NOUN", "V"}; + + String[] lemmas = lemmatizer.lemmatize(toks, tags); + System.out.println(Arrays.toString(lemmas)); // outputs [casa, casar] + + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java index 6b7525e..35757be 100644 --- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java @@ -1,24 +1,50 @@ package opennlp.morfologik.lemmatizer; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; import java.nio.file.Path; - -import opennlp.morfologik.builder.POSDictionayBuilderTest; -import opennlp.tools.lemmatizer.DictionaryLemmatizer; +import java.util.Arrays; +import java.util.List; import org.junit.Test; +import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.lemmatizer.Lemmatizer; + public class MorfologikLemmatizerTest { @Test public void testLemmatizeInsensitive() throws Exception { - DictionaryLemmatizer dict = createDictionary(false); + Lemmatizer dict = createDictionary(false); + + + String[] toks = {"casa", "casa", "Casa"}; + String[] tags = {"V", "NOUN", "PROP"}; + + String[] lemmas = dict.lemmatize(toks, tags); - assertEquals("casar", dict.lemmatize("casa", "V")); - assertEquals("casa", dict.lemmatize("casa", "NOUN")); + assertEquals("casar", lemmas[0]); + assertEquals("casa", lemmas[1]); - assertEquals("casa", dict.lemmatize("Casa", "PROP")); + // lookup is case insensitive. There is no entry casa - prop + assertNull(lemmas[2]); + + } + + @Test + public void testLemmatizeMultiLemma() throws Exception { + MorfologikLemmatizer dict = createDictionary(false); + + + String[] toks = {"foi"}; + String[] tags = {"V"}; + + List<List<String>> lemmas = dict.lemmatize(Arrays.asList(toks), Arrays.asList(tags)); + + + assertTrue(lemmas.get(0).contains("ir")); + assertTrue(lemmas.get(0).contains("ser")); + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java index 7341a02..354b34c 100644 --- a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java @@ -17,28 +17,31 @@ package opennlp.morfologik.tagdict; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.nio.file.Path; +import org.junit.Test; + import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSSample; import opennlp.tools.postag.POSTaggerFactory; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.postag.TagDictionary; import opennlp.tools.postag.WordTagSampleStream; +import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; import opennlp.tools.util.model.ModelType; -import org.junit.Test; - /** * Tests for the {@link POSTaggerFactory} class. */ @@ -46,10 +49,19 @@ public class POSTaggerFactoryTest { private static ObjectStream<POSSample> createSampleStream() throws IOException { - InputStream in = POSTaggerFactoryTest.class.getClassLoader() - .getResourceAsStream("AnnotatedSentences.txt"); + MarkableFileInputStreamFactory sampleDataIn = new MarkableFileInputStreamFactory( + new File(POSTaggerFactory.class.getResource("/AnnotatedSentences.txt") + .getFile())); + + + ObjectStream<String> lineStream = null; + try { + lineStream = new PlainTextByLineStream(sampleDataIn, "UTF-8"); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } - return new WordTagSampleStream((new InputStreamReader(in))); + return new WordTagSampleStream(lineStream); } static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict new file mode 100644 index 0000000..66288b0 Binary files /dev/null and b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.dict differ http://git-wip-us.apache.org/repos/asf/opennlp/blob/001b9706/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt index 09d39e3..3e27a3c 100644 --- a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt +++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt @@ -1,11 +1,13 @@ +carro,carro,NOUN casa,casa,NOUN -casar,casa,V -casar,casar,V-INF Casa,Casa,PROP casa,casinha,NOUN casa,casona,NOUN +casar,casa,V +casar,casar,V-INF +ir,foi,V menino,menina,NOUN +menino,menininho,NOUN menino,menino,NOUN menino,meninão,NOUN -menino,menininho,NOUN -carro,carro,NOUN \ No newline at end of file +ser,foi,V
