OPENNLP-1061 Add functionality to DictionaryLemmatizer to output several lemmas for a given (word, postag) pair
closes #202 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a00624cf Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a00624cf Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a00624cf Branch: refs/heads/LangDetect Commit: a00624cf27791193be74a610723a9a0b0980d23f Parents: c434b3a Author: Rodrigo Agerri <rage...@apache.org> Authored: Tue May 16 12:35:22 2017 +0200 Committer: Jörn Kottmann <jo...@apache.org> Committed: Tue May 16 15:32:37 2017 +0200 ---------------------------------------------------------------------- .../tools/lemmatizer/DictionaryLemmatizer.java | 11 ++-- .../DictionaryLemmatizerMultiTest.java | 64 ++++++++++++++++++++ .../tools/lemmatizer/smalldictionarymulti.dict | 5 ++ 3 files changed, 76 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java index 37d488c..97d6854 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java @@ -29,7 +29,7 @@ import java.util.Map; /** * Lemmatize by simple dictionary lookup into a hashmap built from a file - * containing, for each line, word\tablemma\tabpostag. + * containing, for each line, word\tabpostag\tablemma. * @version 2014-07-08 */ public class DictionaryLemmatizer implements Lemmatizer { @@ -42,7 +42,9 @@ public class DictionaryLemmatizer implements Lemmatizer { /** * Construct a hashmap from the input tab separated dictionary. 
* - * The input file should have, for each line, word\tablemma\tabpostag + * The input file should have, for each line, word\tabpostag\tablemma. + * Alternatively, if multiple lemmas are possible for each word,postag pair, + * then the format should be word\tab\postag\tablemma01#lemma02#lemma03 * * @param dictionary * the input dictionary via inputstream @@ -54,7 +56,8 @@ public class DictionaryLemmatizer implements Lemmatizer { String line; while ((line = breader.readLine()) != null) { final String[] elems = line.split("\t"); - this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(elems[2])); + final String[] lemmas = elems[2].split("#"); + this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(lemmas)); } } @@ -137,7 +140,7 @@ public class DictionaryLemmatizer implements Lemmatizer { final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); - if (!keyValues.isEmpty()) { + if (keyValues != null && !keyValues.isEmpty()) { lemmasList.addAll(keyValues); } else { lemmasList.add("O"); http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java new file mode 100644 index 0000000..d29830b --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.lemmatizer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +public class DictionaryLemmatizerMultiTest { + + private static DictionaryLemmatizer dictionaryLemmatizer; + + @BeforeClass + public static void loadDictionary() throws Exception { + dictionaryLemmatizer = new DictionaryLemmatizer( + DictionaryLemmatizerTest.class.getResourceAsStream( + "/opennlp/tools/lemmatizer/smalldictionarymulti.dict") + ); + } + + @Test + public void testForNullPointerException() { + List<String> sentence = Arrays.asList("The","dogs","were","running","and","barking", + "down","the","street"); + List<String> sentencePOS = Arrays.asList("DT","NNS","VBD","VBG","CC","VBG","RP","DT","NN"); + List<List<String>> expectedLemmas = new ArrayList<>(); + expectedLemmas.add(Arrays.asList("the")); + expectedLemmas.add(Arrays.asList("dog")); + expectedLemmas.add(Arrays.asList("is")); + expectedLemmas.add(Arrays.asList("run,run")); + expectedLemmas.add(Arrays.asList("and")); + expectedLemmas.add(Arrays.asList("bark,bark")); + expectedLemmas.add(Arrays.asList("down")); + expectedLemmas.add(Arrays.asList("the")); + expectedLemmas.add(Arrays.asList("street")); + + List<List<String>> actualLemmas = dictionaryLemmatizer.lemmatize(sentence, sentencePOS); + + for (int i = 0; i < 
sentence.size(); i++) { + // don't compare cases where the word is not in the dictionary... + if (!actualLemmas.get(0).get(0).equals("O")) + Assert.assertEquals(expectedLemmas.get(i), actualLemmas.get(i)); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict new file mode 100644 index 0000000..b650a0b --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict @@ -0,0 +1,5 @@ +barking VBG bark#bark +dogs NNS dog +running VBG run#run +down RP down +street NN street \ No newline at end of file