Repository: opennlp Updated Branches: refs/heads/master 929595d2f -> b7d3abce5
OPENNLP-983: Make suffix/prefix length configurable This closes #121 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b7d3abce Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b7d3abce Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b7d3abce Branch: refs/heads/master Commit: b7d3abce569b5a4bf0ae39b24c6ac9920032db01 Parents: 929595d Author: jzonthemtn <jeff...@mtnfog.com> Authored: Mon Feb 13 07:57:21 2017 -0500 Committer: Jörn Kottmann <jo...@apache.org> Committed: Thu Feb 16 10:04:08 2017 +0100 ---------------------------------------------------------------------- .../tools/util/featuregen/GeneratorFactory.java | 22 ++++- .../util/featuregen/PrefixFeatureGenerator.java | 32 +++++-- .../util/featuregen/SuffixFeatureGenerator.java | 33 +++++-- .../featuregen/PrefixFeatureGeneratorTest.java | 92 ++++++++++++++++++++ .../featuregen/SuffixFeatureGeneratorTest.java | 92 ++++++++++++++++++++ 5 files changed, 251 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/b7d3abce/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java index fa97f43..ef08cfb 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java @@ -555,7 +555,16 @@ public class GeneratorFactory { public AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) { - return new PrefixFeatureGenerator(); + + String attribute = generatorElement.getAttribute("length"); + + int prefixLength = PrefixFeatureGenerator.DEFAULT_MAX_LENGTH; + + if (!Objects.equals(attribute, "")) { + prefixLength = Integer.parseInt(attribute); + } + + return new PrefixFeatureGenerator(prefixLength); } static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) { @@ -570,7 +579,16 @@ public class GeneratorFactory { public AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) { - return new SuffixFeatureGenerator(); + + String attribute = generatorElement.getAttribute("length"); + + int suffixLength = SuffixFeatureGenerator.DEFAULT_MAX_LENGTH; + + if (!Objects.equals(attribute, "")) { + suffixLength = Integer.parseInt(attribute); + } + + return new SuffixFeatureGenerator(suffixLength); } static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/b7d3abce/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java index 8cdd48f..04fcd15 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java @@ -21,21 +21,35 @@ import java.util.List; public class PrefixFeatureGenerator implements AdaptiveFeatureGenerator { - private static final int PREFIX_LENGTH = 4; - - private static String[] getPrefixes(String lex) { - String[] prefs = new String[PREFIX_LENGTH]; - for (int li = 0; li < PREFIX_LENGTH; li++) { - prefs[li] = lex.substring(0, Math.min(li + 1, lex.length())); - } - return prefs; + static final int DEFAULT_MAX_LENGTH = 4; + + private final int prefixLength; + + public PrefixFeatureGenerator() { + prefixLength = DEFAULT_MAX_LENGTH; + } + + public PrefixFeatureGenerator(int prefixLength) { + this.prefixLength = prefixLength; } + @Override public void createFeatures(List<String> features, String[] tokens, int index, String[] previousOutcomes) { - String[] prefs = PrefixFeatureGenerator.getPrefixes(tokens[index]); + String[] prefs = getPrefixes(tokens[index]); for (String pref : prefs) { features.add("pre=" + pref); } } + + private String[] getPrefixes(String lex) { + + int prefixes = Math.min(prefixLength, lex.length()); + + String[] prefs = new String[prefixes]; + for (int li = 0; li < prefixes; li++) { + prefs[li] = lex.substring(0, Math.min(li + 1, lex.length())); + } + return prefs; + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/b7d3abce/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java index a17fd47..c626fd9 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java @@ -21,21 +21,36 @@ import java.util.List; public class SuffixFeatureGenerator implements AdaptiveFeatureGenerator { - private static final int SUFFIX_LENGTH = 4; - - public static String[] getSuffixes(String lex) { - String[] suffs = new String[SUFFIX_LENGTH]; - for (int li = 0; li < SUFFIX_LENGTH; li++) { - suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0)); - } - return suffs; + static final int DEFAULT_MAX_LENGTH = 4; + + private final int suffixLength; + + public SuffixFeatureGenerator() { + suffixLength = DEFAULT_MAX_LENGTH; + } + + public SuffixFeatureGenerator(int suffixLength) { + this.suffixLength = suffixLength; } + @Override public void createFeatures(List<String> features, String[] tokens, int index, String[] previousOutcomes) { - String[] suffs = SuffixFeatureGenerator.getSuffixes(tokens[index]); + String[] suffs = getSuffixes(tokens[index]); for (String suff : suffs) { features.add("suf=" + suff); } } + + private String[] getSuffixes(String lex) { + + int suffixes = Math.min(suffixLength, lex.length()); + + String[] suffs = new String[suffixes]; + for (int li = 0; li < suffixes; li++) { + suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0)); + } + return suffs; + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/b7d3abce/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java new file mode 100644 index 0000000..5639174 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/PrefixFeatureGeneratorTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class PrefixFeatureGeneratorTest { + + private List<String> features; + + @Before + public void setUp() throws Exception { + features = new ArrayList<>(); + } + + @Test + public void lengthTest1() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 0; + int suffixLength = 2; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("pre=T", features.get(0)); + Assert.assertEquals("pre=Th", features.get(1)); + + } + + @Test + public void lengthTest2() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 3; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(5, features.size()); + Assert.assertEquals("pre=e", features.get(0)); + Assert.assertEquals("pre=ex", features.get(1)); + Assert.assertEquals("pre=exa", features.get(2)); + Assert.assertEquals("pre=exam", features.get(3)); + Assert.assertEquals("pre=examp", features.get(4)); + + } + + @Test + public void lengthTest3() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 1; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new PrefixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("pre=i", features.get(0)); + Assert.assertEquals("pre=is", features.get(1)); + + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/b7d3abce/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java new file mode 100644 index 0000000..fcb23a6 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/SuffixFeatureGeneratorTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class SuffixFeatureGeneratorTest { + + private List<String> features; + + @Before + public void setUp() throws Exception { + features = new ArrayList<>(); + } + + @Test + public void lengthTest1() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 0; + int suffixLength = 2; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("suf=s", features.get(0)); + Assert.assertEquals("suf=is", features.get(1)); + + } + + @Test + public void lengthTest2() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 3; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(5, features.size()); + Assert.assertEquals("suf=e", features.get(0)); + Assert.assertEquals("suf=le", features.get(1)); + Assert.assertEquals("suf=ple", features.get(2)); + Assert.assertEquals("suf=mple", features.get(3)); + Assert.assertEquals("suf=ample", features.get(4)); + + } + + @Test + public void lengthTest3() { + + String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"}; + + int testTokenIndex = 1; + int suffixLength = 5; + + AdaptiveFeatureGenerator generator = new SuffixFeatureGenerator(suffixLength); + + generator.createFeatures(features, testSentence, testTokenIndex, null); + + Assert.assertEquals(2, features.size()); + Assert.assertEquals("suf=s", features.get(0)); + Assert.assertEquals("suf=is", features.get(1)); + + } +}