OPENNLP-958: Add POS Name Finder feature generator closes apache/opennlp#170
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ab6698b Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ab6698b Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ab6698b Branch: refs/heads/LangDetect Commit: 3ab6698b6ae590ae1ac5758d988837d89ea74b3e Parents: 62d9fd2 Author: William D C M SILVA <co...@apache.org> Authored: Mon May 8 19:48:09 2017 -0300 Committer: William D C M SILVA <co...@apache.org> Committed: Mon May 8 19:48:09 2017 -0300 ---------------------------------------------------------------------- opennlp-docs/src/docbkx/namefinder.xml | 5 ++ .../java/opennlp/tools/parser/ParserModel.java | 52 +-------------- .../tools/util/featuregen/GeneratorFactory.java | 37 +++++++++++ .../POSTaggerNameFeatureGenerator.java | 68 ++++++++++++++++++++ .../util/model/ChunkerModelSerializer.java | 49 ++++++++++++++ .../tools/util/model/POSModelSerializer.java | 51 +++++++++++++++ .../tools/eval/OntoNotes4NameFinderEval.java | 63 ++++++++++++++++++ .../opennlp/tools/postag/POSTaggerMETest.java | 2 +- .../POSTaggerNameFeatureGeneratorTest.java | 45 +++++++++++++ .../opennlp/tools/eval/ner-en_pos-features.xml | 37 +++++++++++ 10 files changed, 358 insertions(+), 51 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-docs/src/docbkx/namefinder.xml ---------------------------------------------------------------------- diff --git a/opennlp-docs/src/docbkx/namefinder.xml b/opennlp-docs/src/docbkx/namefinder.xml index 2f68c47..abcd094 100644 --- a/opennlp-docs/src/docbkx/namefinder.xml +++ b/opennlp-docs/src/docbkx/namefinder.xml @@ -439,6 +439,11 @@ new NameFinderME(model);]]> <entry>no</entry> <entry>none</entry> </row> + <row> + <entry>tokenpos</entry> + <entry>no</entry> + <entry><emphasis>model</emphasis> is the file name of the POS Tagger model to use</entry> + </row> <row> 
<entry>wordcluster</entry> <entry>no</entry> http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java index 61ac401..c290d9f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java @@ -30,15 +30,14 @@ import java.util.Map; import java.util.Objects; import opennlp.tools.chunker.ChunkerModel; -import opennlp.tools.ml.BeamSearch; import opennlp.tools.ml.model.AbstractModel; import opennlp.tools.ml.model.MaxentModel; import opennlp.tools.postag.POSModel; import opennlp.tools.util.InvalidFormatException; -import opennlp.tools.util.Version; import opennlp.tools.util.model.ArtifactSerializer; import opennlp.tools.util.model.BaseModel; -import opennlp.tools.util.model.UncloseableInputStream; +import opennlp.tools.util.model.ChunkerModelSerializer; +import opennlp.tools.util.model.POSModelSerializer; /** * This is an abstract base class for {@link ParserModel} implementations. @@ -46,53 +45,6 @@ import opennlp.tools.util.model.UncloseableInputStream; // TODO: Model should validate the artifact map public class ParserModel extends BaseModel { - private static class POSModelSerializer implements ArtifactSerializer<POSModel> { - - public POSModel create(InputStream in) throws IOException { - POSModel posModel = new POSModel(new UncloseableInputStream(in)); - - // The 1.6.x models write the non-default beam size into the model itself. 
- // In 1.5.x the parser configured the beam size when the model was loaded, - // this is not possible anymore with the new APIs - Version version = posModel.getVersion(); - if (version.getMajor() == 1 && version.getMinor() == 5) { - if (posModel.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER) == null) { - posModel = new POSModel(posModel.getLanguage(), posModel.getPosModel(), 10, - null, posModel.getFactory()); - } - } - - return posModel; - } - - public void serialize(POSModel artifact, OutputStream out) - throws IOException { - artifact.serialize(out); - } - } - - private static class ChunkerModelSerializer implements ArtifactSerializer<ChunkerModel> { - - public ChunkerModel create(InputStream in) throws IOException { - - ChunkerModel model = new ChunkerModel(new UncloseableInputStream(in)); - - Version version = model.getVersion(); - if (version.getMajor() == 1 && version.getMinor() == 5) { - - model = new ChunkerModel(model.getLanguage(), model.getChunkerModel(), new ParserChunkerFactory()); - - } - - return model; - } - - public void serialize(ChunkerModel artifact, OutputStream out) - throws IOException { - artifact.serialize(out); - } - } - private static class HeadRulesSerializer implements ArtifactSerializer<opennlp.tools.parser.lang.en.HeadRules> { http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java index 5060961..11cad42 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java @@ -43,9 +43,11 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import opennlp.tools.dictionary.Dictionary; 
+import opennlp.tools.postag.POSModel; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.ext.ExtensionLoader; import opennlp.tools.util.model.ArtifactSerializer; +import opennlp.tools.util.model.POSModelSerializer; /** * Creates a set of feature generators based on a provided XML descriptor. @@ -607,6 +609,30 @@ public class GeneratorFactory { } } + + /** + * Creates {@link POSTaggerNameFeatureGenerator}s; registered for the {@code tokenpos} element. + * @see POSTaggerNameFeatureGenerator + */ + static class POSTaggerNameFeatureGeneratorFactory implements XmlFeatureGeneratorFactory { + + public AdaptiveFeatureGenerator create(Element generatorElement, + FeatureGeneratorResourceProvider resourceManager) + throws InvalidFormatException { + + String modelResourceKey = generatorElement.getAttribute("model"); + + POSModel model = (POSModel)resourceManager.getResource(modelResourceKey); + + return new POSTaggerNameFeatureGenerator(model); + + } + + static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) { + factoryMap.put("tokenpos", new POSTaggerNameFeatureGeneratorFactory()); + } + } + // TODO: We have to support custom resources here. How does it work ?! 
// Attributes get into a Map<String, String> properties @@ -678,6 +704,7 @@ public class GeneratorFactory { BrownClusterTokenClassFeatureGeneratorFactory.register(factories); BrownClusterBigramFeatureGeneratorFactory.register(factories); CustomFeatureGeneratorFactory.register(factories); + POSTaggerNameFeatureGeneratorFactory.register(factories); } /** @@ -820,6 +847,16 @@ public class GeneratorFactory { break; } } + + String modelName = xmlElement.getAttribute("model"); + if (modelName != null) { + + switch (xmlElement.getTagName()) { + case "tokenpos": + mapping.put(modelName, new POSModelSerializer()); + break; + } + } } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java new file mode 100644 index 0000000..39c6335 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package opennlp.tools.util.featuregen; + +import java.util.List; +import java.util.Objects; + +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTagger; +import opennlp.tools.postag.POSTaggerME; + +/** + * Adds the POS tag of the current token as a {@code pos=<tag>} feature. Requires a POS tagger or model. + */ +public class POSTaggerNameFeatureGenerator implements AdaptiveFeatureGenerator { + + private POSTagger posTagger; + + private String[] cachedTokens; + private String[] cachedTags; + + /** + * Initializes a new instance. + * + * @param aPosTagger a POSTagger implementation. + */ + public POSTaggerNameFeatureGenerator(POSTagger aPosTagger) { + this.posTagger = aPosTagger; + } + + /** + * Initializes a new instance. + * + * @param aPosModel a POSTagger model. + */ + public POSTaggerNameFeatureGenerator(POSModel aPosModel) { + + this.posTagger = new POSTaggerME(aPosModel); + } + + /** Adds {@code pos=<tag>} for {@code toks[index]}; re-tags only for a new array instance. */ + public void createFeatures(List<String> feats, String[] toks, int index, String[] preds) { + if (!Objects.equals(this.cachedTokens, toks)) { + this.cachedTokens = toks; + this.cachedTags = this.posTagger.tag(toks); + } + + feats.add("pos=" + this.cachedTags[index]); + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java new file mode 100644 index 0000000..c32cc69 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.model; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import opennlp.tools.chunker.ChunkerModel; +import opennlp.tools.parser.ParserChunkerFactory; +import opennlp.tools.util.Version; + +/** Reads and writes {@link ChunkerModel}s; 1.5.x models are re-created with {@link ParserChunkerFactory}. */ +public class ChunkerModelSerializer implements ArtifactSerializer<ChunkerModel> { + + public ChunkerModel create(InputStream in) throws IOException { + + ChunkerModel model = new ChunkerModel(new UncloseableInputStream(in)); + + Version version = model.getVersion(); + if (version.getMajor() == 1 && version.getMinor() == 5) { + + model = new ChunkerModel(model.getLanguage(), model.getChunkerModel(), new ParserChunkerFactory()); + + } + + return model; + } + + public void serialize(ChunkerModel artifact, OutputStream out) + throws IOException { + artifact.serialize(out); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java new file mode 100644 index 0000000..a82319c --- /dev/null +++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.model; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import opennlp.tools.ml.BeamSearch; +import opennlp.tools.postag.POSModel; +import opennlp.tools.util.Version; + +public class POSModelSerializer implements ArtifactSerializer<POSModel> { + + public POSModel create(InputStream in) throws IOException { + POSModel posModel = new POSModel(new UncloseableInputStream(in)); + + // The 1.6.x models write the non-default beam size into the model itself. 
+ // In 1.5.x the parser configured the beam size when the model was loaded, + // this is not possible anymore with the new APIs + Version version = posModel.getVersion(); + if (version.getMajor() == 1 && version.getMinor() == 5) { + if (posModel.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER) == null) { + posModel = new POSModel(posModel.getLanguage(), posModel.getPosModel(), 10, + null, posModel.getFactory()); + } + } + + return posModel; + } + + public void serialize(POSModel artifact, OutputStream out) + throws IOException { + artifact.serialize(out); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java index ac9509c..a696787 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java @@ -17,17 +17,27 @@ package opennlp.tools.eval; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.math.BigInteger; +import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Map; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool; import opennlp.tools.formats.DirectorySampleStream; import opennlp.tools.formats.convert.FileToStringSampleStream; import 
opennlp.tools.formats.ontonotes.OntoNotesNameSampleStream; @@ -117,4 +127,57 @@ public class OntoNotes4NameFinderEval { params.put("Threads", "4"); crossEval(params, null, 0.8014054850253551d); } + + @Test + public void evalAllTypesWithPOSNameFinder() throws IOException { + TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + params.put("Threads", "4"); + + // load the feature generator + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + try (InputStream in = this.getClass().getResourceAsStream( + "ner-en_pos-features.xml")) { + byte[] buf = new byte[1024]; + int len; + while ((len = in.read(buf)) > 0) { + bytes.write(buf, 0, len); + } + } + catch (IOException e) { + throw new IllegalStateException("Failed reading from ner-default-features.xml file on classpath!"); + } + + byte[] featureGen = bytes.toByteArray(); + + // create a temp resource folder and copy the pos model there + Path resourcesPath = Files.createTempDirectory("opennlp_resources"); + Files.copy(new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin").toPath(), + new File(resourcesPath.toFile(), "en-pos-perceptron.bin").toPath(), + StandardCopyOption.REPLACE_EXISTING); + + Map<String, Object> resources; + + try { + resources = TokenNameFinderTrainerTool.loadResources(resourcesPath.toFile(), + Paths.get(this.getClass().getResource("ner-en_pos-features.xml").toURI()).toFile()); + } + catch (IOException | URISyntaxException e) { + throw new TerminateToolException(-1,"IO error while loading resources", e); + } + + + try (ObjectStream<NameSample> samples = createNameSampleStream()) { + + TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null, + params, featureGen, resources); + + ObjectStream<NameSample> filteredSamples; + + filteredSamples = samples; + + cv.evaluate(filteredSamples, 5); + + Assert.assertEquals(0.8044097625338349d, cv.getFMeasure().getFMeasure(), 0.001d); + } + } } 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java index 6d0785b..838150e 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java @@ -48,7 +48,7 @@ public class POSTaggerMETest { * * @return {@link POSModel} */ - static POSModel trainPOSModel(ModelType type) throws IOException { + public static POSModel trainPOSModel(ModelType type) throws IOException { TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, type.toString()); params.put(TrainingParameters.ITERATIONS_PARAM, 100); http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java new file mode 100644 index 0000000..0514c26 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util.featuregen; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.postag.POSTaggerMETest; +import opennlp.tools.util.model.ModelType; + +public class POSTaggerNameFeatureGeneratorTest { + + /** Checks that the first generated feature of every token in a sample sentence starts with "pos=". */ + @Test + public void testFeatureGeneration() throws IOException { + POSTaggerNameFeatureGenerator fg = new POSTaggerNameFeatureGenerator( + POSTaggerMETest.trainPOSModel(ModelType.MAXENT)); + + String[] tokens = {"Hi", "Mike", ",", "it", "'s", "Stefanie", "Schmidt", "."}; + for (int i = 0; i < tokens.length; i++) { + List<String> feats = new ArrayList<>(); + fg.createFeatures(feats, tokens, i, null); + Assert.assertTrue(feats.get(0).startsWith("pos=")); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml new file mode 100644 index 0000000..b850904 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml @@ -0,0 +1,37 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<!-- Name finder feature generator configuration with POS tag features (tokenpos) --> +<generators> + <cache> + <generators> + <window prevLength = "2" nextLength = "2"> + <tokenclass/> + </window> + <window prevLength = "2" nextLength = "2"> + <token/> + </window> + <definition/> + <prevmap/> + <bigram/> + <sentence begin="true" end="false"/> + <tokenpos model="en-pos-perceptron.bin"/> + </generators> + </cache> +</generators> \ No newline at end of file