Repository: opennlp Updated Branches: refs/heads/master e515ff474 -> 08e163ca5
OPENNLP-1092: Fix pos model serialization bug Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/08e163ca Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/08e163ca Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/08e163ca Branch: refs/heads/master Commit: 08e163ca5f07db9ad9460a686b7e5085b12d9128 Parents: e515ff4 Author: Jörn Kottmann <[email protected]> Authored: Mon Jun 26 16:20:09 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Wed Jun 28 11:06:58 2017 +0200 ---------------------------------------------------------------------- .../java/opennlp/tools/postag/POSModel.java | 9 +- .../namefind/TokenNameFinderModelTest.java | 104 +++++++++++++++++++ .../opennlp/tools/namefind/ner-pos-features.xml | 36 +++++++ 3 files changed, 148 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/08e163ca/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java index 95a41a8..d55921c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java @@ -35,6 +35,8 @@ import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.model.ArtifactSerializer; import opennlp.tools.util.model.BaseModel; import opennlp.tools.util.model.ByteArraySerializer; +import opennlp.tools.util.model.POSModelSerializer; +import opennlp.tools.util.model.SerializableArtifact; /** * The {@link POSModel} is the model used @@ -42,7 +44,7 @@ import opennlp.tools.util.model.ByteArraySerializer; * * @see POSTaggerME */ -public final class POSModel extends BaseModel { +public final class POSModel extends BaseModel implements SerializableArtifact { private static final String COMPONENT_NAME = "POSTaggerME"; static final String POS_MODEL_ENTRY_NAME = "pos.model"; @@ -178,4 +180,9 @@ public final class POSModel extends BaseModel { return getFactory().getDictionary(); return null; } + + @Override + public Class<POSModelSerializer> getArtifactSerializerClass() { + return POSModelSerializer.class; + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/08e163ca/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java new file mode 100644 index 0000000..9d58993 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderModelTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.namefind; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import java.util.stream.Collectors; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTaggerMETest; +import opennlp.tools.util.MockInputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelType; + +public class TokenNameFinderModelTest { + + @Test + public void testNERWithPOSModel() throws IOException { + + // create a resources folder + Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath(); + + // save a POS model there + POSModel posModel = POSTaggerMETest.trainPOSModel(ModelType.MAXENT); + File posModelFile = new File(resourcesFolder.toFile(),"pos-model.bin"); + FileOutputStream fos = new FileOutputStream(posModelFile); + + posModel.serialize(posModelFile); + + Assert.assertTrue(posModelFile.exists()); + + // load feature generator xml bytes + InputStream fgInputStream = this.getClass().getResourceAsStream("ner-pos-features.xml"); + BufferedReader buffers = new BufferedReader(new InputStreamReader(fgInputStream)); + String featureGeneratorString = buffers.lines(). + collect(Collectors.joining("\n")); + + // create a featuregenerator file + Path featureGenerator = Files.createTempFile("ner-featuregen", ".xml"); + Files.write(featureGenerator, featureGeneratorString.getBytes()); + + + Map<String, Object> resources; + try { + resources = TokenNameFinderTrainerTool.loadResources(resourcesFolder.toFile(), + featureGenerator.toAbsolutePath().toFile()); + } + catch (IOException e) { + throw new TerminateToolException(-1, e.getMessage(), e); + } + + + // train a name finder + ObjectStream<NameSample> sampleStream = new NameSampleDataStream( + new PlainTextByLineStream(new MockInputStreamFactory( + new File("opennlp/tools/namefind/voa1.train")), "UTF-8")); + + TrainingParameters params = new TrainingParameters(); + params.put(TrainingParameters.ITERATIONS_PARAM, 70); + params.put(TrainingParameters.CUTOFF_PARAM, 1); + + TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, + params, TokenNameFinderFactory.create(null, + featureGeneratorString.getBytes(), resources, new BioCodec())); + + + File model = File.createTempFile("nermodel", ".bin"); + FileOutputStream modelOut = new FileOutputStream(model); + nameFinderModel.serialize(modelOut); + + modelOut.close(); + + Assert.assertTrue(model.exists()); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/08e163ca/opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features.xml b/opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features.xml new file mode 100644 index 0000000..7600e38 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/namefind/ner-pos-features.xml @@ -0,0 +1,36 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<generators> + <cache> + <generators> + <window prevLength = "2" nextLength = "2"> + <tokenclass/> + </window> + <window prevLength = "2" nextLength = "2"> + <token/> + </window> + <window prevLength = "2" nextLength = "2"> + <tokenpos model="pos-model.bin"/> + </window> + <definition/> + <prevmap/> + <bigram/> + <sentence begin="true" end="false"/> + </generators> + </cache> +</generators> \ No newline at end of file
