Repository: opennlp Updated Branches: refs/heads/master c0880fb68 -> d372ad1de
OPENNLP-1062: Add lemmatizer eval tests Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d372ad1d Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d372ad1d Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d372ad1d Branch: refs/heads/master Commit: d372ad1de8212901641ba4bf896188c58be1b017 Parents: c0880fb Author: Jörn Kottmann <[email protected]> Authored: Mon May 15 11:57:45 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Tue May 16 14:28:35 2017 +0200 ---------------------------------------------------------------------- .../formats/conllu/ConlluLemmaSampleStream.java | 2 +- .../tools/formats/conllu/ConlluTagset.java | 2 +- .../opennlp/tools/eval/Conll00ChunkerEval.java | 5 +- .../opennlp/tools/eval/ConllXPosTaggerEval.java | 6 +- .../test/java/opennlp/tools/eval/EvalUtil.java | 33 ++++++++ .../tools/eval/OntoNotes4NameFinderEval.java | 8 +- .../tools/eval/OntoNotes4ParserEval.java | 8 +- .../tools/eval/OntoNotes4PosTaggerEval.java | 8 +- .../tools/eval/SourceForgeModelEval.java | 39 ++++----- .../tools/eval/UniversalDependency20Eval.java | 84 ++++++++++++++++++++ 10 files changed, 143 insertions(+), 52 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java index 0782120..98ee48d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java @@ -29,7 +29,7 @@ public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence, private final ConlluTagset tagset; - ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) { + public ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) { super(samples); this.tagset = tagset; } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java index f49f3fd..3f6ee76 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java @@ -17,7 +17,7 @@ package opennlp.tools.formats.conllu; -enum ConlluTagset { +public enum ConlluTagset { U, X } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java index 8ac90d7..62d4a46 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java @@ -19,6 +19,7 @@ package opennlp.tools.eval; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; @@ -49,7 +50,7 @@ public class Conll00ChunkerEval { ObjectStream<ChunkSample> samples = new ChunkSampleStream( new PlainTextByLineStream( - new MarkableFileInputStreamFactory(trainFile), "UTF-8")); + new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8)); return ChunkerME.train("en", samples, params, new ChunkerFactory()); } @@ -58,7 +59,7 @@ public class Conll00ChunkerEval { double expectedFMeasure) throws IOException { ObjectStream<ChunkSample> samples = new ChunkSampleStream( - new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData), "UTF-8")); + new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8)); ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model)); evaluator.evaluate(samples); http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java index 600e599..af53878 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java @@ -19,7 +19,7 @@ package opennlp.tools.eval; import java.io.File; import java.io.IOException; -import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; @@ -59,7 +59,7 @@ public class ConllXPosTaggerEval { TrainingParameters params) throws IOException { ObjectStream<POSSample> samples = - new ConllXPOSSampleStream(new MarkableFileInputStreamFactory(trainFile), Charset.forName("UTF-8")); + new ConllXPOSSampleStream(new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8); return POSTaggerME.train(lang, samples, params, new POSTaggerFactory()); } @@ -68,7 +68,7 @@ public class ConllXPosTaggerEval { double expectedAccuracy) throws IOException { ObjectStream<POSSample> samples = new ConllXPOSSampleStream( - new MarkableFileInputStreamFactory(testData), Charset.forName("UTF-8")); + new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8); POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model)); evaluator.evaluate(samples); http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java index 45f2471..2b04afb 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java @@ -18,6 +18,15 @@ package opennlp.tools.eval; import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.math.BigInteger; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +import org.junit.Assert; import opennlp.tools.ml.maxent.quasinewton.QNTrainer; import opennlp.tools.ml.naivebayes.NaiveBayesTrainer; @@ -27,6 +36,8 @@ import opennlp.tools.util.model.ModelUtil; public class EvalUtil { + static final double ACCURACY_DELTA = 0.0001d; + static TrainingParameters createPerceptronParams() { TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, @@ -54,4 +65,26 @@ public class EvalUtil { public static File getOpennlpDataDir() { return new File(System.getProperty("OPENNLP_DATA_DIR")); } + + static MessageDigest createDigest() { + try { + return MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + } + + static void verifyFileChecksum(Path file, BigInteger checksum) throws IOException { + MessageDigest digest = createDigest(); + + try (InputStream in = Files.newInputStream(file)) { + byte[] buf = new byte[65536]; + int len; + while ((len = in.read(buf)) > 0) { + digest.update(buf, 0, len); + } + } + + Assert.assertEquals(checksum, new BigInteger(1, digest.digest())); + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java index a696787..d9f5ecd 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java @@ -29,7 +29,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.Map; import org.junit.Assert; @@ -89,12 +88,7 @@ public class OntoNotes4NameFinderEval { @BeforeClass public static void verifyTrainingData() throws IOException { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new IllegalStateException(e); - } + MessageDigest digest = EvalUtil.createDigest(); try (ObjectStream<NameSample> samples = createNameSampleStream()) { NameSample sample; http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java index f7e1046..5606b82 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java @@ -24,7 +24,6 @@ import java.io.InputStreamReader; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import org.junit.Assert; import org.junit.BeforeClass; @@ -73,12 +72,7 @@ public class OntoNotes4ParserEval { @BeforeClass public static void verifyTrainingData() throws IOException { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new IllegalStateException(e); - } + MessageDigest digest = EvalUtil.createDigest(); try (ObjectStream<Parse> samples = createParseSampleStream()) { Parse sample; http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java index 6236507..3ea7abe 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java @@ -22,7 +22,6 @@ import java.io.IOException; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import org.junit.Assert; import org.junit.BeforeClass; @@ -70,12 +69,7 @@ public class OntoNotes4PosTaggerEval { @BeforeClass public static void verifyTrainingData() throws IOException { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new IllegalStateException(e); - } + MessageDigest digest = EvalUtil.createDigest(); try (ObjectStream<POSSample> samples = createPOSSampleStream()) { POSSample sample; http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java index 25b6f54..24cdcd0 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java @@ -20,10 +20,8 @@ package opennlp.tools.eval; import java.io.File; import java.io.IOException; import java.math.BigInteger; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import org.junit.Assert; import org.junit.BeforeClass; @@ -83,25 +81,17 @@ import opennlp.tools.util.Span; */ public class SourceForgeModelEval { - private static MessageDigest createDigest() { - try { - return MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new IllegalStateException(e); - } - } - @BeforeClass public static void ensureTestDataIsCorrect() throws IOException { - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); try (ObjectStream<String> lines = new PlainTextByLineStream( new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), - "leipzig/eng_news_2010_300K-sentences.txt")), Charset.forName("UTF-8"))) { + "leipzig/eng_news_2010_300K-sentences.txt")), StandardCharsets.UTF_8)) { String line; while ((line = lines.read()) != null) { - digest.update(line.getBytes("UTF-8")); + digest.update(line.getBytes(StandardCharsets.UTF_8)); } Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"), @@ -115,7 +105,7 @@ public class SourceForgeModelEval { SentenceModel model = new SentenceModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin")); - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); SentenceDetector sentenceDetector = new SentenceDetectorME(model); @@ -134,7 +124,7 @@ public class SourceForgeModelEval { String[] sentences = sentenceDetector.sentDetect(text.toString()); for (String sentence : sentences) { - digest.update(sentence.getBytes("UTF-8")); + digest.update(sentence.getBytes(StandardCharsets.UTF_8)); } Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"), @@ -151,7 +141,7 @@ public class SourceForgeModelEval { TokenizerModel model = new TokenizerModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin")); - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); Tokenizer tokenizer = new TokenizerME(model); @@ -164,7 +154,7 @@ public class SourceForgeModelEval { while ((line = lines.read()) != null) { String[] tokens = tokenizer.tokenize(String.join(" ", line.getText())); for (String token : tokens) { - digest.update(token.getBytes("UTF-8")); + digest.update(token.getBytes(StandardCharsets.UTF_8)); } } } @@ -183,7 +173,7 @@ public class SourceForgeModelEval { private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash) throws IOException { - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); TokenNameFinder nameFinder = new NameFinderME(model); @@ -193,7 +183,8 @@ public class SourceForgeModelEval { while ((line = lines.read()) != null) { Span[] names = nameFinder.find(line.getText()); for (Span name : names) { - digest.update((name.getType() + name.getStart() + name.getEnd()).getBytes("UTF-8")); + digest.update((name.getType() + name.getStart() + + name.getEnd()).getBytes(StandardCharsets.UTF_8)); } } } @@ -260,7 +251,7 @@ public class SourceForgeModelEval { @Test public void evalChunkerModel() throws IOException { - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); POSTagger tagger = new POSTaggerME(new POSModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"))); @@ -276,7 +267,7 @@ public class SourceForgeModelEval { String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags()); for (String chunk : chunks) { - digest.update(chunk.getBytes("UTF-8")); + digest.update(chunk.getBytes(StandardCharsets.UTF_8)); } } } @@ -290,7 +281,7 @@ public class SourceForgeModelEval { // break the input stream into sentences // The input stream is tokenized and can be processed here directly - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); POSTagger tagger = new POSTaggerME(model); @@ -300,7 +291,7 @@ public class SourceForgeModelEval { while ((line = lines.read()) != null) { String[] tags = tagger.tag(line.getText()); for (String tag : tags) { - digest.update(tag.getBytes("UTF-8")); + digest.update(tag.getBytes(StandardCharsets.UTF_8)); } } } @@ -330,7 +321,7 @@ public class SourceForgeModelEval { ParserModel model = new ParserModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin")); - MessageDigest digest = createDigest(); + MessageDigest digest = EvalUtil.createDigest(); Parser parser = ParserFactory.create(model); http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java new file mode 100644 index 0000000..70fc8b0 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.eval; + +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import opennlp.tools.formats.conllu.ConlluLemmaSampleStream; +import opennlp.tools.formats.conllu.ConlluStream; +import opennlp.tools.formats.conllu.ConlluTagset; +import opennlp.tools.lemmatizer.LemmaSample; +import opennlp.tools.lemmatizer.LemmatizerEvaluator; +import opennlp.tools.lemmatizer.LemmatizerFactory; +import opennlp.tools.lemmatizer.LemmatizerME; +import opennlp.tools.lemmatizer.LemmatizerModel; +import opennlp.tools.util.MarkableFileInputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelUtil; + +public class UniversalDependency20Eval { + + private static File SPA_ANCORA_TRAIN = + new File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu"); + private static File SPA_ANCORA_DEV = + new File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu"); + + @BeforeClass + public static void ensureTestDataIsCorrect() throws IOException { + SourceForgeModelEval.ensureTestDataIsCorrect(); + + EvalUtil.verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(), + new BigInteger("224942804200733453179524127037951530195")); + EvalUtil.verifyFileChecksum(SPA_ANCORA_DEV.toPath(), + new BigInteger("280996187464384493180190898172297941708")); + } + + private static double trainAndEval(String lang, File trainFile, TrainingParameters params, + File evalFile) throws IOException { + ConlluTagset tagset = ConlluTagset.X; + + ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(new ConlluStream( + new MarkableFileInputStreamFactory(trainFile)), tagset); + + LemmatizerModel model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory()); + LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model)); + + evaluator.evaluate(new ConlluLemmaSampleStream(new ConlluStream( + new MarkableFileInputStreamFactory(evalFile)), tagset)); + + return evaluator.getWordAccuracy(); + } + + @Test + public void trainAndEvalSpanishAncora() throws IOException { + TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + params.put("Threads", "4"); + + double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN, + params, SPA_ANCORA_DEV); + + Assert.assertEquals(0.9046675934566091d, wordAccuracy, EvalUtil.ACCURACY_DELTA); + } +}
