Repository: opennlp Updated Branches: refs/heads/master 32afb6a8b -> 406021733
OPENNLP-1040: Add OntoNotes4 training data verification Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/40602173 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/40602173 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/40602173 Branch: refs/heads/master Commit: 406021733baf6cdd339d7b14a413b2ffeeaae42d Parents: 32afb6a Author: Jörn Kottmann <[email protected]> Authored: Fri Apr 21 12:57:19 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Mon Apr 24 12:49:20 2017 +0200 ---------------------------------------------------------------------- .../tools/eval/OntoNotes4NameFinderEval.java | 56 +++++++++++++++----- .../tools/eval/OntoNotes4ParserEval.java | 45 ++++++++++++---- .../tools/eval/OntoNotes4PosTaggerEval.java | 45 ++++++++++++---- 3 files changed, 116 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java index e0e3912..ef018cd 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java @@ -19,9 +19,13 @@ package opennlp.tools.eval; import java.io.File; import java.io.IOException; -import java.nio.charset.Charset; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import opennlp.tools.formats.DirectorySampleStream; @@ -37,9 +41,7 @@ import opennlp.tools.util.model.ModelUtil; public class OntoNotes4NameFinderEval { - private static void crossEval(TrainingParameters params, String type, double expectedScore) - throws IOException { - + private static ObjectStream<NameSample> createNameSampleStream() throws IOException { ObjectStream<File> documentStream = new DirectorySampleStream(new File( EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"), file -> { @@ -50,19 +52,49 @@ public class OntoNotes4NameFinderEval { return file.isDirectory(); }, true); - ObjectStream<NameSample> samples = new OntoNotesNameSampleStream(new FileToStringSampleStream( - documentStream, Charset.forName("UTF-8"))); + return new OntoNotesNameSampleStream(new FileToStringSampleStream( + documentStream, StandardCharsets.UTF_8)); + } + + private static void crossEval(TrainingParameters params, String type, double expectedScore) + throws IOException { + try (ObjectStream<NameSample> samples = createNameSampleStream()) { - TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null, - params, new TokenNameFinderFactory()); + TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null, + params, new TokenNameFinderFactory()); - if (type != null) { - samples = new NameSampleTypeFilter(new String[] {type}, samples); + ObjectStream<NameSample> filteredSamples; + if (type != null) { + filteredSamples = new NameSampleTypeFilter(new String[] {type}, samples); + } + else { + filteredSamples = samples; + } + + cv.evaluate(filteredSamples, 10); + + Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d); + } + } + + @BeforeClass + public static void verifyTrainingData() throws IOException { + MessageDigest digest; + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); } - cv.evaluate(samples, 10); + try (ObjectStream<NameSample> samples = createNameSampleStream()) { + NameSample sample; + while ((sample = samples.read()) != null) { + digest.update(sample.toString().getBytes(StandardCharsets.UTF_8)); + } - Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d); + Assert.assertEquals(new BigInteger("168206908604555450993491898907821588182"), + new BigInteger(1, digest.digest())); + } } @Test http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java index 2182957..3a5b30d 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java @@ -21,9 +21,13 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.nio.charset.Charset; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import opennlp.tools.formats.DirectorySampleStream; @@ -31,6 +35,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream; import opennlp.tools.formats.ontonotes.DocumentToLineStream; import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream; import opennlp.tools.parser.HeadRules; +import opennlp.tools.parser.Parse; import opennlp.tools.parser.ParserCrossValidator; import opennlp.tools.parser.ParserType; import opennlp.tools.parser.lang.en.HeadRulesTest; @@ -40,9 +45,7 @@ import opennlp.tools.util.model.ModelUtil; public class OntoNotes4ParserEval { - private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore) - throws IOException { - + private static ObjectStream<Parse> createParseSampleStream() throws IOException { ObjectStream<File> documentStream = new DirectorySampleStream(new File( EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"), file -> { @@ -53,15 +56,39 @@ public class OntoNotes4ParserEval { return file.isDirectory(); }, true); - OntoNotesParseSampleStream samples = new OntoNotesParseSampleStream( + return new OntoNotesParseSampleStream( new DocumentToLineStream(new FileToStringSampleStream( - documentStream, Charset.forName("UTF-8")))); + documentStream, StandardCharsets.UTF_8))); + } + + private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore) + throws IOException { + try (ObjectStream<Parse> samples = createParseSampleStream()) { + ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING); + cv.evaluate(samples, 10); + + Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d); + } + } - ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING); + @BeforeClass + public static void verifyTrainingData() throws IOException { + MessageDigest digest; + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } - cv.evaluate(samples, 10); + try (ObjectStream<Parse> samples = createParseSampleStream()) { + Parse sample; + while ((sample = samples.read()) != null) { + digest.update(sample.toString().getBytes(StandardCharsets.UTF_8)); + } - Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d); + Assert.assertEquals(new BigInteger("83833369887442127665956850482411800415"), + new BigInteger(1, digest.digest())); + } } @Test http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java index ab33568..b171978 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java @@ -19,9 +19,13 @@ package opennlp.tools.eval; import java.io.File; import java.io.IOException; -import java.nio.charset.Charset; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import opennlp.tools.formats.DirectorySampleStream; @@ -29,6 +33,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream; import opennlp.tools.formats.convert.ParseToPOSSampleStream; import opennlp.tools.formats.ontonotes.DocumentToLineStream; import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream; +import opennlp.tools.postag.POSSample; import opennlp.tools.postag.POSTaggerCrossValidator; import opennlp.tools.postag.POSTaggerFactory; import opennlp.tools.util.ObjectStream; @@ -37,9 +42,7 @@ import opennlp.tools.util.model.ModelUtil; public class OntoNotes4PosTaggerEval { - private static void crossEval(TrainingParameters params, double expectedScore) - throws IOException { - + private static ObjectStream<POSSample> createPOSSampleStream() throws IOException { ObjectStream<File> documentStream = new DirectorySampleStream(new File( EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"), file -> { @@ -50,16 +53,40 @@ public class OntoNotes4PosTaggerEval { return file.isDirectory(); }, true); - ParseToPOSSampleStream samples = new ParseToPOSSampleStream(new OntoNotesParseSampleStream( + return new ParseToPOSSampleStream(new OntoNotesParseSampleStream( new DocumentToLineStream( - new FileToStringSampleStream(documentStream, Charset.forName("UTF-8"))))); + new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8)))); + } - POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory()); - cv.evaluate(samples, 10); + private static void crossEval(TrainingParameters params, double expectedScore) + throws IOException { + try (ObjectStream<POSSample> samples = createPOSSampleStream()) { + POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory()); + cv.evaluate(samples, 10); - Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d); + Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d); + } } + @BeforeClass + public static void verifyTrainingData() throws IOException { + MessageDigest digest; + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + + try (ObjectStream<POSSample> samples = createPOSSampleStream()) { + POSSample sample; + while ((sample = samples.read()) != null) { + digest.update(sample.toString().getBytes(StandardCharsets.UTF_8)); + } + + Assert.assertEquals(new BigInteger("300430765214895870888056958221353356972"), + new BigInteger(1, digest.digest())); + } + } @Test public void evalEnglishMaxentTagger() throws IOException { crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9698145168879707d);
