opennlp git commit: OPENNLP-1040: Add OntoNotes4 training data verification

joern Mon, 24 Apr 2017 04:07:04 -0700

Repository: opennlp
Updated Branches:
  refs/heads/master 32afb6a8b -> 406021733



OPENNLP-1040: Add OntoNotes4 training data verification


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/40602173
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/40602173
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/40602173

Branch: refs/heads/master
Commit: 406021733baf6cdd339d7b14a413b2ffeeaae42d
Parents: 32afb6a
Author: Jörn Kottmann <[email protected]>
Authored: Fri Apr 21 12:57:19 2017 +0200
Committer: Jörn Kottmann <[email protected]>
Committed: Mon Apr 24 12:49:20 2017 +0200

----------------------------------------------------------------------
 .../tools/eval/OntoNotes4NameFinderEval.java    | 56 +++++++++++++++-----
 .../tools/eval/OntoNotes4ParserEval.java        | 45 ++++++++++++----
 .../tools/eval/OntoNotes4PosTaggerEval.java     | 45 ++++++++++++----
 3 files changed, 116 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java 
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index e0e3912..ef018cd 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -37,9 +41,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4NameFinderEval {
 
-  private static void crossEval(TrainingParameters params, String type, double 
expectedScore)
-      throws IOException {
-
+  private static ObjectStream<NameSample> createNameSampleStream() throws 
IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -50,19 +52,49 @@ public class OntoNotes4NameFinderEval {
           return file.isDirectory();
         }, true);
 
-    ObjectStream<NameSample> samples = new OntoNotesNameSampleStream(new 
FileToStringSampleStream(
-        documentStream, Charset.forName("UTF-8")));
+    return new OntoNotesNameSampleStream(new FileToStringSampleStream(
+        documentStream, StandardCharsets.UTF_8));
+  }
+
+  private static void crossEval(TrainingParameters params, String type, double 
expectedScore)
+      throws IOException {
+    try (ObjectStream<NameSample> samples = createNameSampleStream()) {
 
-    TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", 
null,
-        params, new TokenNameFinderFactory());
+      TokenNameFinderCrossValidator cv = new 
TokenNameFinderCrossValidator("en", null,
+          params, new TokenNameFinderFactory());
 
-    if (type != null) {
-      samples = new NameSampleTypeFilter(new String[] {type}, samples);
+      ObjectStream<NameSample> filteredSamples;
+      if (type != null) {
+        filteredSamples = new NameSampleTypeFilter(new String[] {type}, 
samples);
+      }
+      else {
+        filteredSamples = samples;
+      }
+
+      cv.evaluate(filteredSamples, 10);
+
+      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 
0.001d);
+    }
+  }
+
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
     }
 
-    cv.evaluate(samples, 10);
+    try (ObjectStream<NameSample> samples = createNameSampleStream()) {
+      NameSample sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
 
-    Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(new 
BigInteger("168206908604555450993491898907821588182"),
+          new BigInteger(1, digest.digest()));
+    }
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java 
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index 2182957..3a5b30d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -21,9 +21,13 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -31,6 +35,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.formats.ontonotes.DocumentToLineStream;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
 import opennlp.tools.parser.HeadRules;
+import opennlp.tools.parser.Parse;
 import opennlp.tools.parser.ParserCrossValidator;
 import opennlp.tools.parser.ParserType;
 import opennlp.tools.parser.lang.en.HeadRulesTest;
@@ -40,9 +45,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4ParserEval {
 
-  private static void crossEval(TrainingParameters params, HeadRules rules, 
double expectedScore)
-      throws IOException {
-
+  private static ObjectStream<Parse> createParseSampleStream() throws 
IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -53,15 +56,39 @@ public class OntoNotes4ParserEval {
           return file.isDirectory();
         }, true);
 
-    OntoNotesParseSampleStream samples = new OntoNotesParseSampleStream(
+    return new OntoNotesParseSampleStream(
         new DocumentToLineStream(new FileToStringSampleStream(
-            documentStream, Charset.forName("UTF-8"))));
+            documentStream, StandardCharsets.UTF_8)));
+  }
+
+  private static void crossEval(TrainingParameters params, HeadRules rules, 
double expectedScore)
+      throws IOException {
+    try (ObjectStream<Parse> samples = createParseSampleStream()) {
+      ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, 
ParserType.CHUNKING);
+      cv.evaluate(samples, 10);
+
+      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 
0.001d);
+    }
+  }
 
-    ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, 
ParserType.CHUNKING);
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
 
-    cv.evaluate(samples, 10);
+    try (ObjectStream<Parse> samples = createParseSampleStream()) {
+      Parse sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
 
-    Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(new 
BigInteger("83833369887442127665956850482411800415"),
+          new BigInteger(1, digest.digest()));
+    }
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java 
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index ab33568..b171978 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -29,6 +33,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.formats.convert.ParseToPOSSampleStream;
 import opennlp.tools.formats.ontonotes.DocumentToLineStream;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
+import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerCrossValidator;
 import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.util.ObjectStream;
@@ -37,9 +42,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4PosTaggerEval {
 
-  private static void crossEval(TrainingParameters params, double 
expectedScore)
-      throws IOException {
-
+  private static ObjectStream<POSSample> createPOSSampleStream() throws 
IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -50,16 +53,40 @@ public class OntoNotes4PosTaggerEval {
           return file.isDirectory();
         }, true);
 
-    ParseToPOSSampleStream samples = new ParseToPOSSampleStream(new 
OntoNotesParseSampleStream(
+    return new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
         new DocumentToLineStream(
-            new FileToStringSampleStream(documentStream, 
Charset.forName("UTF-8")))));
+            new FileToStringSampleStream(documentStream, 
StandardCharsets.UTF_8))));
+  }
 
-    POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new 
POSTaggerFactory());
-    cv.evaluate(samples, 10);
+  private static void crossEval(TrainingParameters params, double 
expectedScore)
+      throws IOException {
+    try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+      POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, 
new POSTaggerFactory());
+      cv.evaluate(samples, 10);
 
-    Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+      Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+    }
   }
 
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
+
+    try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+      POSSample sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
+
+      Assert.assertEquals(new 
BigInteger("300430765214895870888056958221353356972"),
+          new BigInteger(1, digest.digest()));
+    }
+  }
   @Test
   public void evalEnglishMaxentTagger() throws IOException {
     crossEval(ModelUtil.createDefaultTrainingParameters(), 
0.9698145168879707d);

opennlp git commit: OPENNLP-1040: Add OntoNotes4 training data verification

Reply via email to