This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new e7ce3bf OPENNLP-1118: Adds data verification for eval tests.
e7ce3bf is described below
commit e7ce3bf49bbdc7b81ba229736c9017bbc57029bf
Author: jzonthemtn <[email protected]>
AuthorDate: Wed Jul 19 11:33:20 2017 -0400
OPENNLP-1118: Adds data verification for eval tests.
---
.gitignore | 1 +
.../java/opennlp/tools/eval/AbstractEvalTest.java | 140 +++++++++++++++++++++
.../opennlp/tools/eval/ArvoresDeitadasEval.java | 46 ++++---
.../opennlp/tools/eval/Conll00ChunkerEval.java | 48 ++++---
.../opennlp/tools/eval/Conll02NameFinderEval.java | 111 ++++++++++------
.../opennlp/tools/eval/ConllXPosTaggerEval.java | 88 +++++++++----
.../src/test/java/opennlp/tools/eval/EvalUtil.java | 90 -------------
.../tools/eval/OntoNotes4NameFinderEval.java | 24 ++--
.../opennlp/tools/eval/OntoNotes4ParserEval.java | 21 +---
.../tools/eval/OntoNotes4PosTaggerEval.java | 23 ++--
.../opennlp/tools/eval/SourceForgeModelEval.java | 99 +++++++--------
.../tools/eval/UniversalDependency20Eval.java | 22 ++--
12 files changed, 412 insertions(+), 301 deletions(-)
diff --git a/.gitignore b/.gitignore
index 5d44bbd..81ef51f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ target
nbactions.xml
nb-configuration.xml
*.DS_Store
+.checkstyle
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
new file mode 100644
index 0000000..5f8865b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.Assert;
+
+import opennlp.tools.ml.maxent.quasinewton.QNTrainer;
+import opennlp.tools.ml.naivebayes.NaiveBayesTrainer;
+import opennlp.tools.ml.perceptron.PerceptronTrainer;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+public abstract class AbstractEvalTest {
+
+ public static final double ACCURACY_DELTA = 0.0001d;
+ public static final String HASH_ALGORITHM = "MD5";
+
+ public static void verifyTrainingData(ObjectStream<?> samples, BigInteger
checksum) throws Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ Object sample;
+ while ((sample = samples.read()) != null) {
+ digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+ }
+
+ samples.close();
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+
+ }
+
+ public static void verifyFileChecksum(Path file, BigInteger checksum) throws
Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ try (InputStream in = Files.newInputStream(file)) {
+ byte[] buf = new byte[65536];
+ int len;
+ while ((len = in.read(buf)) > 0) {
+ digest.update(buf, 0, len);
+ }
+ }
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+ }
+
+ public static void verifyDirectoryChecksum(Path path, String extension,
BigInteger checksum)
+ throws Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ final List<Path> paths = Files.walk(path)
+ .filter(Files::isRegularFile)
+ .filter(p -> p.toString().endsWith(extension))
+ .collect(Collectors.toList());
+
+ // Ensure the paths are in a consistent order when
+ // verifying the file checksums.
+ Collections.sort(paths);
+
+ for (Path p : paths) {
+ try (InputStream in = Files.newInputStream(p)) {
+ byte[] buf = new byte[65536];
+ int len;
+ while ((len = in.read(buf)) > 0) {
+ digest.update(buf, 0, len);
+ }
+ }
+ }
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+ }
+
+ public static File getOpennlpDataDir() throws FileNotFoundException {
+ final String dataDirectory = System.getProperty("OPENNLP_DATA_DIR");
+ if (StringUtil.isEmpty(dataDirectory)) {
+ throw new IllegalArgumentException("The OPENNLP_DATA_DIR is not set.");
+ }
+ final File file = new File(System.getProperty("OPENNLP_DATA_DIR"));
+ if (!file.exists()) {
+ throw new FileNotFoundException("The OPENNLP_DATA_DIR path of " +
dataDirectory + " was not found.");
+ }
+ return file;
+ }
+
+ public TrainingParameters createPerceptronParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ PerceptronTrainer.PERCEPTRON_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 0);
+ return params;
+ }
+
+ public TrainingParameters createMaxentQnParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ QNTrainer.MAXENT_QN_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 0);
+ return params;
+ }
+
+ public TrainingParameters createNaiveBayesParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ NaiveBayesTrainer.NAIVE_BAYES_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 5);
+ return params;
+ }
+
+}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
index 6ee3eb0..cd34046 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.chunker.ChunkerCrossValidator;
@@ -61,7 +63,7 @@ import opennlp.tools.util.model.ModelUtil;
* Bosque_CF_8.0.ad.txt.gz </a></li>
* </ul>
*/
-public class ArvoresDeitadasEval {
+public class ArvoresDeitadasEval extends AbstractEvalTest {
private static final String BOSQUE = "ad/Bosque_CF_8.0.ad.txt";
private static final String FLORESTA_VIRGEM =
"ad/FlorestaVirgem_CF_3.0_ad.txt";
@@ -71,10 +73,21 @@ public class ArvoresDeitadasEval {
private static ObjectStream<String> getLineSample(String corpus)
throws IOException {
return new PlainTextByLineStream(new MarkableFileInputStreamFactory(
- new File(EvalUtil.getOpennlpDataDir(), corpus)),
StandardCharsets.ISO_8859_1);
+ new File(getOpennlpDataDir(), corpus)), StandardCharsets.ISO_8859_1);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
- private static void sentenceCrossEval(TrainingParameters params,
+ verifyTrainingData(new ADSentenceSampleStream(getLineSample(BOSQUE),
false),
+ new BigInteger("140568367548727787313497336739085858596"));
+
+ verifyTrainingData(new
ADSentenceSampleStream(getLineSample(FLORESTA_VIRGEM), false),
+ new BigInteger("2614161133949079191933514776652602918"));
+
+ }
+
+ private void sentenceCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ADSentenceSampleStream samples = new ADSentenceSampleStream(
@@ -90,7 +103,7 @@ public class ArvoresDeitadasEval {
Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(),
0.0001d);
}
- private static void tokenizerCrossEval(TrainingParameters params,
+ private void tokenizerCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ObjectStream<NameSample> nameSamples = new ADNameSampleStream(
@@ -116,7 +129,7 @@ public class ArvoresDeitadasEval {
0.0001d);
}
- private static void chunkerCrossEval(TrainingParameters params,
+ private void chunkerCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ADChunkSampleStream samples = new
ADChunkSampleStream(getLineSample(BOSQUE));
@@ -130,7 +143,7 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseSentenceDetectorPerceptron() throws IOException {
- sentenceCrossEval(EvalUtil.createPerceptronParams(), 0.9892778840089301d);
+ sentenceCrossEval(createPerceptronParams(), 0.9892778840089301d);
}
@Test
@@ -140,17 +153,17 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseSentenceDetectorMaxentQn() throws IOException {
- sentenceCrossEval(EvalUtil.createMaxentQnParams(), 0.99261110833375d);
+ sentenceCrossEval(createMaxentQnParams(), 0.99261110833375d);
}
@Test
public void evalPortugueseSentenceDetectorNaiveBayes() throws IOException {
- sentenceCrossEval(EvalUtil.createNaiveBayesParams(), 0.9672196206048099d);
+ sentenceCrossEval(createNaiveBayesParams(), 0.9672196206048099d);
}
@Test
public void evalPortugueseTokenizerPerceptron() throws IOException {
- tokenizerCrossEval(EvalUtil.createPerceptronParams(), 0.9994887308380267d);
+ tokenizerCrossEval(createPerceptronParams(), 0.9994887308380267d);
}
@Test
@@ -160,23 +173,24 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseTokenizerMaxentQn() throws IOException {
- tokenizerCrossEval(EvalUtil.createMaxentQnParams(), 0.9996017148748251d);
+ tokenizerCrossEval(createMaxentQnParams(), 0.9996017148748251d);
}
@Test
public void evalPortugueseTokenizerNaiveBayes() throws IOException {
- tokenizerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9962358244502717d);
+ tokenizerCrossEval(createNaiveBayesParams(), 0.9962358244502717d);
}
+
@Test
public void evalPortugueseTokenizerMaxentQnMultipleThreads() throws
IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
tokenizerCrossEval(params, 0.9996017148748251d);
}
@Test
public void evalPortugueseChunkerPerceptron() throws IOException {
- chunkerCrossEval(EvalUtil.createPerceptronParams(),
+ chunkerCrossEval(createPerceptronParams(),
0.9638122825015589d);
}
@@ -195,13 +209,13 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseChunkerQn() throws IOException {
- chunkerCrossEval(EvalUtil.createMaxentQnParams(),
+ chunkerCrossEval(createMaxentQnParams(),
0.9648211936491359d);
}
@Test
public void evalPortugueseChunkerQnMultipleThreads() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
// NOTE: Should be the same as without multiple threads!!!
@@ -210,6 +224,6 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseChunkerNaiveBayes() throws IOException {
- chunkerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9041507736043933d);
+ chunkerCrossEval(createNaiveBayesParams(), 0.9041507736043933d);
}
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index f4712d9..80a0a74 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -45,8 +47,11 @@ import opennlp.tools.util.model.ModelUtil;
* <a href="http://www.cnts.ua.ac.be/conll2000/chunking/"> site </a>
* and decompress them into this directory: $OPENNLP_DATA_DIR/conll00.
*/
-public class Conll00ChunkerEval {
+public class Conll00ChunkerEval extends AbstractEvalTest {
+ private static File TEST_DATA_FILE;
+ private static File TRAIN_DATA_FILE;
+
private static ChunkerModel train(File trainFile, TrainingParameters params)
throws IOException {
@@ -68,38 +73,47 @@ public class Conll00ChunkerEval {
Assert.assertEquals(expectedFMeasure,
evaluator.getFMeasure().getFMeasure(), 0.0001);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ TEST_DATA_FILE = new File(getOpennlpDataDir(), "conll00/test.txt");
+ TRAIN_DATA_FILE = new File(getOpennlpDataDir(), "conll00/train.txt");
+
+ verifyTrainingData(new ChunkSampleStream(
+ new PlainTextByLineStream(new
MarkableFileInputStreamFactory(TEST_DATA_FILE),
+ StandardCharsets.UTF_8)),
+ new BigInteger("84610235226433393380477662908529306002"));
+
+ verifyTrainingData(new ChunkSampleStream(
+ new PlainTextByLineStream(new
MarkableFileInputStreamFactory(TEST_DATA_FILE),
+ StandardCharsets.UTF_8)),
+ new BigInteger("84610235226433393380477662908529306002"));
+
+ }
@Test
public void evalEnglishPerceptron() throws IOException {
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), EvalUtil.createPerceptronParams());
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE,
createPerceptronParams());
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9295018353434714d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9295018353434714d);
}
@Test
public void evalEnglishMaxentGis() throws IOException {
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), ModelUtil.createDefaultTrainingParameters());
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE,
ModelUtil.createDefaultTrainingParameters());
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9239687473746113d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9239687473746113d);
}
// Note: Don't try to run this on your MacBook
@Test
@Category(HighMemoryUsage.class)
public void evalEnglishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), params);
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE, params);
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9302599230947028d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9302599230947028d);
}
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
index c233686..fccc8e9 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
@@ -20,7 +20,10 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
+
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.Conll02NameSampleStream;
@@ -48,27 +51,17 @@ import opennlp.tools.util.model.ModelUtil;
* - Double check the encoding which is used to open the files. Currently that
is UTF-8.
* - Make the Conll02 reader compatible. Currently it doesn't work with
spanish data without pos tags.
*/
-public class Conll02NameFinderEval {
-
- private final File dutchTrainingFile;
- private final File dutchTestAFile;
- private final File dutchTestBFile;
+public class Conll02NameFinderEval extends AbstractEvalTest {
- private final File spanishTrainingFile;
- private final File spanishTestAFile;
- private final File spanishTestBFile;
+ private static File dutchTrainingFile;
+ private static File dutchTestAFile;
+ private static File dutchTestBFile;
+ private static File spanishTrainingFile;
+ private static File spanishTestAFile;
+ private static File spanishTestBFile;
+
- public Conll02NameFinderEval() {
- dutchTrainingFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.train");
- dutchTestAFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.testa");
- dutchTestBFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.testb");
-
- spanishTrainingFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.train");
- spanishTestAFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.testa");
- spanishTestBFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.testb");
- }
-
- private static TokenNameFinderModel train(File trainFile, LANGUAGE lang,
+ private TokenNameFinderModel train(File trainFile, LANGUAGE lang,
TrainingParameters params, int types) throws IOException {
ObjectStream<NameSample> samples = new Conll02NameSampleStream(
@@ -78,7 +71,7 @@ public class Conll02NameFinderEval {
params, new TokenNameFinderFactory());
}
- private static void eval(TokenNameFinderModel model, File testData, LANGUAGE
lang,
+ private void eval(TokenNameFinderModel model, File testData, LANGUAGE lang,
int types, double expectedFMeasure) throws IOException {
ObjectStream<NameSample> samples = new Conll02NameSampleStream(
@@ -89,10 +82,48 @@ public class Conll02NameFinderEval {
Assert.assertEquals(expectedFMeasure,
evaluator.getFMeasure().getFMeasure(), 0.0001);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ dutchTrainingFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.train");
+ dutchTestAFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.testa");
+ dutchTestBFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.testb");
+ spanishTrainingFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.train");
+ spanishTestAFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.testa");
+ spanishTestBFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.testb");
+
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTrainingFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("109687424525847313767541246922170457976"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestAFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("12942966701628852910737840182656846323"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestBFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("223206987942490952427646331013509976957"));
+
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTrainingFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("226089384066775461905386060946810714487"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestAFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("313879596837181728494732341737647284762"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestBFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("24037715705115461166858183817622459974"));
+
+ }
@Test
public void evalDutchPersonPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -120,7 +151,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchPersonMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -134,7 +165,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchOrganizationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -162,7 +193,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchOrganizationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -176,7 +207,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchLocationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -204,7 +235,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchLocationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -218,7 +249,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchMiscPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -246,7 +277,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchMiscMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -260,7 +291,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchCombinedPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -294,7 +325,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchCombinedMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -311,7 +342,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishPersonPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -340,7 +371,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishPersonMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -354,7 +385,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishOrganizationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -382,7 +413,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishOrganizationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -396,7 +427,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishLocationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -424,7 +455,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishLocationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -438,7 +469,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishMiscPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -466,7 +497,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishMiscMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -480,7 +511,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishCombinedPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -514,7 +545,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishCombinedMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
index e546beb..ed24cf3 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -55,9 +57,9 @@ import opennlp.tools.util.model.ModelUtil;
* one package for each language, and an extra package containing the tests
for all
* languages.
*/
-public class ConllXPosTaggerEval {
+public class ConllXPosTaggerEval extends AbstractEvalTest {
- private static POSModel train(File trainFile, String lang,
+ private POSModel train(File trainFile, String lang,
TrainingParameters params) throws IOException {
ObjectStream<POSSample> samples =
@@ -66,7 +68,7 @@ public class ConllXPosTaggerEval {
return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
}
- private static void eval(POSModel model, File testData,
+ private void eval(POSModel model, File testData,
double expectedAccuracy) throws IOException {
ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
@@ -78,25 +80,65 @@ public class ConllXPosTaggerEval {
Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), 0.0001);
}
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/danish/ddt/train/danish_ddt_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("30795670444498617202001550516753630016"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/danish/ddt/test/danish_ddt_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("314104267846430512372780024568104131337"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/dutch/alpino/train/dutch_alpino_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("109328245573060521952850454797286933887"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/dutch/alpino/test/dutch_alpino_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("132343141132816640849897155456916243039"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("9504382474772307801979515927230835901"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("175256039869578311901318972681191182910"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("128378790384268106811747599235147991544"));
+
+ }
+
@Test
public void evalDanishMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/train/danish_ddt_train.conll"), "dan", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/test/danish_ddt_test.conll"),
0.9504442925495558d);
}
@Test
public void evalDanishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/train/danish_ddt_train.conll"), "dan", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/test/danish_ddt_test.conll"),
0.9564251537935748d);
}
@@ -104,22 +146,22 @@ public class ConllXPosTaggerEval {
public void evalDutchMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/train/dutch_alpino_train.conll"), "nld",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/test/dutch_alpino_test.conll"),
0.9213965980304387d);
}
@Test
@Category(HighMemoryUsage.class)
public void evalDutchMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/train/dutch_alpino_train.conll"), "nld",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/test/dutch_alpino_test.conll"),
0.9282005371530886d);
}
@@ -127,21 +169,21 @@ public class ConllXPosTaggerEval {
public void evalPortugueseMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll"), "por",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll"),
0.9671041418101244d);
}
@Test
public void evalPortugueseMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll"), "por",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll"),
0.9662519175046872d);
}
@@ -149,21 +191,21 @@ public class ConllXPosTaggerEval {
public void evalSwedishMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll"),
"swe", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll"),
0.9248585572842999d);
}
@Test
public void evalSwedishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll"),
"swe", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll"),
0.9347595473833098d);
}
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
deleted file mode 100644
index 2b04afb..0000000
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.eval;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.math.BigInteger;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-
-import org.junit.Assert;
-
-import opennlp.tools.ml.maxent.quasinewton.QNTrainer;
-import opennlp.tools.ml.naivebayes.NaiveBayesTrainer;
-import opennlp.tools.ml.perceptron.PerceptronTrainer;
-import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelUtil;
-
-public class EvalUtil {
-
- static final double ACCURACY_DELTA = 0.0001d;
-
- static TrainingParameters createPerceptronParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- PerceptronTrainer.PERCEPTRON_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 0);
- return params;
- }
-
- static TrainingParameters createMaxentQnParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- QNTrainer.MAXENT_QN_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 0);
- return params;
- }
-
- static TrainingParameters createNaiveBayesParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- NaiveBayesTrainer.NAIVE_BAYES_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 5);
- return params;
- }
-
- public static File getOpennlpDataDir() {
- return new File(System.getProperty("OPENNLP_DATA_DIR"));
- }
-
- static MessageDigest createDigest() {
- try {
- return MessageDigest.getInstance("MD5");
- } catch (NoSuchAlgorithmException e) {
- throw new IllegalStateException(e);
- }
- }
-
- static void verifyFileChecksum(Path file, BigInteger checksum) throws
IOException {
- MessageDigest digest = createDigest();
-
- try (InputStream in = Files.newInputStream(file)) {
- byte[] buf = new byte[65536];
- int len;
- while ((len = in.read(buf)) > 0) {
- digest.update(buf, 0, len);
- }
- }
-
- Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
- }
-}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index af217f8..1ce225a 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -28,7 +28,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
-import java.security.MessageDigest;
import java.util.Map;
import org.junit.Assert;
@@ -47,11 +46,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4NameFinderEval {
+public class OntoNotes4NameFinderEval extends AbstractEvalTest {
private static ObjectStream<NameSample> createNameSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".name");
@@ -64,7 +63,7 @@ public class OntoNotes4NameFinderEval {
documentStream, StandardCharsets.UTF_8));
}
- private static void crossEval(TrainingParameters params, String type, double
expectedScore)
+ private void crossEval(TrainingParameters params, String type, double
expectedScore)
throws IOException {
try (ObjectStream<NameSample> samples = createNameSampleStream()) {
@@ -86,18 +85,9 @@ public class OntoNotes4NameFinderEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<NameSample> samples = createNameSampleStream()) {
- NameSample sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("168206908604555450993491898907821588182"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyDirectoryChecksum(new File(getOpennlpDataDir(),
"ontonotes4/data/files/data/english").toPath(),
+ ".name", new BigInteger("74675117716526375898817028829433420680"));
}
@Test
@@ -141,7 +131,7 @@ public class OntoNotes4NameFinderEval {
// create a temp resource folder and copy the pos model there
Path resourcesPath = Files.createTempDirectory("opennlp_resources");
- Files.copy(new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin").toPath(),
+ Files.copy(new File(getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin").toPath(),
new File(resourcesPath.toFile(), "en-pos-perceptron.bin").toPath(),
StandardCopyOption.REPLACE_EXISTING);
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index bf6a508..31c7ff7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -23,7 +23,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
-import java.security.MessageDigest;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -42,11 +41,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4ParserEval {
+public class OntoNotes4ParserEval extends AbstractEvalTest {
private static ObjectStream<Parse> createParseSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".parse");
@@ -60,7 +59,7 @@ public class OntoNotes4ParserEval {
documentStream, StandardCharsets.UTF_8)));
}
- private static void crossEval(TrainingParameters params, HeadRules rules,
double expectedScore)
+ private void crossEval(TrainingParameters params, HeadRules rules, double
expectedScore)
throws IOException {
try (ObjectStream<Parse> samples = createParseSampleStream()) {
ParserCrossValidator cv = new ParserCrossValidator("eng", params, rules,
ParserType.CHUNKING);
@@ -71,18 +70,8 @@ public class OntoNotes4ParserEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<Parse> samples = createParseSampleStream()) {
- Parse sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("83833369887442127665956850482411800415"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(createParseSampleStream(), new
BigInteger("83833369887442127665956850482411800415"));
}
@Test
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index b3939e0..a373192 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -20,8 +20,8 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
+
import java.nio.charset.StandardCharsets;
-import java.security.MessageDigest;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -39,11 +39,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4PosTaggerEval {
+public class OntoNotes4PosTaggerEval extends AbstractEvalTest {
private static ObjectStream<POSSample> createPOSSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".parse");
@@ -57,7 +57,7 @@ public class OntoNotes4PosTaggerEval {
new FileToStringSampleStream(documentStream,
StandardCharsets.UTF_8))));
}
- private static void crossEval(TrainingParameters params, double
expectedScore)
+ private void crossEval(TrainingParameters params, double expectedScore)
throws IOException {
try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
POSTaggerCrossValidator cv = new POSTaggerCrossValidator("eng", params,
new POSTaggerFactory());
@@ -68,19 +68,10 @@ public class OntoNotes4PosTaggerEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
- POSSample sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("300430765214895870888056958221353356972"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(createPOSSampleStream(), new
BigInteger("300430765214895870888056958221353356972"));
}
+
@Test
public void evalEnglishMaxentTagger() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index 89f4c5e..2c2fd72 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -53,7 +53,6 @@ import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
@@ -79,40 +78,30 @@ import opennlp.tools.util.Span;
* - models-sf/en-pos-perceptron.bin<br>
* - models-sf/en-parser-chunking.bin<br>
*/
-public class SourceForgeModelEval {
+public class SourceForgeModelEval extends AbstractEvalTest {
@BeforeClass
- public static void ensureTestDataIsCorrect() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
- "leipzig/eng_news_2010_300K-sentences.txt")),
StandardCharsets.UTF_8)) {
-
- String line;
- while ((line = lines.read()) != null) {
- digest.update(line.getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("248567841356936801447294643695012852392"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(new LeipzigDoccatSampleStream("eng", 25,
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt"))),
+ new BigInteger("248567841356936801447294643695012852392"));
}
@Test
- public void evalSentenceModel() throws IOException {
+ public void evalSentenceModel() throws Exception {
SentenceModel model = new SentenceModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
SentenceDetector sentenceDetector = new SentenceDetectorME(model);
StringBuilder text = new StringBuilder();
try (ObjectStream<DocumentSample> lineBatches = new
LeipzigDoccatSampleStream("eng", 25,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample lineBatch;
@@ -132,22 +121,22 @@ public class SourceForgeModelEval {
}
@Test
- public void evalTokenModel() throws IOException {
+ public void evalTokenModel() throws Exception {
// the input stream is currently tokenized, we should detokenize it again,
// (or extend to pass in tokenizer, then whitespace tokenizer can be
passed)
// and then tokenize it here
TokenizerModel model = new TokenizerModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-token.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
Tokenizer tokenizer = new TokenizerME(model);
try (ObjectStream<DocumentSample> lines = new
LeipzigDoccatSampleStream("eng", 1,
WhitespaceTokenizer.INSTANCE,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample line;
@@ -165,15 +154,15 @@ public class SourceForgeModelEval {
private ObjectStream<DocumentSample> createLineWiseStream() throws
IOException {
return new LeipzigDoccatSampleStream("eng", 1,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")));
}
private void evalNameFinder(TokenNameFinderModel model, BigInteger
expectedHash)
- throws IOException {
+ throws Exception {
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
TokenNameFinder nameFinder = new NameFinderME(model);
@@ -193,71 +182,71 @@ public class SourceForgeModelEval {
}
@Test
- public void evalNerDateModel() throws IOException {
+ public void evalNerDateModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
evalNameFinder(personModel, new
BigInteger("116570003910213570906062355532299200317"));
}
@Test
- public void evalNerLocationModel() throws IOException {
+ public void evalNerLocationModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-location.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-location.bin"));
evalNameFinder(personModel, new
BigInteger("44810593886021404716125849669208680993"));
}
@Test
- public void evalNerMoneyModel() throws IOException {
+ public void evalNerMoneyModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
evalNameFinder(personModel, new
BigInteger("65248897509365807977219790824670047287"));
}
@Test
- public void evalNerOrganizationModel() throws IOException {
+ public void evalNerOrganizationModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-organization.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));
evalNameFinder(personModel, new
BigInteger("50454559690338630659278005157657197233"));
}
@Test
- public void evalNerPercentageModel() throws IOException {
+ public void evalNerPercentageModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-percentage.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));
evalNameFinder(personModel, new
BigInteger("320996882594215344113023719117249515343"));
}
@Test
- public void evalNerPersonModel() throws IOException {
+ public void evalNerPersonModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-person.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-person.bin"));
evalNameFinder(personModel, new
BigInteger("143619582249937129618340838626447763744"));
}
@Test
- public void evalNerTimeModel() throws IOException {
+ public void evalNerTimeModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
evalNameFinder(personModel, new
BigInteger("282941772380683328816791801782579055940"));
}
@Test
- public void evalChunkerModel() throws IOException {
+ public void evalChunkerModel() throws Exception {
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
POSTagger tagger = new POSTaggerME(new POSModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin")));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));
Chunker chunker = new ChunkerME(new ChunkerModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")));
+ new File(getOpennlpDataDir(), "models-sf/en-chunker.bin")));
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
@@ -276,12 +265,12 @@ public class SourceForgeModelEval {
new BigInteger(1, digest.digest()));
}
- private void evalPosModel(POSModel model, BigInteger expectedHash) throws
IOException {
+ private void evalPosModel(POSModel model, BigInteger expectedHash) throws
Exception {
// break the input stream into sentences
// The input stream is tokenized and can be processed here directly
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
POSTagger tagger = new POSTaggerME(model);
@@ -300,28 +289,28 @@ public class SourceForgeModelEval {
}
@Test
- public void evalMaxentModel() throws IOException {
+ public void evalMaxentModel() throws Exception {
POSModel maxentModel = new POSModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
evalPosModel(maxentModel, new
BigInteger("231995214522232523777090597594904492687"));
}
@Test
- public void evalPerceptronModel() throws IOException {
+ public void evalPerceptronModel() throws Exception {
POSModel perceptronModel = new POSModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));
evalPosModel(perceptronModel, new
BigInteger("209440430718727101220960491543652921728"));
}
@Test
- public void evalParserModel() throws IOException {
+ public void evalParserModel() throws Exception {
ParserModel model = new ParserModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-parser-chunking.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
Parser parser = ParserFactory.create(model);
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
index 7f8a17f..3ab1a7d 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
@@ -38,24 +38,24 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class UniversalDependency20Eval {
+public class UniversalDependency20Eval extends AbstractEvalTest {
- private static File SPA_ANCORA_TRAIN =
- new
File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
- private static File SPA_ANCORA_DEV =
- new
File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");
+ private static File SPA_ANCORA_TRAIN;
+ private static File SPA_ANCORA_DEV;
@BeforeClass
- public static void ensureTestDataIsCorrect() throws IOException {
- SourceForgeModelEval.ensureTestDataIsCorrect();
+ public static void verifyTrainingData() throws Exception {
- EvalUtil.verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
+ SPA_ANCORA_TRAIN = new
File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
+ SPA_ANCORA_DEV = new
File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");
+
+ verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
new BigInteger("224942804200733453179524127037951530195"));
- EvalUtil.verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
+ verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
new BigInteger("280996187464384493180190898172297941708"));
}
- private static double trainAndEval(String lang, File trainFile,
TrainingParameters params,
+ private double trainAndEval(String lang, File trainFile, TrainingParameters
params,
File evalFile) throws IOException {
ConlluTagset tagset = ConlluTagset.X;
@@ -79,6 +79,6 @@ public class UniversalDependency20Eval {
double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
params, SPA_ANCORA_DEV);
- Assert.assertEquals(0.9057341692068787d, wordAccuracy,
EvalUtil.ACCURACY_DELTA);
+ Assert.assertEquals(0.9057341692068787d, wordAccuracy, ACCURACY_DELTA);
}
}
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.