This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new e7ce3bf OPENNLP-1118: Adds data verification for eval tests.
e7ce3bf is described below
commit e7ce3bf49bbdc7b81ba229736c9017bbc57029bf
Author: jzonthemtn <[email protected]>
AuthorDate: Wed Jul 19 11:33:20 2017 -0400
OPENNLP-1118: Adds data verification for eval tests.
---
.gitignore | 1 +
.../java/opennlp/tools/eval/AbstractEvalTest.java | 140 +++++++++++++++++++++
.../opennlp/tools/eval/ArvoresDeitadasEval.java | 46 ++++---
.../opennlp/tools/eval/Conll00ChunkerEval.java | 48 ++++---
.../opennlp/tools/eval/Conll02NameFinderEval.java | 111 ++++++++++------
.../opennlp/tools/eval/ConllXPosTaggerEval.java | 88 +++++++++----
.../src/test/java/opennlp/tools/eval/EvalUtil.java | 90 -------------
.../tools/eval/OntoNotes4NameFinderEval.java | 24 ++--
.../opennlp/tools/eval/OntoNotes4ParserEval.java | 21 +---
.../tools/eval/OntoNotes4PosTaggerEval.java | 23 ++--
.../opennlp/tools/eval/SourceForgeModelEval.java | 99 +++++++--------
.../tools/eval/UniversalDependency20Eval.java | 22 ++--
12 files changed, 412 insertions(+), 301 deletions(-)
diff --git a/.gitignore b/.gitignore
index 5d44bbd..81ef51f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ target
nbactions.xml
nb-configuration.xml
*.DS_Store
+.checkstyle
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
new file mode 100644
index 0000000..5f8865b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/AbstractEvalTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.junit.Assert;
+
+import opennlp.tools.ml.maxent.quasinewton.QNTrainer;
+import opennlp.tools.ml.naivebayes.NaiveBayesTrainer;
+import opennlp.tools.ml.perceptron.PerceptronTrainer;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+public abstract class AbstractEvalTest {
+
+ public static final double ACCURACY_DELTA = 0.0001d;
+ public static final String HASH_ALGORITHM = "MD5";
+
+ public static void verifyTrainingData(ObjectStream<?> samples, BigInteger
checksum) throws Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ Object sample;
+ while ((sample = samples.read()) != null) {
+ digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+ }
+
+ samples.close();
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+
+ }
+
+ public static void verifyFileChecksum(Path file, BigInteger checksum) throws
Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ try (InputStream in = Files.newInputStream(file)) {
+ byte[] buf = new byte[65536];
+ int len;
+ while ((len = in.read(buf)) > 0) {
+ digest.update(buf, 0, len);
+ }
+ }
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+ }
+
+ public static void verifyDirectoryChecksum(Path path, String extension,
BigInteger checksum)
+ throws Exception {
+
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
+
+ final List<Path> paths = Files.walk(path)
+ .filter(Files::isRegularFile)
+ .filter(p -> p.toString().endsWith(extension))
+ .collect(Collectors.toList());
+
+ // Ensure the paths are in a consistent order when
+ // verifying the file checksums.
+ Collections.sort(paths);
+
+ for (Path p : paths) {
+ try (InputStream in = Files.newInputStream(p)) {
+ byte[] buf = new byte[65536];
+ int len;
+ while ((len = in.read(buf)) > 0) {
+ digest.update(buf, 0, len);
+ }
+ }
+ }
+
+ Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+ }
+
+ public static File getOpennlpDataDir() throws FileNotFoundException {
+ final String dataDirectory = System.getProperty("OPENNLP_DATA_DIR");
+ if (StringUtil.isEmpty(dataDirectory)) {
+ throw new IllegalArgumentException("The OPENNLP_DATA_DIR is not set.");
+ }
+ final File file = new File(System.getProperty("OPENNLP_DATA_DIR"));
+ if (!file.exists()) {
+ throw new FileNotFoundException("The OPENNLP_DATA_DIR path of " +
dataDirectory + " was not found.");
+ }
+ return file;
+ }
+
+ public TrainingParameters createPerceptronParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ PerceptronTrainer.PERCEPTRON_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 0);
+ return params;
+ }
+
+ public TrainingParameters createMaxentQnParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ QNTrainer.MAXENT_QN_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 0);
+ return params;
+ }
+
+ public TrainingParameters createNaiveBayesParams() {
+ TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+ params.put(TrainingParameters.ALGORITHM_PARAM,
+ NaiveBayesTrainer.NAIVE_BAYES_VALUE);
+ params.put(TrainingParameters.CUTOFF_PARAM, 5);
+ return params;
+ }
+
+}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
index 6ee3eb0..cd34046 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.chunker.ChunkerCrossValidator;
@@ -61,7 +63,7 @@ import opennlp.tools.util.model.ModelUtil;
* Bosque_CF_8.0.ad.txt.gz </a></li>
* </ul>
*/
-public class ArvoresDeitadasEval {
+public class ArvoresDeitadasEval extends AbstractEvalTest {
private static final String BOSQUE = "ad/Bosque_CF_8.0.ad.txt";
private static final String FLORESTA_VIRGEM =
"ad/FlorestaVirgem_CF_3.0_ad.txt";
@@ -71,10 +73,21 @@ public class ArvoresDeitadasEval {
private static ObjectStream<String> getLineSample(String corpus)
throws IOException {
return new PlainTextByLineStream(new MarkableFileInputStreamFactory(
- new File(EvalUtil.getOpennlpDataDir(), corpus)),
StandardCharsets.ISO_8859_1);
+ new File(getOpennlpDataDir(), corpus)), StandardCharsets.ISO_8859_1);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
- private static void sentenceCrossEval(TrainingParameters params,
+ verifyTrainingData(new ADSentenceSampleStream(getLineSample(BOSQUE),
false),
+ new BigInteger("140568367548727787313497336739085858596"));
+
+ verifyTrainingData(new
ADSentenceSampleStream(getLineSample(FLORESTA_VIRGEM), false),
+ new BigInteger("2614161133949079191933514776652602918"));
+
+ }
+
+ private void sentenceCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ADSentenceSampleStream samples = new ADSentenceSampleStream(
@@ -90,7 +103,7 @@ public class ArvoresDeitadasEval {
Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(),
0.0001d);
}
- private static void tokenizerCrossEval(TrainingParameters params,
+ private void tokenizerCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ObjectStream<NameSample> nameSamples = new ADNameSampleStream(
@@ -116,7 +129,7 @@ public class ArvoresDeitadasEval {
0.0001d);
}
- private static void chunkerCrossEval(TrainingParameters params,
+ private void chunkerCrossEval(TrainingParameters params,
double expectedScore) throws
IOException {
ADChunkSampleStream samples = new
ADChunkSampleStream(getLineSample(BOSQUE));
@@ -130,7 +143,7 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseSentenceDetectorPerceptron() throws IOException {
- sentenceCrossEval(EvalUtil.createPerceptronParams(), 0.9892778840089301d);
+ sentenceCrossEval(createPerceptronParams(), 0.9892778840089301d);
}
@Test
@@ -140,17 +153,17 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseSentenceDetectorMaxentQn() throws IOException {
- sentenceCrossEval(EvalUtil.createMaxentQnParams(), 0.99261110833375d);
+ sentenceCrossEval(createMaxentQnParams(), 0.99261110833375d);
}
@Test
public void evalPortugueseSentenceDetectorNaiveBayes() throws IOException {
- sentenceCrossEval(EvalUtil.createNaiveBayesParams(), 0.9672196206048099d);
+ sentenceCrossEval(createNaiveBayesParams(), 0.9672196206048099d);
}
@Test
public void evalPortugueseTokenizerPerceptron() throws IOException {
- tokenizerCrossEval(EvalUtil.createPerceptronParams(), 0.9994887308380267d);
+ tokenizerCrossEval(createPerceptronParams(), 0.9994887308380267d);
}
@Test
@@ -160,23 +173,24 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseTokenizerMaxentQn() throws IOException {
- tokenizerCrossEval(EvalUtil.createMaxentQnParams(), 0.9996017148748251d);
+ tokenizerCrossEval(createMaxentQnParams(), 0.9996017148748251d);
}
@Test
public void evalPortugueseTokenizerNaiveBayes() throws IOException {
- tokenizerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9962358244502717d);
+ tokenizerCrossEval(createNaiveBayesParams(), 0.9962358244502717d);
}
+
@Test
public void evalPortugueseTokenizerMaxentQnMultipleThreads() throws
IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
tokenizerCrossEval(params, 0.9996017148748251d);
}
@Test
public void evalPortugueseChunkerPerceptron() throws IOException {
- chunkerCrossEval(EvalUtil.createPerceptronParams(),
+ chunkerCrossEval(createPerceptronParams(),
0.9638122825015589d);
}
@@ -195,13 +209,13 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseChunkerQn() throws IOException {
- chunkerCrossEval(EvalUtil.createMaxentQnParams(),
+ chunkerCrossEval(createMaxentQnParams(),
0.9648211936491359d);
}
@Test
public void evalPortugueseChunkerQnMultipleThreads() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
// NOTE: Should be the same as without multiple threads!!!
@@ -210,6 +224,6 @@ public class ArvoresDeitadasEval {
@Test
public void evalPortugueseChunkerNaiveBayes() throws IOException {
- chunkerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9041507736043933d);
+ chunkerCrossEval(createNaiveBayesParams(), 0.9041507736043933d);
}
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index f4712d9..80a0a74 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -45,8 +47,11 @@ import opennlp.tools.util.model.ModelUtil;
* <a href="http://www.cnts.ua.ac.be/conll2000/chunking/"> site </a>
* and decompress them into this directory: $OPENNLP_DATA_DIR/conll00.
*/
-public class Conll00ChunkerEval {
+public class Conll00ChunkerEval extends AbstractEvalTest {
+ private static File TEST_DATA_FILE;
+ private static File TRAIN_DATA_FILE;
+
private static ChunkerModel train(File trainFile, TrainingParameters params)
throws IOException {
@@ -68,38 +73,47 @@ public class Conll00ChunkerEval {
Assert.assertEquals(expectedFMeasure,
evaluator.getFMeasure().getFMeasure(), 0.0001);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ TEST_DATA_FILE = new File(getOpennlpDataDir(), "conll00/test.txt");
+ TRAIN_DATA_FILE = new File(getOpennlpDataDir(), "conll00/train.txt");
+
+ verifyTrainingData(new ChunkSampleStream(
+ new PlainTextByLineStream(new
MarkableFileInputStreamFactory(TEST_DATA_FILE),
+ StandardCharsets.UTF_8)),
+ new BigInteger("84610235226433393380477662908529306002"));
+
+ verifyTrainingData(new ChunkSampleStream(
+ new PlainTextByLineStream(new
MarkableFileInputStreamFactory(TEST_DATA_FILE),
+ StandardCharsets.UTF_8)),
+ new BigInteger("84610235226433393380477662908529306002"));
+
+ }
@Test
public void evalEnglishPerceptron() throws IOException {
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), EvalUtil.createPerceptronParams());
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE,
createPerceptronParams());
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9295018353434714d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9295018353434714d);
}
@Test
public void evalEnglishMaxentGis() throws IOException {
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), ModelUtil.createDefaultTrainingParameters());
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE,
ModelUtil.createDefaultTrainingParameters());
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9239687473746113d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9239687473746113d);
}
// Note: Don't try to run this on your MacBook
@Test
@Category(HighMemoryUsage.class)
public void evalEnglishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
params.put("Threads", 4);
- ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
- "conll00/train.txt"), params);
+ ChunkerModel maxentModel = train(TRAIN_DATA_FILE, params);
- eval(maxentModel,
- new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"),
- 0.9302599230947028d);
+ eval(maxentModel, TEST_DATA_FILE, 0.9302599230947028d);
}
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
index c233686..fccc8e9 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll02NameFinderEval.java
@@ -20,7 +20,10 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
+
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.Conll02NameSampleStream;
@@ -48,27 +51,17 @@ import opennlp.tools.util.model.ModelUtil;
* - Double check the encoding which is used to open the files. Currently that
is UTF-8.
* - Make the Conll02 reader compatible. Currently it doesn't work with
spanish data without pos tags.
*/
-public class Conll02NameFinderEval {
-
- private final File dutchTrainingFile;
- private final File dutchTestAFile;
- private final File dutchTestBFile;
+public class Conll02NameFinderEval extends AbstractEvalTest {
- private final File spanishTrainingFile;
- private final File spanishTestAFile;
- private final File spanishTestBFile;
+ private static File dutchTrainingFile;
+ private static File dutchTestAFile;
+ private static File dutchTestBFile;
+ private static File spanishTrainingFile;
+ private static File spanishTestAFile;
+ private static File spanishTestBFile;
+
- public Conll02NameFinderEval() {
- dutchTrainingFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.train");
- dutchTestAFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.testa");
- dutchTestBFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/ned.testb");
-
- spanishTrainingFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.train");
- spanishTestAFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.testa");
- spanishTestBFile = new File(EvalUtil.getOpennlpDataDir(),
"conll02/ner/data/esp.testb");
- }
-
- private static TokenNameFinderModel train(File trainFile, LANGUAGE lang,
+ private TokenNameFinderModel train(File trainFile, LANGUAGE lang,
TrainingParameters params, int types) throws IOException {
ObjectStream<NameSample> samples = new Conll02NameSampleStream(
@@ -78,7 +71,7 @@ public class Conll02NameFinderEval {
params, new TokenNameFinderFactory());
}
- private static void eval(TokenNameFinderModel model, File testData, LANGUAGE
lang,
+ private void eval(TokenNameFinderModel model, File testData, LANGUAGE lang,
int types, double expectedFMeasure) throws IOException {
ObjectStream<NameSample> samples = new Conll02NameSampleStream(
@@ -89,10 +82,48 @@ public class Conll02NameFinderEval {
Assert.assertEquals(expectedFMeasure,
evaluator.getFMeasure().getFMeasure(), 0.0001);
}
+
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ dutchTrainingFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.train");
+ dutchTestAFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.testa");
+ dutchTestBFile = new File(getOpennlpDataDir(),
"conll02/ner/data/ned.testb");
+ spanishTrainingFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.train");
+ spanishTestAFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.testa");
+ spanishTestBFile = new File(getOpennlpDataDir(),
"conll02/ner/data/esp.testb");
+
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTrainingFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("109687424525847313767541246922170457976"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestAFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("12942966701628852910737840182656846323"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestBFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("223206987942490952427646331013509976957"));
+
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTrainingFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("226089384066775461905386060946810714487"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestAFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("313879596837181728494732341737647284762"));
+ verifyTrainingData(new Conll02NameSampleStream(
+ LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestBFile),
+ Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
+ new BigInteger("24037715705115461166858183817622459974"));
+
+ }
@Test
public void evalDutchPersonPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -120,7 +151,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchPersonMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -134,7 +165,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchOrganizationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -162,7 +193,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchOrganizationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -176,7 +207,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchLocationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -204,7 +235,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchLocationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -218,7 +249,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchMiscPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -246,7 +277,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchMiscMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(dutchTrainingFile, LANGUAGE.NLD,
params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -260,7 +291,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchCombinedPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -294,7 +325,7 @@ public class Conll02NameFinderEval {
@Test
public void evalDutchCombinedMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -311,7 +342,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishPersonPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -340,7 +371,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishPersonMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
@@ -354,7 +385,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishOrganizationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -382,7 +413,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishOrganizationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
@@ -396,7 +427,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishLocationPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -424,7 +455,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishLocationMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_LOCATION_ENTITIES);
@@ -438,7 +469,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishMiscPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -466,7 +497,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishMiscMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
TokenNameFinderModel maxentModel = train(spanishTrainingFile,
LANGUAGE.SPA, params,
Conll02NameSampleStream.GENERATE_MISC_ENTITIES);
@@ -480,7 +511,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishCombinedPerceptron() throws IOException {
- TrainingParameters params = EvalUtil.createPerceptronParams();
+ TrainingParameters params = createPerceptronParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
@@ -514,7 +545,7 @@ public class Conll02NameFinderEval {
@Test
public void evalSpanishCombinedMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
int combinedType = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES
| Conll02NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
index e546beb..ed24cf3 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
@@ -19,9 +19,11 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
+import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -55,9 +57,9 @@ import opennlp.tools.util.model.ModelUtil;
* one package for each language, and an extra package containing the tests
for all
* languages.
*/
-public class ConllXPosTaggerEval {
+public class ConllXPosTaggerEval extends AbstractEvalTest {
- private static POSModel train(File trainFile, String lang,
+ private POSModel train(File trainFile, String lang,
TrainingParameters params) throws IOException {
ObjectStream<POSSample> samples =
@@ -66,7 +68,7 @@ public class ConllXPosTaggerEval {
return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
}
- private static void eval(POSModel model, File testData,
+ private void eval(POSModel model, File testData,
double expectedAccuracy) throws IOException {
ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
@@ -78,25 +80,65 @@ public class ConllXPosTaggerEval {
Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), 0.0001);
}
+ @BeforeClass
+ public static void verifyTrainingData() throws Exception {
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/danish/ddt/train/danish_ddt_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("30795670444498617202001550516753630016"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/danish/ddt/test/danish_ddt_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("314104267846430512372780024568104131337"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/dutch/alpino/train/dutch_alpino_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("109328245573060521952850454797286933887"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "conllx/data/dutch/alpino/test/dutch_alpino_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("132343141132816640849897155456916243039"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("9504382474772307801979515927230835901"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("175256039869578311901318972681191182910"));
+
+ verifyTrainingData(new ConllXPOSSampleStream(
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll")),
StandardCharsets.UTF_8),
+ new BigInteger("128378790384268106811747599235147991544"));
+
+ }
+
@Test
public void evalDanishMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/train/danish_ddt_train.conll"), "dan", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/test/danish_ddt_test.conll"),
0.9504442925495558d);
}
@Test
public void evalDanishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/train/danish_ddt_train.conll"), "dan", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/danish/ddt/test/danish_ddt_test.conll"),
0.9564251537935748d);
}
@@ -104,22 +146,22 @@ public class ConllXPosTaggerEval {
public void evalDutchMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/train/dutch_alpino_train.conll"), "nld",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/test/dutch_alpino_test.conll"),
0.9213965980304387d);
}
@Test
@Category(HighMemoryUsage.class)
public void evalDutchMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/train/dutch_alpino_train.conll"), "nld",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/dutch/alpino/test/dutch_alpino_test.conll"),
0.9282005371530886d);
}
@@ -127,21 +169,21 @@ public class ConllXPosTaggerEval {
public void evalPortugueseMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll"), "por",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll"),
0.9671041418101244d);
}
@Test
public void evalPortugueseMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll"), "por",
params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll"),
0.9662519175046872d);
}
@@ -149,21 +191,21 @@ public class ConllXPosTaggerEval {
public void evalSwedishMaxentGis() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll"),
"swe", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll"),
0.9248585572842999d);
}
@Test
public void evalSwedishMaxentQn() throws IOException {
- TrainingParameters params = EvalUtil.createMaxentQnParams();
+ TrainingParameters params = createMaxentQnParams();
- POSModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
+ POSModel maxentModel = train(new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll"),
"swe", params);
- eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(),
+ eval(maxentModel, new File(getOpennlpDataDir(),
"conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll"),
0.9347595473833098d);
}
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
deleted file mode 100644
index 2b04afb..0000000
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.eval;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.math.BigInteger;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-
-import org.junit.Assert;
-
-import opennlp.tools.ml.maxent.quasinewton.QNTrainer;
-import opennlp.tools.ml.naivebayes.NaiveBayesTrainer;
-import opennlp.tools.ml.perceptron.PerceptronTrainer;
-import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelUtil;
-
-public class EvalUtil {
-
- static final double ACCURACY_DELTA = 0.0001d;
-
- static TrainingParameters createPerceptronParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- PerceptronTrainer.PERCEPTRON_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 0);
- return params;
- }
-
- static TrainingParameters createMaxentQnParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- QNTrainer.MAXENT_QN_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 0);
- return params;
- }
-
- static TrainingParameters createNaiveBayesParams() {
- TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
- params.put(TrainingParameters.ALGORITHM_PARAM,
- NaiveBayesTrainer.NAIVE_BAYES_VALUE);
- params.put(TrainingParameters.CUTOFF_PARAM, 5);
- return params;
- }
-
- public static File getOpennlpDataDir() {
- return new File(System.getProperty("OPENNLP_DATA_DIR"));
- }
-
- static MessageDigest createDigest() {
- try {
- return MessageDigest.getInstance("MD5");
- } catch (NoSuchAlgorithmException e) {
- throw new IllegalStateException(e);
- }
- }
-
- static void verifyFileChecksum(Path file, BigInteger checksum) throws
IOException {
- MessageDigest digest = createDigest();
-
- try (InputStream in = Files.newInputStream(file)) {
- byte[] buf = new byte[65536];
- int len;
- while ((len = in.read(buf)) > 0) {
- digest.update(buf, 0, len);
- }
- }
-
- Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
- }
-}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index af217f8..1ce225a 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -28,7 +28,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
-import java.security.MessageDigest;
import java.util.Map;
import org.junit.Assert;
@@ -47,11 +46,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4NameFinderEval {
+public class OntoNotes4NameFinderEval extends AbstractEvalTest {
private static ObjectStream<NameSample> createNameSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".name");
@@ -64,7 +63,7 @@ public class OntoNotes4NameFinderEval {
documentStream, StandardCharsets.UTF_8));
}
- private static void crossEval(TrainingParameters params, String type, double
expectedScore)
+ private void crossEval(TrainingParameters params, String type, double
expectedScore)
throws IOException {
try (ObjectStream<NameSample> samples = createNameSampleStream()) {
@@ -86,18 +85,9 @@ public class OntoNotes4NameFinderEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<NameSample> samples = createNameSampleStream()) {
- NameSample sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("168206908604555450993491898907821588182"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyDirectoryChecksum(new File(getOpennlpDataDir(),
"ontonotes4/data/files/data/english").toPath(),
+ ".name", new BigInteger("74675117716526375898817028829433420680"));
}
@Test
@@ -141,7 +131,7 @@ public class OntoNotes4NameFinderEval {
// create a temp resource folder and copy the pos model there
Path resourcesPath = Files.createTempDirectory("opennlp_resources");
- Files.copy(new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin").toPath(),
+ Files.copy(new File(getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin").toPath(),
new File(resourcesPath.toFile(), "en-pos-perceptron.bin").toPath(),
StandardCopyOption.REPLACE_EXISTING);
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index bf6a508..31c7ff7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -23,7 +23,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
-import java.security.MessageDigest;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -42,11 +41,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4ParserEval {
+public class OntoNotes4ParserEval extends AbstractEvalTest {
private static ObjectStream<Parse> createParseSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".parse");
@@ -60,7 +59,7 @@ public class OntoNotes4ParserEval {
documentStream, StandardCharsets.UTF_8)));
}
- private static void crossEval(TrainingParameters params, HeadRules rules,
double expectedScore)
+ private void crossEval(TrainingParameters params, HeadRules rules, double
expectedScore)
throws IOException {
try (ObjectStream<Parse> samples = createParseSampleStream()) {
ParserCrossValidator cv = new ParserCrossValidator("eng", params, rules,
ParserType.CHUNKING);
@@ -71,18 +70,8 @@ public class OntoNotes4ParserEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<Parse> samples = createParseSampleStream()) {
- Parse sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("83833369887442127665956850482411800415"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(createParseSampleStream(), new
BigInteger("83833369887442127665956850482411800415"));
}
@Test
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index b3939e0..a373192 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -20,8 +20,8 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
+
import java.nio.charset.StandardCharsets;
-import java.security.MessageDigest;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -39,11 +39,11 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class OntoNotes4PosTaggerEval {
+public class OntoNotes4PosTaggerEval extends AbstractEvalTest {
private static ObjectStream<POSSample> createPOSSampleStream() throws
IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
- EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
+ getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
if (file.isFile()) {
return file.getName().endsWith(".parse");
@@ -57,7 +57,7 @@ public class OntoNotes4PosTaggerEval {
new FileToStringSampleStream(documentStream,
StandardCharsets.UTF_8))));
}
- private static void crossEval(TrainingParameters params, double
expectedScore)
+ private void crossEval(TrainingParameters params, double expectedScore)
throws IOException {
try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
POSTaggerCrossValidator cv = new POSTaggerCrossValidator("eng", params,
new POSTaggerFactory());
@@ -68,19 +68,10 @@ public class OntoNotes4PosTaggerEval {
}
@BeforeClass
- public static void verifyTrainingData() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
- POSSample sample;
- while ((sample = samples.read()) != null) {
- digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("300430765214895870888056958221353356972"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(createPOSSampleStream(), new
BigInteger("300430765214895870888056958221353356972"));
}
+
@Test
public void evalEnglishMaxentTagger() throws IOException {
TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index 89f4c5e..2c2fd72 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -53,7 +53,6 @@ import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
@@ -79,40 +78,30 @@ import opennlp.tools.util.Span;
* - models-sf/en-pos-perceptron.bin<br>
* - models-sf/en-parser-chunking.bin<br>
*/
-public class SourceForgeModelEval {
+public class SourceForgeModelEval extends AbstractEvalTest {
@BeforeClass
- public static void ensureTestDataIsCorrect() throws IOException {
- MessageDigest digest = EvalUtil.createDigest();
-
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
- "leipzig/eng_news_2010_300K-sentences.txt")),
StandardCharsets.UTF_8)) {
-
- String line;
- while ((line = lines.read()) != null) {
- digest.update(line.getBytes(StandardCharsets.UTF_8));
- }
-
- Assert.assertEquals(new
BigInteger("248567841356936801447294643695012852392"),
- new BigInteger(1, digest.digest()));
- }
+ public static void verifyTrainingData() throws Exception {
+ verifyTrainingData(new LeipzigDoccatSampleStream("eng", 25,
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt"))),
+ new BigInteger("248567841356936801447294643695012852392"));
}
@Test
- public void evalSentenceModel() throws IOException {
+ public void evalSentenceModel() throws Exception {
SentenceModel model = new SentenceModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
SentenceDetector sentenceDetector = new SentenceDetectorME(model);
StringBuilder text = new StringBuilder();
try (ObjectStream<DocumentSample> lineBatches = new
LeipzigDoccatSampleStream("eng", 25,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample lineBatch;
@@ -132,22 +121,22 @@ public class SourceForgeModelEval {
}
@Test
- public void evalTokenModel() throws IOException {
+ public void evalTokenModel() throws Exception {
// the input stream is currently tokenized, we should detokenize it again,
// (or extend to pass in tokenizer, then whitespace tokenizer can be
passed)
// and then tokenize it here
TokenizerModel model = new TokenizerModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-token.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
Tokenizer tokenizer = new TokenizerME(model);
try (ObjectStream<DocumentSample> lines = new
LeipzigDoccatSampleStream("eng", 1,
WhitespaceTokenizer.INSTANCE,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")))) {
DocumentSample line;
@@ -165,15 +154,15 @@ public class SourceForgeModelEval {
private ObjectStream<DocumentSample> createLineWiseStream() throws
IOException {
return new LeipzigDoccatSampleStream("eng", 1,
- new MarkableFileInputStreamFactory(new
File(EvalUtil.getOpennlpDataDir(),
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")));
}
private void evalNameFinder(TokenNameFinderModel model, BigInteger
expectedHash)
- throws IOException {
+ throws Exception {
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
TokenNameFinder nameFinder = new NameFinderME(model);
@@ -193,71 +182,71 @@ public class SourceForgeModelEval {
}
@Test
- public void evalNerDateModel() throws IOException {
+ public void evalNerDateModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
evalNameFinder(personModel, new
BigInteger("116570003910213570906062355532299200317"));
}
@Test
- public void evalNerLocationModel() throws IOException {
+ public void evalNerLocationModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-location.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-location.bin"));
evalNameFinder(personModel, new
BigInteger("44810593886021404716125849669208680993"));
}
@Test
- public void evalNerMoneyModel() throws IOException {
+ public void evalNerMoneyModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
evalNameFinder(personModel, new
BigInteger("65248897509365807977219790824670047287"));
}
@Test
- public void evalNerOrganizationModel() throws IOException {
+ public void evalNerOrganizationModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-organization.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));
evalNameFinder(personModel, new
BigInteger("50454559690338630659278005157657197233"));
}
@Test
- public void evalNerPercentageModel() throws IOException {
+ public void evalNerPercentageModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-ner-percentage.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));
evalNameFinder(personModel, new
BigInteger("320996882594215344113023719117249515343"));
}
@Test
- public void evalNerPersonModel() throws IOException {
+ public void evalNerPersonModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-person.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-person.bin"));
evalNameFinder(personModel, new
BigInteger("143619582249937129618340838626447763744"));
}
@Test
- public void evalNerTimeModel() throws IOException {
+ public void evalNerTimeModel() throws Exception {
TokenNameFinderModel personModel = new TokenNameFinderModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
evalNameFinder(personModel, new
BigInteger("282941772380683328816791801782579055940"));
}
@Test
- public void evalChunkerModel() throws IOException {
+ public void evalChunkerModel() throws Exception {
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
POSTagger tagger = new POSTaggerME(new POSModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin")));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));
Chunker chunker = new ChunkerME(new ChunkerModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")));
+ new File(getOpennlpDataDir(), "models-sf/en-chunker.bin")));
try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
@@ -276,12 +265,12 @@ public class SourceForgeModelEval {
new BigInteger(1, digest.digest()));
}
- private void evalPosModel(POSModel model, BigInteger expectedHash) throws
IOException {
+ private void evalPosModel(POSModel model, BigInteger expectedHash) throws
Exception {
// break the input stream into sentences
// The input stream is tokenized and can be processed here directly
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
POSTagger tagger = new POSTaggerME(model);
@@ -300,28 +289,28 @@ public class SourceForgeModelEval {
}
@Test
- public void evalMaxentModel() throws IOException {
+ public void evalMaxentModel() throws Exception {
POSModel maxentModel = new POSModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
evalPosModel(maxentModel, new
BigInteger("231995214522232523777090597594904492687"));
}
@Test
- public void evalPerceptronModel() throws IOException {
+ public void evalPerceptronModel() throws Exception {
POSModel perceptronModel = new POSModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-pos-perceptron.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));
evalPosModel(perceptronModel, new
BigInteger("209440430718727101220960491543652921728"));
}
@Test
- public void evalParserModel() throws IOException {
+ public void evalParserModel() throws Exception {
ParserModel model = new ParserModel(
- new File(EvalUtil.getOpennlpDataDir(),
"models-sf/en-parser-chunking.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
- MessageDigest digest = EvalUtil.createDigest();
+ MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
Parser parser = ParserFactory.create(model);
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
index 7f8a17f..3ab1a7d 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
@@ -38,24 +38,24 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
-public class UniversalDependency20Eval {
+public class UniversalDependency20Eval extends AbstractEvalTest {
- private static File SPA_ANCORA_TRAIN =
- new
File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
- private static File SPA_ANCORA_DEV =
- new
File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");
+ private static File SPA_ANCORA_TRAIN;
+ private static File SPA_ANCORA_DEV;
@BeforeClass
- public static void ensureTestDataIsCorrect() throws IOException {
- SourceForgeModelEval.ensureTestDataIsCorrect();
+ public static void verifyTrainingData() throws Exception {
- EvalUtil.verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
+ SPA_ANCORA_TRAIN = new
File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
+ SPA_ANCORA_DEV = new
File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");
+
+ verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
new BigInteger("224942804200733453179524127037951530195"));
- EvalUtil.verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
+ verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
new BigInteger("280996187464384493180190898172297941708"));
}
- private static double trainAndEval(String lang, File trainFile,
TrainingParameters params,
+ private double trainAndEval(String lang, File trainFile, TrainingParameters
params,
File evalFile) throws IOException {
ConlluTagset tagset = ConlluTagset.X;
@@ -79,6 +79,6 @@ public class UniversalDependency20Eval {
double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
params, SPA_ANCORA_DEV);
- Assert.assertEquals(0.9057341692068787d, wordAccuracy,
EvalUtil.ACCURACY_DELTA);
+ Assert.assertEquals(0.9057341692068787d, wordAccuracy, ACCURACY_DELTA);
}
}
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.