This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 5eec98c7 OPENNLP-1731: Add Junits for NGramLanguageModelTool (#778)
5eec98c7 is described below
commit 5eec98c7ba0258cbc27d7d8bfb6c0964920683a3
Author: NishantShri4 <[email protected]>
AuthorDate: Thu May 15 09:25:06 2025 +0100
OPENNLP-1731: Add Junits for NGramLanguageModelTool (#778)
* OPENNLP-1731: Add Junits for NGramLanguageModelTool
* OPENNLP-1731: AbstractLoggerTest : Corrected a javadoc comment.
* OPENNLP-1731: Fixed a Generic RawType warning.
* OPENNLP-1731: Rebased against upstream.
* OPENNLP-1731: Rebased against upstream (removed extra new line).
* OPENNLP-1731: Removed an extra newline.
---
.../java/opennlp/tools/AbstractLoggerTest.java | 57 +++++++
.../languagemodel/NGramLanguageModelToolTest.java | 164 +++++++++++++++++++++
.../DefaultTrainingProgressMonitorTest.java | 130 ++++++++--------
.../languagemodel/origin_of_text_samples.txt | 4 +
.../cmdline/languagemodel/sentences_set_1.txt | 10 ++
.../cmdline/languagemodel/sentences_set_2.txt | 11 ++
.../opennlp/tools/languagemodel/sentences.txt | 2 +-
rat-excludes | 2 +
8 files changed, 310 insertions(+), 70 deletions(-)
diff --git a/opennlp-tools/src/test/java/opennlp/tools/AbstractLoggerTest.java b/opennlp-tools/src/test/java/opennlp/tools/AbstractLoggerTest.java
new file mode 100644
index 00000000..79c944c5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/AbstractLoggerTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools;
+
+import java.util.Objects;
+
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An abstract class to configure the {@link Logger} instance to help with unit-testing.
+ */
+public abstract class AbstractLoggerTest {
+
+ public static final String LOGGER_OPENNLP = "opennlp";
+
+ /**
+ * Prepare the logging resource.
+ * @param loggerName Name of the {@link Logger}.
+ */
+ public static void prepare(String loggerName) {
+ getLogger(loggerName).setLevel(Level.INFO);
+ }
+
+ /**
+ * Restores the logging resource to its default config.
+ * @param loggerName Name of the {@link Logger}.
+ */
+ public static void restore(String loggerName) {
+ getLogger(loggerName).setLevel(Level.OFF);
+ }
+
+ private static Logger getLogger(String loggerName) {
+ Logger logger = (Logger) LoggerFactory.getLogger(loggerName);
+ if (Objects.isNull(logger)) {
+ throw new IllegalArgumentException("A logger instance couldn't be created for the given logger "
+ + loggerName);
+ }
+ return logger;
+ }
+}
+
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelToolTest.java
new file mode 100644
index 00000000..b50fcf51
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/languagemodel/NGramLanguageModelToolTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.languagemodel;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Stream;
+
+import nl.altindag.log.LogCaptor;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.function.Executable;
+import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import opennlp.tools.AbstractLoggerTest;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.ngram.NGramGenerator;
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+
+class NGramLanguageModelToolTest extends AbstractLoggerTest {
+
+ private static final int NGRAM_MIN_LENGTH = 1;
+ private static final int NGRAM_MAX_LENGTH = 3;
+
+ @TempDir
+ private static File testDir;
+
+ private static final InputStream sysInputStream = System.in;
+
+ @BeforeAll
+ public static void prepare() {
+ prepare(LOGGER_OPENNLP);
+ }
+
+ @ParameterizedTest
+ @MethodSource("provideNgramDictionaryXML")
+ void testRunTool(String dataFileName, String[][] providedAndPredictedTokens) {
+
+ try (LogCaptor logCaptorNglmTool = LogCaptor.forClass(NGramLanguageModelTool.class);
+ LogCaptor logCaptorPerfMon = LogCaptor.forClass(PerformanceMonitor.class)) {
+
+ //Get test input-file.
+ File inputData = new File(testDir, String.format(dataFileName));
+
+ //Configure input stream to provide user-input.
+ StringBuilder userInput = new StringBuilder();
+ for (String[] item : providedAndPredictedTokens) {
+ userInput.append(item[0]).append("\n");
+ }
+ System.setIn(new ByteArrayInputStream(userInput.toString().getBytes()));
+
+ //Invoke the tool.
+ NGramLanguageModelTool nGramLMTool = new NGramLanguageModelTool();
+ nGramLMTool.run(new String[] {inputData.getPath()});
+
+ //Collect any log events generated via the tool.
+ List<String> actual = new LinkedList<>();
+ actual.addAll(logCaptorNglmTool.getInfoLogs());
+ actual.addAll(logCaptorPerfMon.getInfoLogs());
+
+
+ List<Executable> assertions = new LinkedList<>();
+
+ //Assert the expected and actual values of the predicted next token for equality.
+ for (String[] item : providedAndPredictedTokens) {
+ assertions.add(() -> Assertions.assertTrue(actual.stream()
+ .filter(l -> l.contains(String.join(", ", item[0].split(" "))))
+ .findFirst().orElseThrow(AssertionError::new).contains(item[1])));
+ }
+
+ //Assert completion stats.
+ assertions.add(() -> Assertions.assertEquals("Total: " + providedAndPredictedTokens.length + " nglm",
+ actual.stream().filter(l -> l.contains("Total")).findFirst().orElseThrow(AssertionError::new)));
+
+ Assertions.assertAll(assertions);
+ }
+ }
+
+ private static Stream<Arguments> provideNgramDictionaryXML() {
+
+ List<Arguments> arguments = new LinkedList<>();
+
+ List<Map<String, String[][]>> testFileNamesWithProvidedAndPredictedTokens =
+ List.of(Map.of("sentences_set_1",
+ new String[][] {{"data availability is by", "now"},
+ {"machine and deep learning", "algorithms"}}),
+ Map.of("sentences_set_2", new String[][] {{"lunar landing mission
was the", "first"}}));
+
+ for (Map<String, String[][]> testInput : testFileNamesWithProvidedAndPredictedTokens) {
+
+ NGramModel ngModel = new NGramModel();
+
+ for (Map.Entry<String, String[][]> entry : testInput.entrySet()) {
+
+ try (InputStream is = ngModel.getClass().getResourceAsStream(
+ String.format("/opennlp/tools/cmdline/languagemodel/%s.txt",
entry.getKey()));
+ BufferedReader reader = new BufferedReader(new
InputStreamReader(is, StandardCharsets.UTF_8));
+ FileOutputStream fos = new FileOutputStream(new File(testDir,
+ String.format("%s_ngram_dict.xml", entry.getKey())))) {
+
+ //Read the test data file line by line and generate ngrams based on NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
+ reader.lines()
+ .map(l -> NGramGenerator.generate(Arrays.asList(l.split(" ")), NGRAM_MAX_LENGTH, " "))
+ .flatMap(Collection::stream)
+ .forEach(t -> ngModel.add(new StringList(t.split(" ")), NGRAM_MIN_LENGTH, NGRAM_MAX_LENGTH));
+
+ //Write the ngram dictionary to a test file.
+ ngModel.serialize(fos);
+
+ //Create input arguments for the test method.
+ arguments.add(
+ Arguments.of(String.format("%s_ngram_dict.xml", entry.getKey()),
+ entry.getValue()));
+
+ } catch (IOException e) {
+ throw new TerminateToolException(-1,
+ "IO Error while creating test data files " + e.getMessage(), e);
+ }
+ }
+ }
+ return arguments.stream();
+ }
+
+ /**
+ * Restores testing resources to original configuration.
+ */
+ @AfterAll
+ public static void afterAll() {
+ restore(LOGGER_OPENNLP);
+ System.setIn(sysInputStream);
+ }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java b/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java
index 14fd5bd3..c534ea31 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java
@@ -20,109 +20,101 @@ package opennlp.tools.monitoring;
import java.util.List;
import java.util.Map;
-import ch.qos.logback.classic.Level;
-import ch.qos.logback.classic.Logger;
-import ch.qos.logback.classic.spi.ILoggingEvent;
-import ch.qos.logback.core.read.ListAppender;
+import nl.altindag.log.LogCaptor;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
-import org.slf4j.LoggerFactory;
+import opennlp.tools.AbstractLoggerTest;
import opennlp.tools.util.TrainingParameters;
-import static java.util.stream.Collectors.toList;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
-class DefaultTrainingProgressMonitorTest {
-
- private static final String LOGGER_NAME = "opennlp";
- private static final Logger logger = (Logger) LoggerFactory.getLogger(LOGGER_NAME);
- private static final Level originalLogLevel = logger != null ? logger.getLevel() : Level.OFF;
-
- private TrainingProgressMonitor progressMonitor;
- private final ListAppender<ILoggingEvent> appender = new ListAppender<>();
+class DefaultTrainingProgressMonitorTest extends AbstractLoggerTest {
+ private final TrainingProgressMonitor progressMonitor = new DefaultTrainingProgressMonitor();
@BeforeAll
- static void beforeAll() {
- logger.setLevel(Level.INFO);
- }
-
- @BeforeEach
- public void setup() {
- progressMonitor = new DefaultTrainingProgressMonitor();
- appender.list.clear();
- logger.addAppender(appender);
- appender.start();
+ public static void prepare() {
+ prepare(LOGGER_OPENNLP);
}
@Test
void testFinishedIteration() {
- progressMonitor.finishedIteration(1, 19830, 20801, TrainingMeasure.ACCURACY, 0.953319551944618);
- progressMonitor.finishedIteration(2, 19852, 20801, TrainingMeasure.ACCURACY, 0.9543771934041633);
- progressMonitor.display(true);
-
- //Assert that two logging events are captured for two iterations.
- List<String> actual = appender.list.stream().map(ILoggingEvent::getMessage).
- collect(toList());
- List<String> expected = List.of("1: (19830/20801) Training Accuracy : 0.953319551944618",
- "2: (19852/20801) Training Accuracy : 0.9543771934041633");
- assertArrayEquals(expected.toArray(), actual.toArray());
-
+ try (LogCaptor logCaptor = LogCaptor.forClass(DefaultTrainingProgressMonitor.class)) {
+ progressMonitor.finishedIteration(1, 19830, 20801, TrainingMeasure.ACCURACY, 0.953319551944618);
+ progressMonitor.finishedIteration(2, 19852, 20801, TrainingMeasure.ACCURACY, 0.9543771934041633);
+ progressMonitor.display(true);
+
+ //Assert that two logging events are captured for two iterations.
+ List<String> actual = logCaptor.getInfoLogs();
+
+ List<String> expected = List.of("1: (19830/20801) Training Accuracy : 0.953319551944618",
+ "2: (19852/20801) Training Accuracy : 0.9543771934041633");
+ assertArrayEquals(expected.toArray(), actual.toArray());
+ }
}
@Test
void testFinishedTrainingWithStopCriteria() {
- StopCriteria<Double> stopCriteria = new IterDeltaAccuracyUnderTolerance(
- new TrainingParameters(Map.of("Tolerance", .00002)));
- progressMonitor.finishedTraining(150, stopCriteria);
- progressMonitor.display(true);
-
- //Assert that the logs captured the training completion message with StopCriteria satisfied.
- List<String> actual = appender.list.stream().map(ILoggingEvent::getMessage).
- collect(toList());
- List<String> expected = List.of("Stopping: change in training set accuracy less than {2.0E-5}");
- assertArrayEquals(expected.toArray(), actual.toArray());
+ try (LogCaptor logCaptor = LogCaptor.forClass(DefaultTrainingProgressMonitor.class)) {
+ StopCriteria<Double> stopCriteria = new IterDeltaAccuracyUnderTolerance(
+ new TrainingParameters(Map.of("Tolerance", .00002)));
+ progressMonitor.finishedTraining(150, stopCriteria);
+ progressMonitor.display(true);
+
+ //Assert that the logs captured the training completion message with StopCriteria satisfied.
+ List<String> actual = logCaptor.getInfoLogs();
+ List<String> expected = List.of("Stopping: change in training set accuracy less than {2.0E-5}");
+ assertArrayEquals(expected.toArray(), actual.toArray());
+ }
}
@Test
void testFinishedTrainingWithoutStopCriteria() {
- progressMonitor.finishedTraining(150, null);
- progressMonitor.display(true);
-
- //Assert that the logs captured the training completion message when all iterations are exhausted.
- List<String> actual = appender.list.stream().map(ILoggingEvent::getMessage).
- collect(toList());
- List<String> expected = List.of("Training Finished after completing 150 Iterations successfully.");
- assertArrayEquals(expected.toArray(), actual.toArray());
+ try (LogCaptor logCaptor = LogCaptor.forClass(DefaultTrainingProgressMonitor.class)) {
+ progressMonitor.finishedTraining(150, null);
+ progressMonitor.display(true);
+
+ //Assert that the logs captured the training completion message when all iterations are exhausted.
+ List<String> actual = logCaptor.getInfoLogs();
+ List<String> expected = List.of("Training Finished after completing 150 Iterations successfully.");
+ assertArrayEquals(expected.toArray(), actual.toArray());
+ }
}
@Test
void displayAndClear() {
- progressMonitor.finishedTraining(150, null);
- progressMonitor.display(true);
-
- //Assert that the previous invocation of display has cleared the recorded training progress.
- appender.list.clear();
- progressMonitor.display(true);
- assertEquals(0, appender.list.size());
+ try (LogCaptor logCaptor = LogCaptor.forClass(DefaultTrainingProgressMonitor.class)) {
+ progressMonitor.finishedTraining(150, null);
+ progressMonitor.display(true);
+
+ //Assert that the previous invocation of display has cleared the recorded training progress.
+ logCaptor.clearLogs();
+ progressMonitor.display(true);
+ assertEquals(0, logCaptor.getInfoLogs().size());
+ }
}
@Test
void displayAndKeep() {
- progressMonitor.finishedTraining(150, null);
- progressMonitor.display(false);
-
- //Assert that the previous invocation of display has not cleared the recorded training progress.
- progressMonitor.display(false);
- assertEquals(2, appender.list.size());
+ try (LogCaptor logCaptor = LogCaptor.forClass(DefaultTrainingProgressMonitor.class)) {
+ progressMonitor.finishedTraining(150, null);
+ progressMonitor.display(false);
+
+ //Assert that the previous invocation of display has not cleared the recorded training progress.
+ progressMonitor.display(false);
+ assertEquals(2, logCaptor.getInfoLogs().size());
+ }
}
+ /**
+ * Restores testing resources to original configuration.
+ */
@AfterAll
- static void afterAll() {
- logger.setLevel(originalLogLevel);
+ public static void afterAll() {
+ restore(LOGGER_OPENNLP);
}
}
+
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/origin_of_text_samples.txt b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/origin_of_text_samples.txt
new file mode 100644
index 00000000..ab54e23c
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/origin_of_text_samples.txt
@@ -0,0 +1,4 @@
+| Test data file name | Source                                                            |
+| ------------------- | ------------------------------------------------------------------ |
+| sentences_set_1.txt | https://emerj.com/what-is-machine-learning/                         |
+| sentences_set_2.txt | https://www.nasa.gov/history/50-years-ago-apollo-11-preparations/   |
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_1.txt b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_1.txt
new file mode 100644
index 00000000..a016acda
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_1.txt
@@ -0,0 +1,10 @@
+There are different approaches to getting machines to learn
+These include using basic decision trees to clustering to layers of artificial neural networks
+It depends upon what task you are trying to accomplish and the type and amount of data that you have available
+This dynamic sees itself played out in applications as varying as medical diagnostics or self-driving cars
+One of the most common mistakes among machine learning beginners is testing training data successfully and having the illusion of success
+Domingo and others emphasize the importance of keeping some of the data set separate when testing models
+And only using that reserved data to test a chosen model followed by learning on the whole data set
+When a learning algorithm is not working often the quicker path to success is to feed the machine more data
+The availability of which is by now well-known as a primary driver of progress in machine and deep learning algorithms in recent years
+However, this can lead to issues with scalability in which we have more data but time to learn that data remains an issue
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_2.txt b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_2.txt
new file mode 100644
index 00000000..ce29ed57
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_2.txt
@@ -0,0 +1,11 @@
+In January 1969 only the most optimistic could have predicted that in just six months time humans would be walking on the surface of the Moon
+NASA was making preparations for that historic mission
+Early in the month NASA announced the crew for Apollo 11
+It was the first mission that would attempt a lunar landing
+Commander Neil A Armstrong and Lunar Module Pilot Edwin Aldrin and Command Module Pilot Michael Collins were the crew members Components of their spacecraft and rocket arrived at the Kennedy Space Center in January and February
+At the Manned Spacecraft Center now the Johnson Space Center in Houston facilities were being prepared to receive the first humans to return from the Moon
+North American Rockwell of Downey shipped the Apollo 11 spacecraft to Kennedy Space Center on Jan 23 1969
+Workers transferred the modules to the Manned Spacecraft Operations Building where they removed them from their shipping containers
+On January 29 they mated the two modules and installed the assembly in an altitude chamber for testing
+Earlier in the month the Grumman Aircraft and Engineering Corporation had delivered the Apollo 11 Lunar Module to Kennedy Space Center
+The three stages of the Saturn V rocket arrived in January and February for stacking in the Vehicle Assembly Building
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
index 4cd40b4b..4b004619 100644
--- a/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
+++ b/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
@@ -1,7 +1,7 @@
The word2vec software of Tomas Mikolov and colleagues has gained a lot of traction lately and provides state-of-the-art word embeddings
The learning models behind the software are described in two research papers
We found the description of the models in these papers to be somewhat cryptic and hard to follow
-While the motivations and presentation may be obvious to the neural-networks language-mofdeling crowd we had to struggle quite a bit to figure out the rationale behind the equations
+While the motivations and presentation may be obvious to the neural-networks language-modeling crowd we had to struggle quite a bit to figure out the rationale behind the equations
This note is an attempt to explain the negative sampling equation in Distributed Representations of Words and Phrases and their Compositionality by Tomas Mikolov Ilya Sutskever Kai Chen Greg Corrado and Jeffrey Dean
The departure point of the paper is the skip-gram model
In this model we are given a corpus of words w and their contexts c
diff --git a/rat-excludes b/rat-excludes
index 7c873b1b..3ad3ae4d 100644
--- a/rat-excludes
+++ b/rat-excludes
@@ -47,6 +47,8 @@
src/test/resources/opennlp/tools/formats/brat/voa-with-relations.txt
src/test/resources/opennlp/tools/languagemodel/sentences.txt
src/test/resources/opennlp/tools/lemmatizer/trial.old.tsv
src/test/resources/opennlp/tools/lemmatizer/output.txt
+src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_1.txt
+src/test/resources/opennlp/tools/cmdline/languagemodel/sentences_set_2.txt
<!-- head rule files, format does not allow AL header -->
lang/en/parser/en-head_rules