This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1333_Write_unit_test_for_parser_top_k_parses in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit d2f58caed08793f2ea6aff4c7fc3a020a8f32898 Author: Martin Wiesner <[email protected]> AuthorDate: Sat Mar 4 13:10:27 2023 +0100 OPENNLP-1333 Write unit test for parser top "k" parses - provides new test cases for `k = 1, 2, 3` for both Parser implementations - uses test data from https://github.com/apache/opennlp/pull/392 - adds `toStringPennTreebank` in `Parse` to obtain a uniform string representation for verification or comparison --- .../src/main/java/opennlp/tools/parser/Parse.java | 9 ++++ .../tools/parser/AbstractParserModelTest.java | 58 ++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java index 2e5d873c..33880baf 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java @@ -449,6 +449,15 @@ public class Parse implements Cloneable, Comparable<Parse> { return text.substring(span.getStart(), span.getEnd()); } + /** Renders this {@link Parse} into a single String using Penn Treebank-style formatting. + * @return The Penn Treebank-style String representation of this {@link Parse}. + */ + public String toStringPennTreebank() { + StringBuffer buffer = new StringBuffer(); + show(buffer); + return buffer.toString(); + } + /** * Represents this {@link Parse} in a human-readable way.
*/ diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java index 9df819a9..346e4eb7 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java @@ -20,6 +20,9 @@ package opennlp.tools.parser; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; import java.util.stream.Stream; import org.junit.jupiter.api.Assertions; @@ -27,7 +30,9 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; +import opennlp.tools.tokenize.WhitespaceTokenizer; import opennlp.tools.util.Span; /** @@ -86,6 +91,59 @@ public abstract class AbstractParserModelTest { Assertions.assertNotNull(s); } + /* + * Verifies changes in OPENNLP-1330 and addresses follow-up OPENNLP-1333 + * See: https://issues.apache.org/jira/projects/OPENNLP/issues/OPENNLP-1333 + * + * Uses test data from PR 392 (https://github.com/apache/opennlp/pull/392). 
+ */ + @ParameterizedTest + @ValueSource(ints = {1, 2, 3}) + void testParsingTopParses(int k) { + // fixtures + final String sent = "Eric is testing."; + final String refParseTopChunking = + "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NP (DT testing.)))))"; + final String refParseTopTreeInsert = + "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NN testing.))))"; + + // prepare + List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent)); + String text = String.join(" ", tokens); + + Parse sentP = new Parse(text, new Span(0, text.length()), + AbstractBottomUpParser.INC_NODE, 0, 0); + int start = 0; + int i = 0; + for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) { + String tok = ti.next(); + sentP.insert(new Parse(text, new Span(start, start + tok.length()), + AbstractBottomUpParser.TOK_NODE, 0, i)); + start += tok.length() + 1; + } + + opennlp.tools.parser.Parser parser = ParserFactory.create(getModel()); + Assertions.assertNotNull(parser); + + // TEST: parsing + Parse[] parses = parser.parse(sentP, k); + Assertions.assertNotNull(parses); + Assertions.assertEquals(k, parses.length); + double previousProb = 0; // initial ref value + for (int j = 0; j < parses.length; j++) { + Assertions.assertTrue(parses[j].getProb() < previousProb); + String asPennTreebankStyle = parses[j].toStringPennTreebank(); + // System.out.println(parses[j].getProb() + " - " + asPennTreebankStyle); + if (j == 0) { + if (ParserType.CHUNKING.equals(getModel().getParserType())) { + Assertions.assertEquals(refParseTopChunking, asPennTreebankStyle); + } else if (ParserType.TREEINSERT.equals(getModel().getParserType())) { + Assertions.assertEquals(refParseTopTreeInsert, asPennTreebankStyle); + } + } + } + } + /* * Produces a stream of <parse|text> pairs for parameterized unit tests. */
