This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1719-Add-additional-ITs-for-verification-of-UD-POS-models in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 52b7ee106451726a4e9cbbb85ad7fdd674e96eee Author: Martin Wiesner <[email protected]> AuthorDate: Mon Dec 2 17:59:45 2024 +0100 OPENNLP-1719: Add additional ITs for verification of UD POS models - adds POSTaggerMEIT with sample sentences for - CA (by @kinow) - DE (ud data) - EN (dev-manual) - PL (via community: @alsmolarczyk) - PT (ud data) --- .../java/opennlp/tools/postag/POSTaggerMEIT.java | 121 +++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java new file mode 100644 index 00000000..19b212a9 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package opennlp.tools.postag;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Integration test that tokenizes one sample sentence per language with {@link TokenizerME},
+ * tags the tokens with {@link POSTaggerME} and compares the result with a list of expected tags.
+ * <p>
+ * Samples are provided for the language codes {@code en}, {@code de}, {@code pt} and {@code ca}.
+ */
+public class POSTaggerMEIT {
+
+  /** Set to {@code true} to print the tagged sentence and a per-token verdict to stdout. */
+  private static final boolean DEBUG = false;
+
+  /** Number of tags per sample that may differ from the expectation before the test fails. */
+  private static final int MAX_INCORRECT_TAGS = 1;
+
+  /**
+   * Tokenizes {@code input}, tags the tokens and verifies the tags against {@code expectedTags}.
+   * <p>
+   * The token count and the tag count must both equal {@code expectedTags.length}; at most
+   * {@link #MAX_INCORRECT_TAGS} individual tags may deviate from the expectation.
+   *
+   * @param langCode The language code handed to the {@link TokenizerME} and {@link POSTaggerME}
+   *                 constructors; also used in the display name of the test.
+   * @param input The sentence to tokenize and tag.
+   * @param expectedTags The expected tag for each token, in token order.
+   * @throws IOException Propagated from the tokenizer or tagger constructors.
+   */
+  @ParameterizedTest(name = "Verify \"{0}\" sample")
+  @MethodSource(value = "provideData")
+  void testPOSTagger(String langCode, String input, String[] expectedTags) throws IOException {
+
+    Tokenizer tokenizer = new TokenizerME(langCode);
+    POSTagger tagger = new POSTaggerME(langCode);
+
+    String[] tokens = tokenizer.tokenize(input);
+    assertNotNull(tokens);
+    assertEquals(expectedTags.length, tokens.length,
+        "Token count differs from the number of expected tags for '" + langCode + "'");
+    String[] tags = tagger.tag(tokens);
+    assertNotNull(tags);
+    assertEquals(expectedTags.length, tags.length,
+        "Tag count differs from the number of expected tags for '" + langCode + "'");
+
+    // Single pass: build the "token_TAG" rendering and collect the mismatching positions.
+    StringBuilder fullyTagged = new StringBuilder();
+    List<Integer> incorrectTagsPositions = new ArrayList<>();
+    for (int i = 0; i < tags.length; i++) {
+      fullyTagged.append(tokens[i]).append('_').append(tags[i]).append(' ');
+      if (!expectedTags[i].equals(tags[i])) {
+        incorrectTagsPositions.add(i);
+      }
+    }
+
+    if (DEBUG) {
+      System.out.println(fullyTagged);
+      for (int i = 0; i < tags.length; i++) {
+        String verdict = incorrectTagsPositions.contains(i) ? " <-- NOK, pos=" + i : " <-- OK";
+        System.out.println(tokens[i] + "[" + tags[i] + "]" + verdict);
+      }
+    }
+
+    // A small number of deviations is tolerated; report where they are if the limit is exceeded.
+    assertTrue(incorrectTagsPositions.size() <= MAX_INCORRECT_TAGS,
+        () -> "Too many incorrect tags for '" + langCode + "' at positions " +
+            incorrectTagsPositions + ": " + fullyTagged);
+  }
+
+  /**
+   * Supplies the test samples.
+   *
+   * @return A stream of arguments, each consisting of a language code, an input sentence and the
+   *         expected tags for the tokens of that sentence.
+   */
+  private static Stream<Arguments> provideData() {
+    return Stream.of(
+        // see: Dev Manual
+        Arguments.of("en",
+            "Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .",
+            new String[]{"PROPN", "PROPN", "AUX", "NOUN", "ADP", "ADJ", "PROPN", "PUNCT", "DET", "PROPN",
+                "VERB", "NOUN", "PUNCT"}),
+        // see: 'de-ud-train-sample.conllu'
+        Arguments.of("de",
+            "Fachlich kompetent, sehr gute Beratung und ein freundliches Team .",
+            new String[]{"ADV", "ADJ", "PUNCT", "ADV", "ADJ", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "PUNCT"}),
+        // see: 'pt-br-ud-sample.conllu'
+        Arguments.of("pt",
+            "Numa reunião entre representantes da Secretaria da Criança do DF ea juíza da Vara de Execuções de " +
+                "Medidas Socioeducativas, Lavínia Tupi Vieira Fonseca, ficou acordado que dos 25 internos, " +
+                "12 serão internados na Unidade de Planaltina e os outros 13 devem retornar para a Unidade do " +
+                "Recanto das Emas, antigo Ciago .",
+            // pos=10 -> NOK
+            new String[]{"ADP+DET", "NOUN", "ADP", "NOUN", "ADP+DET", "PROPN", "ADP+DET", "PROPN", "ADP+DET",
+                "PROPN", "CCONJ", "NOUN", "ADP+DET", "PROPN", "ADP", "PROPN", "ADP", "PROPN", "PROPN", "PUNCT",
+                "PROPN", "PROPN", "PROPN", "PROPN", "PUNCT", "VERB", "ADJ", "CCONJ", "ADP+DET", "NUM", "NOUN",
+                "PUNCT", "NUM", "AUX", "VERB", "ADP+DET", "PROPN", "ADP", "PROPN", "CCONJ", "DET", "DET", "NUM",
+                "AUX", "VERB", "ADP", "DET", "PROPN", "ADP+DET", "PROPN", "ADP+DET", "PROPN", "PUNCT", "ADJ",
+                "PROPN", "PUNCT"}),
+        // see: @kinow
+        Arguments.of("ca",
+            "Un gran embossament d'aire fred es comença a despenjar cap al centre d'Europa.",
+            // OpenNLP, different at: idx pos 2, 3, 5, and 13(+14) -> however, only pos 5 is "wrong" (ref)
+            // The expected tags below are identical to the REFERENCE ("gold") annotation.
+            new String[]{"DET", "ADJ", "NOUN", "ADP", "NOUN", "ADJ", "PRON", "VERB", "ADP", "VERB", "NOUN",
+                "ADP+DET", "NOUN", "ADP", "PROPN", "PUNCT"})
+        // For comparison - Spacy, wrong tags at: idx pos 2, 3 and 14
+        // "DET", "ADJ", "ADV", "PROPN", "NOUN", "ADJ", "PRON", "VERB", "ADP", "VERB", "NOUN", "ADP" + "DET",
+        // "NOUN", "PROPN", "PROPN", "PUNCT"
+        // ok!  , ok!  , ???  , ???    , ok!   , ok!  , ok!   , ok!   , ok!  , ok!   , ok!   , ok! + ok!  ,
+        // ok!   , ???    , ok!    , ok!
+    );
+  }
+}
