This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch germeval in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 72b23c5a74b27ce6e87eea6347ffb0f6adaf81a8 Author: Richard Zowalla <[email protected]> AuthorDate: Thu Mar 5 20:00:07 2026 +0100 OPENNLP-976 - Implement GermEval2014 Format --- .../tools/cmdline/StreamFactoryRegistry.java | 2 + .../formats/GermEval2014NameSampleStream.java | 250 +++++++++++++++++ .../GermEval2014NameSampleStreamFactory.java | 104 +++++++ .../GermEval2014NameSampleStreamFactoryTest.java | 133 +++++++++ .../formats/GermEval2014NameSampleStreamTest.java | 298 +++++++++++++++++++++ .../opennlp/tools/formats/germeval2014.sample | 44 +++ .../tools/eval/GermEval2014NameFinderEval.java | 216 +++++++++++++++ 7 files changed, 1047 insertions(+) diff --git a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index c4bef61f..cfa46f1f 100644 --- a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -30,6 +30,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory; import opennlp.tools.formats.ConllXTokenSampleStreamFactory; import opennlp.tools.formats.DocumentSampleStreamFactory; import opennlp.tools.formats.EvalitaNameSampleStreamFactory; +import opennlp.tools.formats.GermEval2014NameSampleStreamFactory; import opennlp.tools.formats.LanguageDetectorSampleStreamFactory; import opennlp.tools.formats.LemmatizerSampleStreamFactory; import opennlp.tools.formats.NameSampleDataStreamFactory; @@ -107,6 +108,7 @@ public final class StreamFactoryRegistry { Conll02NameSampleStreamFactory.registerFactory(); Conll03NameSampleStreamFactory.registerFactory(); EvalitaNameSampleStreamFactory.registerFactory(); + GermEval2014NameSampleStreamFactory.registerFactory(); ConllXPOSSampleStreamFactory.registerFactory(); ConllXSentenceSampleStreamFactory.registerFactory(); 
ConllXTokenSampleStreamFactory.registerFactory(); diff --git a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java new file mode 100644 index 00000000..ed2e1a53 --- /dev/null +++ b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.commons.Internal; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.InvalidFormatException; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; +import opennlp.tools.util.Span; +import opennlp.tools.util.StringUtil; + +/** + * Parser for the GermEval 2014 Named Entity Recognition Shared Task data. 
+ * <p> + * The data is in a tab-separated format with four columns: + * <ol> + * <li>Token index (1-based per sentence)</li> + * <li>Token text</li> + * <li>Outer named entity tag (IOB2 scheme)</li> + * <li>Nested/embedded named entity tag (IOB2 scheme)</li> + * </ol> + * Comment lines starting with {@code #} mark document boundaries and contain + * source URL and date metadata. Blank lines separate sentences. + * <p> + * The data uses four main entity types: Person (PER), Location (LOC), + * Organization (ORG) and Other (OTH), with additional {@code deriv} and + * {@code part} suffixes for derived forms and name parts respectively. + * <p> + * Since {@link NameSample} does not support overlapping spans, this stream + * requires selecting either the {@link NerLayer#OUTER outer} or + * {@link NerLayer#INNER inner} annotation layer via a {@link NerLayer} parameter. + * <p> + * Data can be found on + * <a href="https://sites.google.com/site/germeval2014ner/data">this web site</a>. + * <p> + * <b>Note:</b> + * Do not use this class, internal use only! + */ +@Internal +public class GermEval2014NameSampleStream implements ObjectStream<NameSample> { + + /** + * Selects which NER annotation layer to read from the GermEval 2014 data. + */ + public enum NerLayer { + /** The outer (top-level) named entity annotations (column 3). */ + OUTER, + /** The nested/embedded named entity annotations (column 4). */ + INNER + } + + public static final int GENERATE_PERSON_ENTITIES = 0x01; + public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1; + public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2; + public static final int GENERATE_MISC_ENTITIES = 0x01 << 3; + + private final ObjectStream<String> lineStream; + private final int types; + private final NerLayer layer; + + /** + * Initializes a {@link GermEval2014NameSampleStream}. + * + * @param lineStream An {@link ObjectStream} over the lines + * in the GermEval 2014 data file. 
+ * @param types The entity types to include in the Name Sample object stream. + * @param layer The {@link NerLayer} to read. + */ + public GermEval2014NameSampleStream(final ObjectStream<String> lineStream, + final int types, final NerLayer layer) { + this.lineStream = lineStream; + this.types = types; + this.layer = layer; + } + + /** + * Initializes a {@link GermEval2014NameSampleStream}. + * + * @param in The {@link InputStreamFactory} for the input file. + * @param types The entity types to include in the Name Sample object stream. + * @param layer The {@link NerLayer} to read. + * @throws IOException Thrown if IO errors occurred. + */ + public GermEval2014NameSampleStream(final InputStreamFactory in, final int types, + final NerLayer layer) throws IOException { + this(new PlainTextByLineStream(in, StandardCharsets.UTF_8), types, layer); + } + + static Span extract(final int begin, final int end, final String beginTag) + throws InvalidFormatException { + + final String type = mapTagToType(beginTag); + return new Span(begin, end, type); + } + + private static String mapTagToType(final String tag) throws InvalidFormatException { + // Strip B- or I- prefix + final String rawType = tag.substring(2); + + return switch (rawType) { + case "PER" -> "person"; + case "PERderiv" -> "personderiv"; + case "PERpart" -> "personpart"; + case "LOC" -> "location"; + case "LOCderiv" -> "locationderiv"; + case "LOCpart" -> "locationpart"; + case "ORG" -> "organization"; + case "ORGderiv" -> "organizationderiv"; + case "ORGpart" -> "organizationpart"; + case "OTH" -> "misc"; + case "OTHderiv" -> "miscderiv"; + case "OTHpart" -> "miscpart"; + default -> throw new InvalidFormatException("Unknown type: " + rawType); + }; + } + + private boolean isTypeEnabled(final String tag) { + if (tag.startsWith("B-PER") || tag.startsWith("I-PER")) { + return (types & GENERATE_PERSON_ENTITIES) != 0; + } + if (tag.startsWith("B-ORG") || tag.startsWith("I-ORG")) { + return (types & 
GENERATE_ORGANIZATION_ENTITIES) != 0; + } + if (tag.startsWith("B-LOC") || tag.startsWith("I-LOC")) { + return (types & GENERATE_LOCATION_ENTITIES) != 0; + } + if (tag.startsWith("B-OTH") || tag.startsWith("I-OTH")) { + return (types & GENERATE_MISC_ENTITIES) != 0; + } + return tag.equals("O"); + } + + private List<Span> convertTagsToSpans(final List<String> tags) throws IOException { + final List<Span> names = new ArrayList<>(); + + int beginIndex = -1; + int endIndex = -1; + + for (int i = 0; i < tags.size(); i++) { + String tag = tags.get(i); + + if (!tag.equals("O") && !isTypeEnabled(tag)) { + tag = "O"; + } + + if (tag.startsWith("B-")) { + if (beginIndex != -1) { + names.add(extract(beginIndex, endIndex, tags.get(beginIndex))); + } + beginIndex = i; + endIndex = i + 1; + } else if (tag.startsWith("I-")) { + endIndex++; + } else if (tag.equals("O")) { + if (beginIndex != -1) { + names.add(extract(beginIndex, endIndex, tags.get(beginIndex))); + beginIndex = -1; + endIndex = -1; + } + } else { + throw new IOException("Invalid tag: " + tag); + } + } + + // if one span remains, create it here + if (beginIndex != -1) { + names.add(extract(beginIndex, endIndex, tags.get(beginIndex))); + } + + return names; + } + + @Override + public NameSample read() throws IOException { + + final List<String> sentence = new ArrayList<>(); + final List<String> outerTags = new ArrayList<>(); + final List<String> innerTags = new ArrayList<>(); + + boolean isClearAdaptiveData = false; + + // Empty line indicates end of sentence + String line; + while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) { + + // Comment lines starting with # mark document boundaries + if (line.startsWith("#")) { + isClearAdaptiveData = true; + continue; + } + + final String[] fields = line.split("\t"); + + if (fields.length >= 4) { + sentence.add(fields[1]); + outerTags.add(fields[2]); + innerTags.add(fields[3].trim()); + } else { + throw new IOException("Expected at least four 
tab-separated fields per line " + + "in GermEval 2014 data, got " + fields.length + " for line '" + line + "'!"); + } + } + + if (sentence.size() > 0) { + final List<String> selectedTags = (layer == NerLayer.OUTER) ? outerTags : innerTags; + final List<Span> names = convertTagsToSpans(selectedTags); + + return new NameSample(sentence.toArray(new String[0]), + names.toArray(new Span[0]), isClearAdaptiveData); + } else if (line != null) { + // Just filter out empty events, if two lines in a row are empty + return read(); + } else { + // source stream is not returning anymore lines + return null; + } + } + + @Override + public void reset() throws IOException, UnsupportedOperationException { + lineStream.reset(); + } + + @Override + public void close() throws IOException { + lineStream.close(); + } +} diff --git a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java new file mode 100644 index 00000000..760fb371 --- /dev/null +++ b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.commons.Internal; +import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +/** + * <b>Note:</b> + * Do not use this class, internal use only! + * + * @see GermEval2014NameSampleStream + */ +@Internal +public class GermEval2014NameSampleStreamFactory extends + LanguageSampleStreamFactory<NameSample, GermEval2014NameSampleStreamFactory.Parameters> { + + public interface Parameters extends BasicFormatParams { + @ParameterDescription(valueName = "per,loc,org,misc") + String getTypes(); + + @ParameterDescription(valueName = "outer|inner", description = "NER annotation layer to use. 
" + + "Use 'outer' for top-level entities or 'inner' for nested/embedded entities.") + String getLayer(); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(NameSample.class, + "germeval2014", new GermEval2014NameSampleStreamFactory(Parameters.class)); + } + + protected GermEval2014NameSampleStreamFactory(final Class<Parameters> params) { + super(params); + } + + @Override + public ObjectStream<NameSample> create(final String[] args) { + + final Parameters params = validateBasicFormatParameters(args, Parameters.class); + + language = "deu"; + + int typesToGenerate = 0; + + if (params.getTypes().contains("per")) { + typesToGenerate = typesToGenerate | + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES; + } + if (params.getTypes().contains("org")) { + typesToGenerate = typesToGenerate | + GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES; + } + if (params.getTypes().contains("loc")) { + typesToGenerate = typesToGenerate | + GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES; + } + if (params.getTypes().contains("misc")) { + typesToGenerate = typesToGenerate | + GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES; + } + + final NerLayer layer; + final String layerParam = params.getLayer(); + if (layerParam == null || "outer".equals(layerParam)) { + layer = NerLayer.OUTER; + } else if ("inner".equals(layerParam)) { + layer = NerLayer.INNER; + } else { + throw new TerminateToolException(1, "Unsupported layer: " + layerParam + + ". 
Use 'outer' or 'inner'."); + } + + try { + return new GermEval2014NameSampleStream( + FormatUtil.createInputStreamFactory(params.getData()), typesToGenerate, layer); + } catch (final IOException e) { + throw new TerminateToolException(-1, + "IO Error while creating an Input Stream: " + e.getMessage(), e); + } + } +} diff --git a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java new file mode 100644 index 00000000..c6528a06 --- /dev/null +++ b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import opennlp.tools.cmdline.ObjectStreamFactory; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class GermEval2014NameSampleStreamFactoryTest extends + AbstractSampleStreamFactoryTest<NameSample, GermEval2014NameSampleStreamFactory.Parameters> { + + private static final String SAMPLE = "germeval2014.sample"; + + // SUT + private GermEval2014NameSampleStreamFactory factory; + + private String sampleFileFullPath; + + @Override + protected AbstractSampleStreamFactory<NameSample, + GermEval2014NameSampleStreamFactory.Parameters> getFactory() { + return factory; + } + + @Override + protected String getDataFilePath() { + return sampleFileFullPath; + } + + @BeforeAll + static void initEnv() { + GermEval2014NameSampleStreamFactory.registerFactory(); + } + + @BeforeEach + void setUp() { + final ObjectStreamFactory<NameSample, GermEval2014NameSampleStreamFactory.Parameters> f = + StreamFactoryRegistry.getFactory(NameSample.class, "germeval2014"); + assertInstanceOf(GermEval2014NameSampleStreamFactory.class, f); + factory = ((GermEval2014NameSampleStreamFactory) f); + assertEquals(GermEval2014NameSampleStreamFactory.Parameters.class, factory.params); + sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + SAMPLE).getPath(); + } + + @Test + void 
testCreateWithValidParameter() throws IOException { + try (final ObjectStream<NameSample> stream = factory.create( + new String[]{"-types", "per,loc,org,misc", "-layer", "outer", + "-data", sampleFileFullPath})) { + final NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {"outer", "inner"}) + void testCreateWithDifferentLayers(final String layer) throws IOException { + try (final ObjectStream<NameSample> stream = factory.create( + new String[]{"-types", "per,loc,org,misc", "-layer", layer, + "-data", sampleFileFullPath})) { + final NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @ParameterizedTest + @ValueSource(strings = {"", "per", "loc", "org", "misc", "per,loc,org,misc"}) + void testCreateWithDifferentTypes(final String types) throws IOException { + try (final ObjectStream<NameSample> stream = factory.create( + new String[]{"-types", types, "-layer", "outer", "-data", sampleFileFullPath})) { + final NameSample sample = stream.read(); + assertNotNull(sample); + } + } + + @Test + void testCreateWithInvalidLayer() { + assertThrows(TerminateToolException.class, () -> { + try (final ObjectStream<NameSample> stream = factory.create( + new String[]{"-types", "per,loc,org,misc", "-layer", "xyz", + "-data", sampleFileFullPath})) { + final NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } + + /* + * Note: Overriding this test case, as more params are required! 
+ */ + @Test + @Override + protected void testCreateWithInvalidDataFilePath() { + assertThrows(TerminateToolException.class, () -> { + try (final ObjectStream<NameSample> stream = factory.create(new String[] + {"-types", "per,loc,org,misc", "-layer", "outer", + "-data", sampleFileFullPath + "xyz"})) { + final NameSample sample = stream.read(); + assertNotNull(sample); + } + }); + } +} diff --git a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java new file mode 100644 index 00000000..0118a313 --- /dev/null +++ b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.formats; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +/** + * Test for the {@link GermEval2014NameSampleStream} class. + */ +public class GermEval2014NameSampleStreamTest extends AbstractSampleStreamTest { + + private static final String SAMPLE = "germeval2014.sample"; + + private static final int ALL_TYPES = + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES + | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES + | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES + | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES; + + @Test + void testParsingSampleFirstSentence() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // First sentence: 14 tokens + Assertions.assertEquals(14, sample.getSentence().length); + Assertions.assertEquals("Schartau", sample.getSentence()[0]); + Assertions.assertEquals(".", sample.getSentence()[13]); + + // Comment line means clear adaptive data + Assertions.assertTrue(sample.isClearAdaptiveDataSet()); + + // 4 outer entities: Schartau (PER), Tagesspiegel (ORG), Fischer (PER), Berlin (LOC) + Assertions.assertEquals(4, sample.getNames().length); + + // Verify Schartau = PER at position 0 + final Span schartau = findSpanAt(sample.getNames(), 0); + Assertions.assertNotNull(schartau); + Assertions.assertEquals("person", schartau.getType()); + Assertions.assertEquals(0, schartau.getStart()); + Assertions.assertEquals(1, schartau.getEnd()); + + // Verify Tagesspiegel = ORG at position 4 + final Span tagesspiegel = 
findSpanAt(sample.getNames(), 4); + Assertions.assertNotNull(tagesspiegel); + Assertions.assertEquals("organization", tagesspiegel.getType()); + + // Verify Fischer = PER at position 9 + final Span fischer = findSpanAt(sample.getNames(), 9); + Assertions.assertNotNull(fischer); + Assertions.assertEquals("person", fischer.getType()); + + // Verify Berlin = LOC at position 12 + final Span berlin = findSpanAt(sample.getNames(), 12); + Assertions.assertNotNull(berlin); + Assertions.assertEquals("location", berlin.getType()); + } + } + + @Test + void testOuterLayerEntities() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + // Skip first sentence + sampleStream.read(); + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // Second sentence: 13 tokens (Bayern München...) + Assertions.assertEquals(13, sample.getSentence().length); + Assertions.assertEquals("Bayern", sample.getSentence()[0]); + Assertions.assertEquals("München", sample.getSentence()[1]); + Assertions.assertTrue(sample.isClearAdaptiveDataSet()); + + // Outer layer: Bayern München (ORG), deutschen (LOCderiv) = 2 spans + Assertions.assertEquals(2, sample.getNames().length); + + // Bayern München = ORG (0,2) + final Span org = findSpanAt(sample.getNames(), 0); + Assertions.assertNotNull(org); + Assertions.assertEquals("organization", org.getType()); + Assertions.assertEquals(2, org.getEnd()); + + // deutschen = LOCderiv (10,11) + final Span locDeriv = findSpanAt(sample.getNames(), 10); + Assertions.assertNotNull(locDeriv); + Assertions.assertEquals("locationderiv", locDeriv.getType()); + Assertions.assertEquals(11, locDeriv.getEnd()); + } + } + + @Test + void testInnerLayerEntities() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.INNER)) { + // Skip first sentence (all inner tags are O) + final NameSample first = sampleStream.read(); + 
Assertions.assertNotNull(first); + Assertions.assertEquals(0, first.getNames().length); + + // Second sentence has inner layer entities + final NameSample sample = sampleStream.read(); + Assertions.assertNotNull(sample); + + // Inner layer: Bayern (LOC), München (LOC) = 2 spans + Assertions.assertEquals(2, sample.getNames().length); + + final Span bayernLoc = findSpanAt(sample.getNames(), 0); + Assertions.assertNotNull(bayernLoc); + Assertions.assertEquals("location", bayernLoc.getType()); + Assertions.assertEquals(1, bayernLoc.getEnd()); + + final Span muenchenLoc = findSpanAt(sample.getNames(), 1); + Assertions.assertNotNull(muenchenLoc); + Assertions.assertEquals("location", muenchenLoc.getType()); + Assertions.assertEquals(2, muenchenLoc.getEnd()); + } + } + + @Test + void testMiscEntityType() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + sampleStream.read(); // skip 1st + sampleStream.read(); // skip 2nd + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // Third sentence: "Ecce homo ist ein Werk ." + Assertions.assertEquals(6, sample.getSentence().length); + Assertions.assertFalse(sample.isClearAdaptiveDataSet()); + + // Ecce homo = OTH -> misc + Assertions.assertEquals(1, sample.getNames().length); + final Span oth = sample.getNames()[0]; + Assertions.assertEquals("misc", oth.getType()); + Assertions.assertEquals(0, oth.getStart()); + Assertions.assertEquals(2, oth.getEnd()); + } + } + + @Test + void testPartEntityType() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + sampleStream.read(); // skip 1st + sampleStream.read(); // skip 2nd + sampleStream.read(); // skip 3rd + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // Fourth sentence: "ARD-Programmchef Volker Herres sagte ." 
+ Assertions.assertEquals(5, sample.getSentence().length); + + // ARD-Programmchef = ORGpart, Volker Herres = PER + Assertions.assertEquals(2, sample.getNames().length); + + final Span orgPart = findSpanAt(sample.getNames(), 0); + Assertions.assertNotNull(orgPart); + Assertions.assertEquals("organizationpart", orgPart.getType()); + Assertions.assertEquals(1, orgPart.getEnd()); + + final Span person = findSpanAt(sample.getNames(), 1); + Assertions.assertNotNull(person); + Assertions.assertEquals("person", person.getType()); + Assertions.assertEquals(3, person.getEnd()); + } + } + + @Test + void testStreamExhaustion() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + sampleStream.read(); // 1st + sampleStream.read(); // 2nd + sampleStream.read(); // 3rd + sampleStream.read(); // 4th + Assertions.assertNull(sampleStream.read()); // end of stream + } + } + + @Test + void testFilterPersonEntitiesOnly() throws IOException { + try (final ObjectStream<NameSample> sampleStream = + openData(GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, NerLayer.OUTER)) { + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // Only PER entities from first sentence: Schartau, Fischer + Assertions.assertEquals(2, sample.getNames().length); + for (final Span name : sample.getNames()) { + Assertions.assertTrue(name.getType().startsWith("person")); + } + } + } + + @Test + void testFilterLocationEntitiesOnly() throws IOException { + try (final ObjectStream<NameSample> sampleStream = + openData(GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, NerLayer.OUTER)) { + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + // Only LOC entities from first sentence: Berlin + Assertions.assertEquals(1, sample.getNames().length); + Assertions.assertEquals("location", sample.getNames()[0].getType()); + } + } + + @Test + void testFilterNoEntities() throws 
IOException { + try (final ObjectStream<NameSample> sampleStream = openData(0, NerLayer.OUTER)) { + final NameSample sample = sampleStream.read(); + + Assertions.assertNotNull(sample); + Assertions.assertEquals(0, sample.getNames().length); + } + } + + @Test + void testReset() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + final NameSample sample = sampleStream.read(); + sampleStream.reset(); + + Assertions.assertEquals(sample, sampleStream.read()); + } + } + + @Test + void testDocumentBoundaryClearsAdaptiveData() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + final NameSample first = sampleStream.read(); + Assertions.assertTrue(first.isClearAdaptiveDataSet()); // has # comment + + final NameSample second = sampleStream.read(); + Assertions.assertTrue(second.isClearAdaptiveDataSet()); // has # comment + + final NameSample third = sampleStream.read(); + Assertions.assertFalse(third.isClearAdaptiveDataSet()); // no # comment + + final NameSample fourth = sampleStream.read(); + Assertions.assertFalse(fourth.isClearAdaptiveDataSet()); // no # comment + } + } + + @Test + void testAllEntityTypesPresent() throws IOException { + try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) { + final Set<String> foundTypes = new HashSet<>(); + NameSample sample; + while ((sample = sampleStream.read()) != null) { + for (final Span name : sample.getNames()) { + foundTypes.add(name.getType()); + } + } + // Should find: person, organization, location, locationderiv, misc, organizationpart + Assertions.assertTrue(foundTypes.containsAll( + Arrays.asList("person", "organization", "location", "misc"))); + } + } + + private ObjectStream<NameSample> openData(final int types, final NerLayer layer) + throws IOException { + return new GermEval2014NameSampleStream(getFactory(SAMPLE), types, layer); + } + + private Span 
findSpanAt(final Span[] spans, final int start) { + for (final Span span : spans) { + if (span.getStart() == start) { + return span; + } + } + return null; + } +} diff --git a/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample new file mode 100644 index 00000000..b28ae382 --- /dev/null +++ b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample @@ -0,0 +1,44 @@ +# n-tv.de vom 26.02.2005 [2005-02-26] +1 Schartau B-PER O +2 sagte O O +3 dem O O +4 " O O +5 Tagesspiegel B-ORG O +6 " O O +7 vom O O +8 Freitag O O +9 , O O +10 Fischer B-PER O +11 sei O O +12 in O O +13 Berlin B-LOC O +14 . O O + +# stern.de vom 21.03.2006 [2006-03-21] +1 Bayern B-ORG B-LOC +2 München I-ORG B-LOC +3 ist O O +4 wieder O O +5 alleiniger O O +6 Favorit O O +7 auf O O +8 den O O +9 Gewinn O O +10 der O O +11 deutschen B-LOCderiv O +12 Fußball-Meisterschaft O O +13 . O O + +1 Ecce B-OTH O +2 homo I-OTH O +3 ist O O +4 ein O O +5 Werk O O +6 . O O + +1 ARD-Programmchef B-ORGpart O +2 Volker B-PER O +3 Herres I-PER O +4 sagte O O +5 . O O + diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java new file mode 100644 index 00000000..c67e84bc --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.eval; + +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import opennlp.tools.formats.GermEval2014NameSampleStream; +import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.NameSample; +import opennlp.tools.namefind.TokenNameFinderEvaluator; +import opennlp.tools.namefind.TokenNameFinderFactory; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.util.MarkableFileInputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelUtil; + +/** + * Evaluates the name finder against the GermEval 2014 NER corpus (German). + * <p> + * Download the data files from the GermEval 2014 shared task + * <a href="https://sites.google.com/site/germeval2014ner/data">site</a> + * and place them into this directory: {@code $OPENNLP_DATA_DIR/germeval2014/}. 
+ * <p> + * Expected files: + * <ul> + * <li>{@code NER-de-train.tsv} - Training data</li> + * <li>{@code NER-de-test.tsv} - Test data</li> + * </ul> + */ +public class GermEval2014NameFinderEval extends AbstractEvalTest { + + private static final int ALL_TYPES = + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES + | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES + | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES + | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES; + + private static File trainingFile; + private static File testFile; + + private TokenNameFinderModel train(final File trainFile, final TrainingParameters params, + final int types) throws IOException { + + final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream( + new MarkableFileInputStreamFactory(trainFile), types, NerLayer.OUTER); + + return NameFinderME.train("deu", null, samples, params, new TokenNameFinderFactory()); + } + + private void eval(final TokenNameFinderModel model, final File testData, + final int types, final double expectedFMeasure) throws IOException { + + final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream( + new MarkableFileInputStreamFactory(testData), types, NerLayer.OUTER); + + final TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model)); + evaluator.evaluate(samples); + + Assertions.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), ACCURACY_DELTA); + } + + @BeforeAll + static void verifyTrainingData() throws Exception { + + trainingFile = new File(getOpennlpDataDir(), "germeval2014/NER-de-train.tsv"); + testFile = new File(getOpennlpDataDir(), "germeval2014/NER-de-test.tsv"); + + verifyTrainingData(new GermEval2014NameSampleStream( + new MarkableFileInputStreamFactory(trainingFile), + ALL_TYPES, NerLayer.OUTER), + new BigInteger("175386258960384643455328517118707394452")); + verifyTrainingData(new GermEval2014NameSampleStream( + new 
MarkableFileInputStreamFactory(testFile), + ALL_TYPES, NerLayer.OUTER), + new BigInteger("112232325598196372951673841456976805014")); + } + + // -- Person entity evaluation -- + + @Test + void evalPersonPerceptron() throws IOException { + final TrainingParameters params = createPerceptronParams(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, 0.6086631814787155d); + } + + @Test + void evalPersonMaxentGis() throws IOException { + final TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, 0.5204518893650175d); + } + + // -- Organization entity evaluation -- + + @Test + void evalOrganizationPerceptron() throws IOException { + final TrainingParameters params = createPerceptronParams(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES, 0.5588235294117646d); + } + + @Test + void evalOrganizationMaxentGis() throws IOException { + final TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES, 0.4594180704441041d); + } + + // -- Location entity evaluation -- + + @Test + void evalLocationPerceptron() throws IOException { + final TrainingParameters params = createPerceptronParams(); + + final TokenNameFinderModel model = train(trainingFile, params, + 
GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, 0.6705613411226822d); + } + + @Test + void evalLocationMaxentGis() throws IOException { + final TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, 0.5537280701754386d); + } + + // -- Misc (OTH) entity evaluation -- + + @Test + void evalMiscPerceptron() throws IOException { + final TrainingParameters params = createPerceptronParams(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES, 0.4482142857142857d); + } + + @Test + void evalMiscMaxentGis() throws IOException { + final TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + + final TokenNameFinderModel model = train(trainingFile, params, + GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES); + + eval(model, testFile, + GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES, 0.3932267168391345d); + } + + // -- Combined (all types) evaluation -- + + @Test + void evalCombinedPerceptron() throws IOException { + final TrainingParameters params = createPerceptronParams(); + + final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES); + + eval(model, testFile, ALL_TYPES, 0.6016631636662707d); + } + + @Test + void evalCombinedMaxentGis() throws IOException { + final TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); + + final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES); + + eval(model, testFile, ALL_TYPES, 0.5229054890631449d); + } +}
