This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch germeval
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 72b23c5a74b27ce6e87eea6347ffb0f6adaf81a8
Author: Richard Zowalla <[email protected]>
AuthorDate: Thu Mar 5 20:00:07 2026 +0100

    OPENNLP-976 - Implement GermEval2014 Format
---
 .../tools/cmdline/StreamFactoryRegistry.java       |   2 +
 .../formats/GermEval2014NameSampleStream.java      | 250 +++++++++++++++++
 .../GermEval2014NameSampleStreamFactory.java       | 104 +++++++
 .../GermEval2014NameSampleStreamFactoryTest.java   | 133 +++++++++
 .../formats/GermEval2014NameSampleStreamTest.java  | 298 +++++++++++++++++++++
 .../opennlp/tools/formats/germeval2014.sample      |  44 +++
 .../tools/eval/GermEval2014NameFinderEval.java     | 216 +++++++++++++++
 7 files changed, 1047 insertions(+)

diff --git 
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index c4bef61f..cfa46f1f 100644
--- 
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -30,6 +30,7 @@ import 
opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
 import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
 import opennlp.tools.formats.DocumentSampleStreamFactory;
 import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.GermEval2014NameSampleStreamFactory;
 import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
 import opennlp.tools.formats.LemmatizerSampleStreamFactory;
 import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -107,6 +108,7 @@ public final class StreamFactoryRegistry {
     Conll02NameSampleStreamFactory.registerFactory();
     Conll03NameSampleStreamFactory.registerFactory();
     EvalitaNameSampleStreamFactory.registerFactory();
+    GermEval2014NameSampleStreamFactory.registerFactory();
     ConllXPOSSampleStreamFactory.registerFactory();
     ConllXSentenceSampleStreamFactory.registerFactory();
     ConllXTokenSampleStreamFactory.registerFactory();
diff --git 
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
new file mode 100644
index 00000000..ed2e1a53
--- /dev/null
+++ 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.commons.Internal;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the GermEval 2014 Named Entity Recognition Shared Task data.
+ * <p>
+ * The data is in a tab-separated format with four columns:
+ * <ol>
+ *   <li>Token index (1-based per sentence)</li>
+ *   <li>Token text</li>
+ *   <li>Outer named entity tag (IOB2 scheme)</li>
+ *   <li>Nested/embedded named entity tag (IOB2 scheme)</li>
+ * </ol>
+ * Comment lines starting with {@code #} mark document boundaries and contain
+ * source URL and date metadata. Blank lines separate sentences.
+ * <p>
+ * The data uses four main entity types: Person (PER), Location (LOC),
+ * Organization (ORG) and Other (OTH), with additional {@code deriv} and
+ * {@code part} suffixes for derived forms and name parts respectively.
+ * <p>
+ * Since {@link NameSample} does not support overlapping spans, this stream
+ * requires selecting either the {@link NerLayer#OUTER outer} or
+ * {@link NerLayer#INNER inner} annotation layer via a {@link NerLayer}
+ * parameter.
+ * <p>
+ * Data can be found on
+ * <a href="https://sites.google.com/site/germeval2014ner/data">this web
+ * site</a>.
+ * <p>
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ */
+@Internal
+public class GermEval2014NameSampleStream implements ObjectStream<NameSample> {
+
+  /**
+   * Selects which NER annotation layer to read from the GermEval 2014 data.
+   * <p>
+   * A stream instance is configured with exactly one constant; when a sentence
+   * is read, only the tag column belonging to that constant is converted into
+   * name spans, the other column is parsed but not used.
+   */
+  public enum NerLayer {
+    /** The outer (top-level) named entity annotations (column 3). */
+    OUTER,
+    /** The nested/embedded named entity annotations (column 4). */
+    INNER
+  }
+
+  /** Bit flag: keep entities whose tag type starts with {@code PER}. */
+  public static final int GENERATE_PERSON_ENTITIES = 0x01;
+  /** Bit flag: keep entities whose tag type starts with {@code ORG}. */
+  public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1;
+  /** Bit flag: keep entities whose tag type starts with {@code LOC}. */
+  public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
+  /** Bit flag: keep entities whose tag type starts with {@code OTH}. */
+  public static final int GENERATE_MISC_ENTITIES = 0x01 << 3;
+
+  // Source of the raw data lines; reset() and close() are forwarded to it.
+  private final ObjectStream<String> lineStream;
+  // Bitwise OR of the GENERATE_*_ENTITIES flags selecting the entity types to keep.
+  private final int types;
+  // The tag column (outer or inner) that is converted into spans.
+  private final NerLayer layer;
+
+  /**
+   * Initializes a {@link GermEval2014NameSampleStream}.
+   *
+   * @param lineStream An {@link ObjectStream} over the lines
+   *                   in the GermEval 2014 data file.
+   * @param types      A bit mask built from the {@code GENERATE_*_ENTITIES}
+   *                   constants that selects the entity types to include in
+   *                   the Name Sample object stream. Tags of types that are
+   *                   not selected are treated like {@code O}.
+   * @param layer      The {@link NerLayer} to read.
+   */
+  public GermEval2014NameSampleStream(final ObjectStream<String> lineStream,
+                                      final int types, final NerLayer layer) {
+    this.lineStream = lineStream;
+    this.types = types;
+    this.layer = layer;
+  }
+
+  /**
+   * Initializes a {@link GermEval2014NameSampleStream} that reads the input
+   * line by line, decoded as UTF-8.
+   *
+   * @param in    The {@link InputStreamFactory} for the input file.
+   * @param types A bit mask built from the {@code GENERATE_*_ENTITIES} constants
+   *              that selects the entity types to include in the Name Sample
+   *              object stream.
+   * @param layer The {@link NerLayer} to read.
+   * @throws IOException Thrown if IO errors occurred.
+   */
+  public GermEval2014NameSampleStream(final InputStreamFactory in, final int 
types,
+                                      final NerLayer layer) throws IOException 
{
+    this(new PlainTextByLineStream(in, StandardCharsets.UTF_8), types, layer);
+  }
+
+  /**
+   * Builds the {@link Span} for a finished entity.
+   *
+   * @param begin    The start index passed to the {@link Span}.
+   * @param end      The end index passed to the {@link Span}.
+   * @param beginTag The tag of the entity's first token; its type part
+   *                 determines the type of the span.
+   * @return A {@link Span} from {@code begin} to {@code end} carrying the mapped type.
+   * @throws InvalidFormatException Thrown if the type in {@code beginTag} is unknown.
+   */
+  static Span extract(final int begin, final int end, final String beginTag)
+      throws InvalidFormatException {
+    return new Span(begin, end, mapTagToType(beginTag));
+  }
+
+  /**
+   * Maps a prefixed GermEval tag such as {@code B-LOCderiv} to the span type
+   * name used in the resulting samples, e.g. {@code locationderiv}.
+   *
+   * @param tag A tag carrying a two-character prefix ({@code B-} or {@code I-}).
+   * @return The span type: the mapped base type followed by the optional
+   *         {@code deriv} or {@code part} suffix.
+   * @throws InvalidFormatException Thrown if the base type or suffix is unknown.
+   */
+  private static String mapTagToType(final String tag) throws InvalidFormatException {
+    // Drop the two-character prefix, then split "PERderiv" into "PER" + "deriv".
+    final String rawType = tag.substring(2);
+    final String base = rawType.length() < 3 ? rawType : rawType.substring(0, 3);
+    final String suffix = rawType.substring(base.length());
+
+    final String mappedBase;
+    if ("PER".equals(base)) {
+      mappedBase = "person";
+    } else if ("LOC".equals(base)) {
+      mappedBase = "location";
+    } else if ("ORG".equals(base)) {
+      mappedBase = "organization";
+    } else if ("OTH".equals(base)) {
+      mappedBase = "misc";
+    } else {
+      mappedBase = null;
+    }
+
+    final boolean knownSuffix =
+        suffix.isEmpty() || "deriv".equals(suffix) || "part".equals(suffix);
+
+    if (mappedBase == null || !knownSuffix) {
+      throw new InvalidFormatException("Unknown type: " + rawType);
+    }
+    return mappedBase + suffix;
+  }
+
+  /**
+   * Checks whether the entity type of a tag is selected in the {@code types} bit mask.
+   *
+   * @param tag The tag to check, e.g. {@code B-PER} or {@code I-ORGpart}.
+   * @return {@code true} if the tag is {@code B-}/{@code I-} prefixed and its
+   *         type starts with an enabled {@code PER}, {@code ORG}, {@code LOC}
+   *         or {@code OTH}; for any other tag, {@code true} only if it is
+   *         exactly {@code O}.
+   */
+  private boolean isTypeEnabled(final String tag) {
+    final boolean prefixed = tag.startsWith("B-") || tag.startsWith("I-");
+
+    if (prefixed && tag.length() >= 5) {
+      // The three characters after the prefix identify the main entity type.
+      final int flag = switch (tag.substring(2, 5)) {
+        case "PER" -> GENERATE_PERSON_ENTITIES;
+        case "ORG" -> GENERATE_ORGANIZATION_ENTITIES;
+        case "LOC" -> GENERATE_LOCATION_ENTITIES;
+        case "OTH" -> GENERATE_MISC_ENTITIES;
+        default -> 0;
+      };
+      if (flag != 0) {
+        return (types & flag) != 0;
+      }
+    }
+    return tag.equals("O");
+  }
+
+  /**
+   * Converts the tags of one sentence into entity {@link Span spans}.
+   * <p>
+   * Tags whose entity type is not selected in {@code types} are handled like
+   * {@code O}: they close an open span and never start or extend one.
+   *
+   * @param tags The tags of the selected layer, one per token.
+   * @return The spans found, in order of occurrence; never {@code null}.
+   * @throws IOException Thrown if a tag is neither {@code O} nor prefixed with
+   *                     {@code B-} or {@code I-}, or if a span starts with a
+   *                     tag whose entity type cannot be mapped.
+   */
+  private List<Span> convertTagsToSpans(final List<String> tags) throws IOException {
+    final List<Span> names = new ArrayList<>();
+
+    int beginIndex = -1;
+    int endIndex = -1;
+
+    for (int i = 0; i < tags.size(); i++) {
+      final String tag = tags.get(i);
+      final boolean begins = tag.startsWith("B-");
+      final boolean continues = tag.startsWith("I-");
+
+      // Validate before filtering; otherwise a malformed tag would be mistaken
+      // for a disabled entity type and silently dropped.
+      if (!begins && !continues && !tag.equals("O")) {
+        throw new IOException("Invalid tag: " + tag);
+      }
+
+      final boolean enabled = (begins || continues) && isTypeEnabled(tag);
+
+      if (begins && enabled) {
+        // A new entity starts; flush the one that is still open, if any.
+        if (beginIndex != -1) {
+          names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+        }
+        beginIndex = i;
+        endIndex = i + 1;
+      } else if (continues && enabled) {
+        // Only extend an open span; a stray I- tag without a start is ignored.
+        if (beginIndex != -1) {
+          endIndex++;
+        }
+      } else if (beginIndex != -1) {
+        // "O" or a disabled entity type ends the open span.
+        names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+        beginIndex = -1;
+        endIndex = -1;
+      }
+    }
+
+    // if one span remains, create it here
+    if (beginIndex != -1) {
+      names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+    }
+
+    return names;
+  }
+
+  /**
+   * Reads the next sentence and converts it into a {@link NameSample}.
+   * <p>
+   * Lines are consumed up to the blank line that ends the sentence. Blank
+   * lines before the first token are skipped. A comment line starting with
+   * {@code #} marks the sample as clearing adaptive data; the marker is kept
+   * even if blank lines separate the comment from the sentence.
+   *
+   * @return The next {@link NameSample}, or {@code null} if the underlying
+   *         stream has no more sentences.
+   * @throws IOException Thrown if a data line has fewer than four
+   *                     tab-separated fields, a tag is invalid, or reading fails.
+   */
+  @Override
+  public NameSample read() throws IOException {
+
+    final List<String> sentence = new ArrayList<>();
+    final List<String> outerTags = new ArrayList<>();
+    final List<String> innerTags = new ArrayList<>();
+
+    boolean isClearAdaptiveData = false;
+
+    String line;
+    while ((line = lineStream.read()) != null) {
+
+      if (StringUtil.isEmpty(line)) {
+        if (sentence.isEmpty()) {
+          // Nothing collected yet: skip the blank line instead of recursing,
+          // so a pending document boundary marker is not lost.
+          continue;
+        }
+        // Empty line indicates end of sentence
+        break;
+      }
+
+      // Comment lines starting with # mark document boundaries
+      if (line.startsWith("#")) {
+        isClearAdaptiveData = true;
+        continue;
+      }
+
+      final String[] fields = line.split("\t");
+
+      if (fields.length < 4) {
+        throw new IOException("Expected at least four tab-separated fields per line "
+            + "in GermEval 2014 data, got " + fields.length + " for line '" + line + "'!");
+      }
+
+      sentence.add(fields[1]);
+      outerTags.add(fields[2]);
+      innerTags.add(fields[3].trim());
+    }
+
+    if (sentence.isEmpty()) {
+      // source stream is not returning anymore lines
+      return null;
+    }
+
+    final List<String> selectedTags = (layer == NerLayer.OUTER) ? outerTags : innerTags;
+    final List<Span> names = convertTagsToSpans(selectedTags);
+
+    return new NameSample(sentence.toArray(new String[0]),
+        names.toArray(new Span[0]), isClearAdaptiveData);
+  }
+
+  /**
+   * Forwards the reset request to the underlying line stream. This stream
+   * keeps no state between {@link #read()} calls beyond that line stream.
+   *
+   * @throws IOException Thrown if the underlying line stream fails to reset.
+   * @throws UnsupportedOperationException Declared for the underlying line
+   *         stream's {@code reset()}.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    lineStream.reset();
+  }
+
+  /**
+   * Closes the underlying line stream.
+   *
+   * @throws IOException Thrown if the underlying line stream fails to close.
+   */
+  @Override
+  public void close() throws IOException {
+    lineStream.close();
+  }
+}
diff --git 
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
new file mode 100644
index 00000000..760fb371
--- /dev/null
+++ 
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see GermEval2014NameSampleStream
+ */
+@Internal
+public class GermEval2014NameSampleStreamFactory extends
+    LanguageSampleStreamFactory<NameSample, 
GermEval2014NameSampleStreamFactory.Parameters> {
+
+  /**
+   * Command line parameters understood by this factory, in addition to the
+   * ones inherited from {@link BasicFormatParams}.
+   */
+  public interface Parameters extends BasicFormatParams {
+    /**
+     * @return The entity types to generate. The factory checks this value for
+     *         the substrings {@code per}, {@code org}, {@code loc} and {@code misc}.
+     */
+    @ParameterDescription(valueName = "per,loc,org,misc")
+    String getTypes();
+
+    /**
+     * @return The annotation layer to read, {@code outer} or {@code inner}.
+     */
+    // NOTE(review): create() falls back to the outer layer when this returns null,
+    // but the parameter is not marked optional here - confirm whether an
+    // optional-parameter annotation with a default was intended.
+    @ParameterDescription(valueName = "outer|inner", description = "NER 
annotation layer to use. " +
+        "Use 'outer' for top-level entities or 'inner' for nested/embedded 
entities.")
+    String getLayer();
+  }
+
+  /**
+   * Registers a new instance of this factory at the {@link StreamFactoryRegistry}
+   * for {@link NameSample} streams under the format name {@code germeval2014}.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class,
+        "germeval2014", new 
GermEval2014NameSampleStreamFactory(Parameters.class));
+  }
+
+  /**
+   * Initializes the factory with the parameter interface it is used with.
+   *
+   * @param params The {@link Parameters} interface class, passed on to the super class.
+   */
+  protected GermEval2014NameSampleStreamFactory(final Class<Parameters> 
params) {
+    super(params);
+  }
+
+  /**
+   * Creates a {@link GermEval2014NameSampleStream} from command line arguments.
+   * <p>
+   * The language is set to {@code deu}. The {@code -types} value is translated
+   * into the bit mask of entity types, and {@code -layer} selects the annotation
+   * layer; if no layer value is present, the outer layer is used.
+   *
+   * @param args The command line arguments.
+   * @return The stream over the samples of the given data file.
+   * @throws TerminateToolException Thrown if the layer value is unsupported or
+   *                                the input stream could not be created.
+   */
+  @Override
+  public ObjectStream<NameSample> create(final String[] args) {
+
+    final Parameters params = validateBasicFormatParameters(args, Parameters.class);
+
+    language = "deu";
+
+    // Translate the requested type names into the stream's bit mask.
+    final String requestedTypes = params.getTypes();
+    int typeMask = 0;
+    if (requestedTypes.contains("per")) {
+      typeMask |= GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES;
+    }
+    if (requestedTypes.contains("org")) {
+      typeMask |= GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
+    }
+    if (requestedTypes.contains("loc")) {
+      typeMask |= GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES;
+    }
+    if (requestedTypes.contains("misc")) {
+      typeMask |= GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+    }
+
+    // A missing layer value selects the outer layer.
+    final String layerName = params.getLayer();
+    final NerLayer selectedLayer = switch (layerName == null ? "outer" : layerName) {
+      case "outer" -> NerLayer.OUTER;
+      case "inner" -> NerLayer.INNER;
+      default -> throw new TerminateToolException(1, "Unsupported layer: " + layerName
+          + ". Use 'outer' or 'inner'.");
+    };
+
+    try {
+      return new GermEval2014NameSampleStream(
+          FormatUtil.createInputStreamFactory(params.getData()), typeMask, selectedLayer);
+    } catch (final IOException e) {
+      throw new TerminateToolException(-1,
+          "IO Error while creating an Input Stream: " + e.getMessage(), e);
+    }
+  }
+}
diff --git 
a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
 
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
new file mode 100644
index 00000000..c6528a06
--- /dev/null
+++ 
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class GermEval2014NameSampleStreamFactoryTest extends
+    AbstractSampleStreamFactoryTest<NameSample, 
GermEval2014NameSampleStreamFactory.Parameters> {
+
+  private static final String SAMPLE = "germeval2014.sample";
+
+  // SUT
+  private GermEval2014NameSampleStreamFactory factory;
+
+  private String sampleFileFullPath;
+
+  @Override
+  protected AbstractSampleStreamFactory<NameSample,
+      GermEval2014NameSampleStreamFactory.Parameters> getFactory() {
+    return factory;
+  }
+
+  @Override
+  protected String getDataFilePath() {
+    return sampleFileFullPath;
+  }
+
+  @BeforeAll
+  static void initEnv() {
+    GermEval2014NameSampleStreamFactory.registerFactory();
+  }
+
+  @BeforeEach
+  void setUp() {
+    final ObjectStreamFactory<NameSample, 
GermEval2014NameSampleStreamFactory.Parameters> f =
+        StreamFactoryRegistry.getFactory(NameSample.class, "germeval2014");
+    assertInstanceOf(GermEval2014NameSampleStreamFactory.class, f);
+    factory = ((GermEval2014NameSampleStreamFactory) f);
+    assertEquals(GermEval2014NameSampleStreamFactory.Parameters.class, 
factory.params);
+    sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR + 
SAMPLE).getPath();
+  }
+
+  @Test
+  void testCreateWithValidParameter() throws IOException {
+    try (final ObjectStream<NameSample> stream = factory.create(
+        new String[]{"-types", "per,loc,org,misc", "-layer", "outer",
+            "-data", sampleFileFullPath})) {
+      final NameSample sample = stream.read();
+      assertNotNull(sample);
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(strings = {"outer", "inner"})
+  void testCreateWithDifferentLayers(final String layer) throws IOException {
+    try (final ObjectStream<NameSample> stream = factory.create(
+        new String[]{"-types", "per,loc,org,misc", "-layer", layer,
+            "-data", sampleFileFullPath})) {
+      final NameSample sample = stream.read();
+      assertNotNull(sample);
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(strings = {"", "per", "loc", "org", "misc", "per,loc,org,misc"})
+  void testCreateWithDifferentTypes(final String types) throws IOException {
+    try (final ObjectStream<NameSample> stream = factory.create(
+        new String[]{"-types", types, "-layer", "outer", "-data", 
sampleFileFullPath})) {
+      final NameSample sample = stream.read();
+      assertNotNull(sample);
+    }
+  }
+
+  @Test
+  void testCreateWithInvalidLayer() {
+    assertThrows(TerminateToolException.class, () -> {
+      try (final ObjectStream<NameSample> stream = factory.create(
+          new String[]{"-types", "per,loc,org,misc", "-layer", "xyz",
+              "-data", sampleFileFullPath})) {
+        final NameSample sample = stream.read();
+        assertNotNull(sample);
+      }
+    });
+  }
+
+  /*
+   * Note: Overriding this test case, as more params are required!
+   */
+  @Test
+  @Override
+  protected void testCreateWithInvalidDataFilePath() {
+    assertThrows(TerminateToolException.class, () -> {
+      try (final ObjectStream<NameSample> stream = factory.create(new String[]
+          {"-types", "per,loc,org,misc", "-layer", "outer",
+              "-data", sampleFileFullPath + "xyz"})) {
+        final NameSample sample = stream.read();
+        assertNotNull(sample);
+      }
+    });
+  }
+}
diff --git 
a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
 
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
new file mode 100644
index 00000000..0118a313
--- /dev/null
+++ 
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Test for the {@link GermEval2014NameSampleStream} class.
+ */
+public class GermEval2014NameSampleStreamTest extends AbstractSampleStreamTest 
{
+
+  private static final String SAMPLE = "germeval2014.sample";
+
+  private static final int ALL_TYPES =
+      GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+
+  @Test
+  void testParsingSampleFirstSentence() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // First sentence: 14 tokens
+      Assertions.assertEquals(14, sample.getSentence().length);
+      Assertions.assertEquals("Schartau", sample.getSentence()[0]);
+      Assertions.assertEquals(".", sample.getSentence()[13]);
+
+      // Comment line means clear adaptive data
+      Assertions.assertTrue(sample.isClearAdaptiveDataSet());
+
+      // 4 outer entities: Schartau (PER), Tagesspiegel (ORG), Fischer (PER), 
Berlin (LOC)
+      Assertions.assertEquals(4, sample.getNames().length);
+
+      // Verify Schartau = PER at position 0
+      final Span schartau = findSpanAt(sample.getNames(), 0);
+      Assertions.assertNotNull(schartau);
+      Assertions.assertEquals("person", schartau.getType());
+      Assertions.assertEquals(0, schartau.getStart());
+      Assertions.assertEquals(1, schartau.getEnd());
+
+      // Verify Tagesspiegel = ORG at position 4
+      final Span tagesspiegel = findSpanAt(sample.getNames(), 4);
+      Assertions.assertNotNull(tagesspiegel);
+      Assertions.assertEquals("organization", tagesspiegel.getType());
+
+      // Verify Fischer = PER at position 9
+      final Span fischer = findSpanAt(sample.getNames(), 9);
+      Assertions.assertNotNull(fischer);
+      Assertions.assertEquals("person", fischer.getType());
+
+      // Verify Berlin = LOC at position 12
+      final Span berlin = findSpanAt(sample.getNames(), 12);
+      Assertions.assertNotNull(berlin);
+      Assertions.assertEquals("location", berlin.getType());
+    }
+  }
+
+  @Test
+  void testOuterLayerEntities() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      // Skip first sentence
+      sampleStream.read();
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // Second sentence: 13 tokens (Bayern München...)
+      Assertions.assertEquals(13, sample.getSentence().length);
+      Assertions.assertEquals("Bayern", sample.getSentence()[0]);
+      Assertions.assertEquals("München", sample.getSentence()[1]);
+      Assertions.assertTrue(sample.isClearAdaptiveDataSet());
+
+      // Outer layer: Bayern München (ORG), deutschen (LOCderiv) = 2 spans
+      Assertions.assertEquals(2, sample.getNames().length);
+
+      // Bayern München = ORG (0,2)
+      final Span org = findSpanAt(sample.getNames(), 0);
+      Assertions.assertNotNull(org);
+      Assertions.assertEquals("organization", org.getType());
+      Assertions.assertEquals(2, org.getEnd());
+
+      // deutschen = LOCderiv (10,11)
+      final Span locDeriv = findSpanAt(sample.getNames(), 10);
+      Assertions.assertNotNull(locDeriv);
+      Assertions.assertEquals("locationderiv", locDeriv.getType());
+      Assertions.assertEquals(11, locDeriv.getEnd());
+    }
+  }
+
+  @Test
+  void testInnerLayerEntities() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.INNER)) {
+      // Skip first sentence (all inner tags are O)
+      final NameSample first = sampleStream.read();
+      Assertions.assertNotNull(first);
+      Assertions.assertEquals(0, first.getNames().length);
+
+      // Second sentence has inner layer entities
+      final NameSample sample = sampleStream.read();
+      Assertions.assertNotNull(sample);
+
+      // Inner layer: Bayern (LOC), München (LOC) = 2 spans
+      Assertions.assertEquals(2, sample.getNames().length);
+
+      final Span bayernLoc = findSpanAt(sample.getNames(), 0);
+      Assertions.assertNotNull(bayernLoc);
+      Assertions.assertEquals("location", bayernLoc.getType());
+      Assertions.assertEquals(1, bayernLoc.getEnd());
+
+      final Span muenchenLoc = findSpanAt(sample.getNames(), 1);
+      Assertions.assertNotNull(muenchenLoc);
+      Assertions.assertEquals("location", muenchenLoc.getType());
+      Assertions.assertEquals(2, muenchenLoc.getEnd());
+    }
+  }
+
+  @Test
+  void testMiscEntityType() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      sampleStream.read(); // skip 1st
+      sampleStream.read(); // skip 2nd
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // Third sentence: "Ecce homo ist ein Werk ."
+      Assertions.assertEquals(6, sample.getSentence().length);
+      Assertions.assertFalse(sample.isClearAdaptiveDataSet());
+
+      // Ecce homo = OTH -> misc
+      Assertions.assertEquals(1, sample.getNames().length);
+      final Span oth = sample.getNames()[0];
+      Assertions.assertEquals("misc", oth.getType());
+      Assertions.assertEquals(0, oth.getStart());
+      Assertions.assertEquals(2, oth.getEnd());
+    }
+  }
+
+  @Test
+  void testPartEntityType() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      sampleStream.read(); // skip 1st
+      sampleStream.read(); // skip 2nd
+      sampleStream.read(); // skip 3rd
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // Fourth sentence: "ARD-Programmchef Volker Herres sagte ."
+      Assertions.assertEquals(5, sample.getSentence().length);
+
+      // ARD-Programmchef = ORGpart, Volker Herres = PER
+      Assertions.assertEquals(2, sample.getNames().length);
+
+      final Span orgPart = findSpanAt(sample.getNames(), 0);
+      Assertions.assertNotNull(orgPart);
+      Assertions.assertEquals("organizationpart", orgPart.getType());
+      Assertions.assertEquals(1, orgPart.getEnd());
+
+      final Span person = findSpanAt(sample.getNames(), 1);
+      Assertions.assertNotNull(person);
+      Assertions.assertEquals("person", person.getType());
+      Assertions.assertEquals(3, person.getEnd());
+    }
+  }
+
+  @Test
+  void testStreamExhaustion() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      sampleStream.read(); // 1st
+      sampleStream.read(); // 2nd
+      sampleStream.read(); // 3rd
+      sampleStream.read(); // 4th
+      Assertions.assertNull(sampleStream.read()); // end of stream
+    }
+  }
+
+  @Test
+  void testFilterPersonEntitiesOnly() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream =
+             openData(GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, 
NerLayer.OUTER)) {
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // Only PER entities from first sentence: Schartau, Fischer
+      Assertions.assertEquals(2, sample.getNames().length);
+      for (final Span name : sample.getNames()) {
+        Assertions.assertTrue(name.getType().startsWith("person"));
+      }
+    }
+  }
+
+  @Test
+  void testFilterLocationEntitiesOnly() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream =
+             openData(GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, 
NerLayer.OUTER)) {
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      // Only LOC entities from first sentence: Berlin
+      Assertions.assertEquals(1, sample.getNames().length);
+      Assertions.assertEquals("location", sample.getNames()[0].getType());
+    }
+  }
+
+  @Test
+  void testFilterNoEntities() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(0, 
NerLayer.OUTER)) {
+      final NameSample sample = sampleStream.read();
+
+      Assertions.assertNotNull(sample);
+      Assertions.assertEquals(0, sample.getNames().length);
+    }
+  }
+
+  @Test
+  void testReset() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      final NameSample sample = sampleStream.read();
+      sampleStream.reset();
+
+      Assertions.assertEquals(sample, sampleStream.read());
+    }
+  }
+
+  @Test
+  void testDocumentBoundaryClearsAdaptiveData() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, 
NerLayer.OUTER)) {
+      final NameSample first = sampleStream.read();
+      Assertions.assertTrue(first.isClearAdaptiveDataSet()); // has # comment
+
+      final NameSample second = sampleStream.read();
+      Assertions.assertTrue(second.isClearAdaptiveDataSet()); // has # comment
+
+      final NameSample third = sampleStream.read();
+      Assertions.assertFalse(third.isClearAdaptiveDataSet()); // no # comment
+
+      final NameSample fourth = sampleStream.read();
+      Assertions.assertFalse(fourth.isClearAdaptiveDataSet()); // no # comment
+    }
+  }
+
+  /**
+   * Reads the whole sample file on the outer layer and verifies the exact set
+   * of span types that is produced when all entity types are enabled.
+   */
+  @Test
+  void testAllEntityTypesPresent() throws IOException {
+    try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) {
+      final Set<String> foundTypes = new HashSet<>();
+      NameSample sample;
+      while ((sample = sampleStream.read()) != null) {
+        for (final Span name : sample.getNames()) {
+          foundTypes.add(name.getType());
+        }
+      }
+      // The sample contains PER, ORG, LOC, LOCderiv, OTH and ORGpart on the outer layer.
+      final Set<String> expectedTypes = new HashSet<>(Arrays.asList(
+          "person", "organization", "location", "locationderiv", "misc",
+          "organizationpart"));
+      Assertions.assertEquals(expectedTypes, foundTypes);
+    }
+  }
+
+  private ObjectStream<NameSample> openData(final int types, final NerLayer 
layer)
+      throws IOException {
+    return new GermEval2014NameSampleStream(getFactory(SAMPLE), types, layer);
+  }
+
+  /**
+   * Looks up the first span that begins at the given token index.
+   *
+   * @param spans The spans to search.
+   * @param start The start index to look for.
+   * @return The first matching span, or {@code null} if none starts there.
+   */
+  private Span findSpanAt(final Span[] spans, final int start) {
+    return Arrays.stream(spans)
+        .filter(candidate -> candidate.getStart() == start)
+        .findFirst()
+        .orElse(null);
+  }
+}
diff --git 
a/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
 
b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
new file mode 100644
index 00000000..b28ae382
--- /dev/null
+++ 
b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
@@ -0,0 +1,44 @@
+#      n-tv.de vom 26.02.2005  [2005-02-26]
+1      Schartau        B-PER   O
+2      sagte   O       O
+3      dem     O       O
+4      "       O       O
+5      Tagesspiegel    B-ORG   O
+6      "       O       O
+7      vom     O       O
+8      Freitag O       O
+9      ,       O       O
+10     Fischer B-PER   O
+11     sei     O       O
+12     in      O       O
+13     Berlin  B-LOC   O
+14     .       O       O
+
+#      stern.de vom 21.03.2006 [2006-03-21]
+1      Bayern  B-ORG   B-LOC
+2      München I-ORG   B-LOC
+3      ist     O       O
+4      wieder  O       O
+5      alleiniger      O       O
+6      Favorit O       O
+7      auf     O       O
+8      den     O       O
+9      Gewinn  O       O
+10     der     O       O
+11     deutschen       B-LOCderiv      O
+12     Fußball-Meisterschaft   O       O
+13     .       O       O
+
+1      Ecce    B-OTH   O
+2      homo    I-OTH   O
+3      ist     O       O
+4      ein     O       O
+5      Werk    O       O
+6      .       O       O
+
+1      ARD-Programmchef        B-ORGpart       O
+2      Volker  B-PER   O
+3      Herres  I-PER   O
+4      sagte   O       O
+5      .       O       O
+
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
 
b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
new file mode 100644
index 00000000..c67e84bc
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.formats.GermEval2014NameSampleStream;
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderEvaluator;
+import opennlp.tools.namefind.TokenNameFinderFactory;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Evaluates the name finder against the GermEval 2014 NER corpus (German).
+ * <p>
+ * Download the data files from the GermEval 2014 shared task
+ * <a href="https://sites.google.com/site/germeval2014ner/data">site</a>
+ * and place them into this directory: {@code $OPENNLP_DATA_DIR/germeval2014/}.
+ * <p>
+ * Expected files:
+ * <ul>
+ *   <li>{@code NER-de-train.tsv} - Training data</li>
+ *   <li>{@code NER-de-test.tsv} - Test data</li>
+ * </ul>
+ */
+public class GermEval2014NameFinderEval extends AbstractEvalTest {
+
+  /** Bit mask enabling person, organization, location and misc entity generation. */
+  private static final int ALL_TYPES =
+      GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES
+          | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+
+  private static File trainingFile;
+  private static File testFile;
+
+  /**
+   * Trains a name finder model from a GermEval 2014 file, reading the outer layer.
+   * <p>
+   * The sample stream is file-backed, so it is closed once training has finished.
+   *
+   * @param trainFile the training data file
+   * @param params the training parameters passed to {@link NameFinderME#train}
+   * @param types the entity-type mask passed to the sample stream
+   * @return the trained model
+   * @throws IOException if reading the data or closing the stream fails
+   */
+  private TokenNameFinderModel train(final File trainFile, final TrainingParameters params,
+                                     final int types) throws IOException {
+
+    try (final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream(
+        new MarkableFileInputStreamFactory(trainFile), types, NerLayer.OUTER)) {
+
+      return NameFinderME.train("deu", null, samples, params, new TokenNameFinderFactory());
+    }
+  }
+
+  /**
+   * Evaluates a model against a GermEval 2014 file (outer layer) and asserts that
+   * the resulting F-measure equals the expected one within {@code ACCURACY_DELTA}.
+   * <p>
+   * The sample stream is file-backed, so it is closed once evaluation has finished.
+   *
+   * @param model the model to evaluate
+   * @param testData the evaluation data file
+   * @param types the entity-type mask passed to the sample stream
+   * @param expectedFMeasure the F-measure the evaluation is expected to produce
+   * @throws IOException if reading the data or closing the stream fails
+   */
+  private void eval(final TokenNameFinderModel model, final File testData,
+                    final int types, final double expectedFMeasure) throws IOException {
+
+    try (final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream(
+        new MarkableFileInputStreamFactory(testData), types, NerLayer.OUTER)) {
+
+      final TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
+      evaluator.evaluate(samples);
+
+      Assertions.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), ACCURACY_DELTA);
+    }
+  }
+
+  /**
+   * Resolves the training and test files below the OpenNLP data directory and hands
+   * a stream over each, together with its expected value, to the inherited
+   * {@code verifyTrainingData} check.
+   */
+  @BeforeAll
+  static void verifyTrainingData() throws Exception {
+
+    trainingFile = new File(getOpennlpDataDir(), "germeval2014/NER-de-train.tsv");
+    testFile = new File(getOpennlpDataDir(), "germeval2014/NER-de-test.tsv");
+
+    verifyTrainingData(new GermEval2014NameSampleStream(
+            new MarkableFileInputStreamFactory(trainingFile),
+            ALL_TYPES, NerLayer.OUTER),
+        new BigInteger("175386258960384643455328517118707394452"));
+    verifyTrainingData(new GermEval2014NameSampleStream(
+            new MarkableFileInputStreamFactory(testFile),
+            ALL_TYPES, NerLayer.OUTER),
+        new BigInteger("112232325598196372951673841456976805014"));
+  }
+
+  // -- Person entity evaluation --
+
+  @Test
+  void evalPersonPerceptron() throws IOException {
+    final TrainingParameters params = createPerceptronParams();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, 0.6086631814787155d);
+  }
+
+  @Test
+  void evalPersonMaxentGis() throws IOException {
+    final TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES, 0.5204518893650175d);
+  }
+
+  // -- Organization entity evaluation --
+
+  @Test
+  void evalOrganizationPerceptron() throws IOException {
+    final TrainingParameters params = createPerceptronParams();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES, 0.5588235294117646d);
+  }
+
+  @Test
+  void evalOrganizationMaxentGis() throws IOException {
+    final TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES, 0.4594180704441041d);
+  }
+
+  // -- Location entity evaluation --
+
+  @Test
+  void evalLocationPerceptron() throws IOException {
+    final TrainingParameters params = createPerceptronParams();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, 0.6705613411226822d);
+  }
+
+  @Test
+  void evalLocationMaxentGis() throws IOException {
+    final TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES, 0.5537280701754386d);
+  }
+
+  // -- Misc (OTH) entity evaluation --
+
+  @Test
+  void evalMiscPerceptron() throws IOException {
+    final TrainingParameters params = createPerceptronParams();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES, 0.4482142857142857d);
+  }
+
+  @Test
+  void evalMiscMaxentGis() throws IOException {
+    final TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    final TokenNameFinderModel model = train(trainingFile, params,
+        GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES);
+
+    eval(model, testFile,
+        GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES, 0.3932267168391345d);
+  }
+
+  // -- Combined (all types) evaluation --
+
+  @Test
+  void evalCombinedPerceptron() throws IOException {
+    final TrainingParameters params = createPerceptronParams();
+
+    final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES);
+
+    eval(model, testFile, ALL_TYPES, 0.6016631636662707d);
+  }
+
+  @Test
+  void evalCombinedMaxentGis() throws IOException {
+    final TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+
+    final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES);
+
+    eval(model, testFile, ALL_TYPES, 0.5229054890631449d);
+  }
+}


Reply via email to