This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch OPENNLP-1584-FeatureGeneratorUtil-shall-detect-German-umlauts-with-dot-as-'cp'
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 161279c6de84bca1013deeb49045d15cae91624a
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon Jul 1 10:43:15 2024 +0200

    OPENNLP-1584 FeatureGeneratorUtil shall detect German umlauts with dot as 'cp'
    - enhances the Pattern in FeatureGeneratorUtil
    - adds related test cases
    - improves JavaDoc of WindowFeatureGenerator along the path
---
 .../tools/util/featuregen/FeatureGeneratorUtil.java    |  5 +----
 .../tools/util/featuregen/WindowFeatureGenerator.java  | 15 +++++++++------
 .../util/featuregen/FeatureGeneratorUtilTest.java      | 18 ++++++++++++++++++
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
index e6b8af95..22373021 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
@@ -25,10 +25,7 @@ import java.util.regex.Pattern;
  */
 public class FeatureGeneratorUtil {
 
-  private static final String TOKEN_CLASS_PREFIX = "wc";
-  private static final String TOKEN_AND_CLASS_PREFIX = "w&c";
-
-  private static final Pattern capPeriod = Pattern.compile("^[A-Z]\\.$");
+  private static final Pattern capPeriod = Pattern.compile("^[A-ZÄÖÜ]\\.$");
 
   /**
    * Generates a class name for the specified token.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java
index d6359881..c58573aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java
@@ -22,13 +22,16 @@ import java.util.ArrayList;
 import java.util.List;
 
 /**
- * Generates previous and next features for a given {@link AdaptiveFeatureGenerator}.
- * The window size can be specified.
- * <p>
+ * Generates previous (left-sided) and next (right-sided) features for a
+ * given {@link AdaptiveFeatureGenerator}. The window size can be specified.
+ * <p><br/>
  * Features:
- * Current token is always included unchanged
- * Previous tokens are prefixed with p distance
- * Next tokens are prefix with n distance
+ * <ul>
+ * <li>Current token is always included unchanged,</li>
+ * <li>Previous tokens are prefixed with {@code p} distance,</li>
+ * <li>Next tokens are prefixed with {@code n} distance.</li>
+ * </ul>
+ * @see AdaptiveFeatureGenerator
  */
 public class WindowFeatureGenerator implements AdaptiveFeatureGenerator {
 
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
index 9655678b..cd35f092 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
@@ -50,6 +50,24 @@ public class FeatureGeneratorUtilTest {
     Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("#"));
     Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("%"));
     Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("&"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("§"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("^"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("°"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("("));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature(")"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("/"));
+    Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("\\"));
+  }
+
+  @Test
+  void testGerman() {
+    Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Änne"));
+    Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Özlem"));
+    Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Ümit"));
+    Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ä."));
+    Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ö."));
+    Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ü."));
+    Assertions.assertEquals("sc", FeatureGeneratorUtil.tokenFeature("Ü"));
   }
 
   @Test

Reply via email to