(opennlp) branch main updated: OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)

mawiesne Thu, 02 May 2024 00:31:31 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new 3dc72457 OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)
3dc72457 is described below

commit 3dc72457ac93d3d36560dd8c0665c89f8ec62661
Author: Martin Wiesner <[email protected]>
AuthorDate: Thu May 2 09:31:22 2024 +0200

    OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)
---
 .../java/opennlp/tools/tokenize/TokenizerME.java   |  4 +++-
 .../tools/tokenize/TokenizerFactoryTest.java       | 22 +++++++++++++++++++++-
 .../test/resources/opennlp/tools/lang/abb_DE.xml   |  3 +++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 5e8a1a2e..6664ef27 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -220,7 +220,9 @@ public class TokenizerME extends AbstractTokenizer {
             if (isAcceptableAbbreviation(tok)) {
               newTokens.add(new Span(start, end));
               tokProbs.add(tokenProb);
-              start = j + 1; // To compensate for the abbreviation dot
+              long numberOfDots = tok.codePoints().filter(ch -> ch == '.').count();
+              j = j + (int) numberOfDots; // To compensate for abbreviation dot(s)
+              start = j + 1;
             } else {
               newTokens.add(new Span(start, j));
               tokProbs.add(tokenProb);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index 9adf5bdd..23d2ba7a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -228,10 +228,21 @@ public class TokenizerFactoryTest {
   void testCustomPatternForTokenizerMEWithAbbreviationsDeu() throws IOException {
     String lang = "deu";
     String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$";
-    String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
+    String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten " +
+            "Traum von der botanischen Monographie.";
     checkCustomPatternForTokenizerME(lang, pattern, sentence, 14);
   }
 
+  @Test // to verify OPENNLP-1555
+  void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDeu() throws IOException {
+    String lang = "deu";
+    String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$";
+    // Adds an extra "z.B.", the compact form of "z. B." (zum Beispiel => for example)
+    String sentence = "Ich wähle z.B. den auf S. 183 ff. mitgeteilten " +
+            "Traum von der botanischen Monographie.";
+    checkCustomPatternForTokenizerME(lang, pattern, sentence, 15);
+  }
+
   @Test
   void testCustomPatternForTokenizerMEWithAbbreviationsDut() throws IOException {
     String lang = "dut";
@@ -241,6 +252,15 @@ public class TokenizerFactoryTest {
     checkCustomPatternForTokenizerME(lang, pattern, sentence, 18);
   }
 
+  @Test // to verify OPENNLP-1555
+  void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDut() throws IOException {
+    String lang = "dut";
+    String pattern = "^[A-Za-z0-9äöüëèéïĳÄÖÜËÉÈÏĲ]+$";
+    String sentence = "Ik kies voor de droom van de botanische monografie die " +
+            "op p. 183 e.v. wordt beschreven.";
+    checkCustomPatternForTokenizerME(lang, pattern, sentence, 17);
+  }
+
   @Test
   void testCustomPatternForTokenizerMEWithAbbreviationsFra() throws IOException {
     String lang = "fra";
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
index 842603da..ac7f9589 100644
--- a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -32,4 +32,7 @@
   <entry>
     <token>z. B.</token>
   </entry>
+  <entry>
+    <token>z.B.</token>
+  </entry>
 </dictionary>

(opennlp) branch main updated: OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)

Reply via email to