This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 3dc72457 OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)
3dc72457 is described below
commit 3dc72457ac93d3d36560dd8c0665c89f8ec62661
Author: Martin Wiesner <[email protected]>
AuthorDate: Thu May 2 09:31:22 2024 +0200
OPENNLP-1555 TokenizerME should detect multi-dot abbreviations (#599)
---
.../java/opennlp/tools/tokenize/TokenizerME.java | 4 +++-
.../tools/tokenize/TokenizerFactoryTest.java | 22 +++++++++++++++++++++-
.../test/resources/opennlp/tools/lang/abb_DE.xml | 3 +++
3 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 5e8a1a2e..6664ef27 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -220,7 +220,9 @@ public class TokenizerME extends AbstractTokenizer {
if (isAcceptableAbbreviation(tok)) {
newTokens.add(new Span(start, end));
tokProbs.add(tokenProb);
- start = j + 1; // To compensate for the abbreviation dot
+ long numberOfDots = tok.codePoints().filter(ch -> ch == '.').count();
+ j = j + (int) numberOfDots; // To compensate for abbreviation dot(s)
+ start = j + 1;
} else {
newTokens.add(new Span(start, j));
tokProbs.add(tokenProb);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index 9adf5bdd..23d2ba7a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -228,10 +228,21 @@ public class TokenizerFactoryTest {
void testCustomPatternForTokenizerMEWithAbbreviationsDeu() throws IOException {
String lang = "deu";
String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$";
- String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
+ String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten " +
+ "Traum von der botanischen Monographie.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 14);
}
+ @Test // to verify OPENNLP-1555
+ void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDeu() throws IOException {
+ String lang = "deu";
+ String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$";
+ // Adds an extra "z.B.", the compact form of "z. B." (zum Beispiel => for example)
+ String sentence = "Ich wähle z.B. den auf S. 183 ff. mitgeteilten " +
+ "Traum von der botanischen Monographie.";
+ checkCustomPatternForTokenizerME(lang, pattern, sentence, 15);
+ }
+
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsDut() throws IOException {
String lang = "dut";
@@ -241,6 +252,15 @@ public class TokenizerFactoryTest {
checkCustomPatternForTokenizerME(lang, pattern, sentence, 18);
}
+ @Test // to verify OPENNLP-1555
+ void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDut() throws IOException {
+ String lang = "dut";
+ String pattern = "^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$";
+ String sentence = "Ik kies voor de droom van de botanische monografie die " +
+ "op p. 183 e.v. wordt beschreven.";
+ checkCustomPatternForTokenizerME(lang, pattern, sentence, 17);
+ }
+
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsFra() throws IOException {
String lang = "fra";
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
index 842603da..ac7f9589 100644
--- a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -32,4 +32,7 @@
<entry>
<token>z. B.</token>
</entry>
+ <entry>
+ <token>z.B.</token>
+ </entry>
</dictionary>