(opennlp) branch main updated: OPENNLP-1563 Fix tokenization of words containing non-spacing letters. (#602)

rzo1 Tue, 28 May 2024 04:26:28 -0700

This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new ffb015a1 OPENNLP-1563 Fix tokenization of words containing non-spacing letters. (#602)
ffb015a1 is described below

commit ffb015a14505b09b8ecc477a9d5d9d39f823b960
Author: Hrayr Matevosyan <[email protected]>
AuthorDate: Tue May 28 20:56:17 2024 +0930

    OPENNLP-1563 Fix tokenization of words containing non-spacing letters. (#602)
    
    * Fix tokenization of words containing non-spacing letters.
    
    * Provide a full sentence for a unit test on SimpleTokenizer.tokenize().
    
    ---------
    
    Co-authored-by: hrayrm <[email protected]>
---
 .../java/opennlp/tools/tokenize/SimpleTokenizer.java   |  2 +-
 .../opennlp/tools/tokenize/SimpleTokenizerTest.java    | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index d545f95c..beeb5ba7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -73,7 +73,7 @@ public class SimpleTokenizer extends AbstractTokenizer {
       if (StringUtil.isWhitespace(c)) {
         charType = CharacterEnum.WHITESPACE;
       }
-      else if (Character.isLetter(c)) {
+      else if (Character.isAlphabetic(c)) {
         charType = CharacterEnum.ALPHABETIC;
       }
       else if (Character.isDigit(c)) {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
index 3748c91a..d56d02d4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
@@ -128,4 +128,22 @@ public class SimpleTokenizerTest {
     Assertions.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
         tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
   }
+
+  /**
+   * Tests if it can tokenize a word containing a non-spacing character
+   * like Arabic Damma Unicode Character “◌ُ” (U+064F)
+   */
+  @Test
+  void testNonSpacingLetters() {
+    String text = "تمّ طُوّر المشروع بنجاح."; //In Arabic: "The project was developed successfully."
+
+    String[] tokenizedText = mTokenizer.tokenize(text);
+
+    Assertions.assertEquals(5, tokenizedText.length);
+    Assertions.assertEquals("تمّ", tokenizedText[0]);
+    Assertions.assertEquals("طُوّر", tokenizedText[1]);
+    Assertions.assertEquals("المشروع", tokenizedText[2]);
+    Assertions.assertEquals("بنجاح", tokenizedText[3]);
+    Assertions.assertEquals(".", tokenizedText[4]);
+  }
 }

(opennlp) branch main updated: OPENNLP-1563 Fix tokenization of words containing non-spacing letters. (#602)

Reply via email to