This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new ffb015a1 OPENNLP-1563 Fix tokenization of words containing non-spacing
letters. (#602)
ffb015a1 is described below
commit ffb015a14505b09b8ecc477a9d5d9d39f823b960
Author: Hrayr Matevosyan <[email protected]>
AuthorDate: Tue May 28 20:56:17 2024 +0930
OPENNLP-1563 Fix tokenization of words containing non-spacing letters.
(#602)
* Fix tokenization of words containing non-spacing letters.
* Provide a full sentence for a unit test on SimpleTokenizer.tokenize().
---------
Co-authored-by: hrayrm <[email protected]>
---
.../java/opennlp/tools/tokenize/SimpleTokenizer.java | 2 +-
.../opennlp/tools/tokenize/SimpleTokenizerTest.java | 18 ++++++++++++++++++
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index d545f95c..beeb5ba7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -73,7 +73,7 @@ public class SimpleTokenizer extends AbstractTokenizer {
if (StringUtil.isWhitespace(c)) {
charType = CharacterEnum.WHITESPACE;
}
- else if (Character.isLetter(c)) {
+ else if (Character.isAlphabetic(c)) {
charType = CharacterEnum.ALPHABETIC;
}
else if (Character.isDigit(c)) {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
index 3748c91a..d56d02d4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
@@ -128,4 +128,22 @@ public class SimpleTokenizerTest {
Assertions.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n",
"b", "\r", "\n", "\r", "\n", "c"},
tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
}
+
+ /**
+ * Tests if it can tokenize a word containing a non-spacing character
+ * like Arabic Damma Unicode Character “◌ُ” (U+064F)
+ */
+ @Test
+ void testNonSpacingLetters() {
+ String text = "تمّ طُوّر المشروع بنجاح."; //In Arabic: "The project was developed successfully."
+
+ String[] tokenizedText = mTokenizer.tokenize(text);
+
+ Assertions.assertEquals(5, tokenizedText.length);
+ Assertions.assertEquals("تمّ", tokenizedText[0]);
+ Assertions.assertEquals("طُوّر", tokenizedText[1]);
+ Assertions.assertEquals("المشروع", tokenizedText[2]);
+ Assertions.assertEquals("بنجاح", tokenizedText[3]);
+ Assertions.assertEquals(".", tokenizedText[4]);
+ }
}