This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 98b8758 OPENNLP-1197: support Japanese letters in FeatureGeneratorUtil
98b8758 is described below
commit 98b8758e761caf659608f48fc5c9e9056f911050
Author: koji <[email protected]>
AuthorDate: Fri May 18 09:38:16 2018 +0900
OPENNLP-1197: support Japanese letters in FeatureGeneratorUtil
---
.../util/featuregen/FeatureGeneratorUtil.java | 10 +++-
.../tools/util/featuregen/StringPattern.java | 57 +++++++++++++++++++---
.../util/featuregen/FeatureGeneratorUtilTest.java | 19 ++++++++
.../tools/util/featuregen/StringPatternTest.java | 26 ++++++++++
4 files changed, 103 insertions(+), 9 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
index 79c2a50..e6b8af9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
@@ -34,6 +34,8 @@ public class FeatureGeneratorUtil {
* Generates a class name for the specified token.
* The classes are as follows where the first matching class is used:
* <ul>
+ * <li>jah - Japanese Hiragana</li>
+ * <li>jak - Japanese Katakana</li>
* <li>lc - lowercase alphabetic</li>
* <li>2d - two digits </li>
* <li>4d - four digits </li>
@@ -56,7 +58,13 @@ public class FeatureGeneratorUtil {
StringPattern pattern = StringPattern.recognize(token);
String feat;
- if (pattern.isAllLowerCaseLetter()) {
+ if (pattern.isAllHiragana()) {
+ feat = "jah";
+ }
+ else if (pattern.isAllKatakana()) {
+ feat = "jak";
+ }
+ else if (pattern.isAllLowerCaseLetter()) {
feat = "lc";
}
else if (pattern.digits() == 2) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
index eae7bc4..458912f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
@@ -27,13 +27,15 @@ public class StringPattern {
private static final int ALL_LOWERCASE_LETTER = 0x1 << 2;
private static final int ALL_LETTERS = 0x1 << 3;
private static final int ALL_DIGIT = 0x1 << 4;
- private static final int CONTAINS_PERIOD = 0x1 << 5;
- private static final int CONTAINS_COMMA = 0x1 << 6;
- private static final int CONTAINS_SLASH = 0x1 << 7;
- private static final int CONTAINS_DIGIT = 0x1 << 8;
- private static final int CONTAINS_HYPHEN = 0x1 << 9;
- private static final int CONTAINS_LETTERS = 0x1 << 10;
- private static final int CONTAINS_UPPERCASE = 0x1 << 11;
+ private static final int ALL_HIRAGANA = 0x1 << 5;
+ private static final int ALL_KATAKANA = 0x1 << 6;
+ private static final int CONTAINS_PERIOD = 0x1 << 7;
+ private static final int CONTAINS_COMMA = 0x1 << 8;
+ private static final int CONTAINS_SLASH = 0x1 << 9;
+ private static final int CONTAINS_DIGIT = 0x1 << 10;
+ private static final int CONTAINS_HYPHEN = 0x1 << 11;
+ private static final int CONTAINS_LETTERS = 0x1 << 12;
+ private static final int CONTAINS_UPPERCASE = 0x1 << 13;
private final int pattern;
@@ -46,7 +48,8 @@ public class StringPattern {
public static StringPattern recognize(String token) {
- int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS;
+ int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS
+ | ALL_HIRAGANA | ALL_KATAKANA;
int digits = 0;
@@ -83,6 +86,7 @@ public class StringPattern {
if (letterType == Character.DECIMAL_DIGIT_NUMBER) {
pattern |= CONTAINS_DIGIT;
+ pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
digits++;
} else {
pattern &= ~ALL_DIGIT;
@@ -109,6 +113,29 @@ public class StringPattern {
break;
}
}
+
+ // for Japanese...
+ final int codePoint = token.codePointAt(i);
+ final Character.UnicodeScript us = Character.UnicodeScript.of(codePoint);
+ if (us != Character.UnicodeScript.COMMON) {
+ if (us == Character.UnicodeScript.LATIN) {
+ pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
+ }
+ else if (us == Character.UnicodeScript.HAN) {
+ pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA | ALL_LOWERCASE_LETTER);
+ }
+ else if (us == Character.UnicodeScript.HIRAGANA) {
+ pattern &= ~(ALL_KATAKANA | ALL_LOWERCASE_LETTER);
+ }
+ else if (us == Character.UnicodeScript.KATAKANA) {
+ pattern &= ~(ALL_HIRAGANA | ALL_LOWERCASE_LETTER);
+ }
+ }
+ else {
+ if (ch == ',' || ch == '.' || ch == '?' || ch == '!') {
+ pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
+ }
+ }
}
return new StringPattern(pattern, digits);
@@ -150,6 +177,20 @@ public class StringPattern {
}
/**
+ * @return true if all chars are hiragana.
+ */
+ public boolean isAllHiragana() {
+ return (pattern & ALL_HIRAGANA) > 0;
+ }
+
+ /**
+ * @return true if all chars are katakana.
+ */
+ public boolean isAllKatakana() {
+ return (pattern & ALL_KATAKANA) > 0;
+ }
+
+ /**
* Retrieves the number of digits.
*/
public int digits() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
index cca0d83..7d7f233 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
@@ -41,5 +41,24 @@ public class FeatureGeneratorUtilTest {
Assert.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("A."));
Assert.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Mike"));
Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("somethingStupid"));
+
+ // symbols
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature(","));
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("."));
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("?"));
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("!"));
+ }
+
+ @Test
+ public void testJapanese() {
+ // Hiragana
+ Assert.assertEquals("jah", FeatureGeneratorUtil.tokenFeature("そういえば"));
+ Assert.assertEquals("jah", FeatureGeneratorUtil.tokenFeature("おーぷん・そ〜す・そふとうぇあ"));
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("あぱっち・そふとうぇあ財団"));
+
+ // Katakana
+ Assert.assertEquals("jak", FeatureGeneratorUtil.tokenFeature("ジャパン"));
+ Assert.assertEquals("jak", FeatureGeneratorUtil.tokenFeature("オープン・ソ〜ス・ソフトウェア"));
+ Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("アパッチ・ソフトウェア財団"));
}
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
index 187bb2f..75a7b8f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
@@ -29,6 +29,7 @@ public class StringPatternTest {
Assert.assertTrue(StringPattern.recognize("TesT").isAllLetter());
Assert.assertTrue(StringPattern.recognize("grün").isAllLetter());
Assert.assertTrue(StringPattern.recognize("üäöæß").isAllLetter());
+ Assert.assertTrue(StringPattern.recognize("あア亜Aa").isAllLetter());
}
@Test
@@ -37,6 +38,9 @@ public class StringPatternTest {
Assert.assertFalse(StringPattern.recognize("tEST").isInitialCapitalLetter());
Assert.assertTrue(StringPattern.recognize("TesT").isInitialCapitalLetter());
Assert.assertTrue(StringPattern.recognize("Üäöæß").isInitialCapitalLetter());
+ Assert.assertFalse(StringPattern.recognize("いイ井").isInitialCapitalLetter());
+ Assert.assertTrue(StringPattern.recognize("Iいイ井").isInitialCapitalLetter());
+ Assert.assertTrue(StringPattern.recognize("Iいイ井").isInitialCapitalLetter());
}
@Test
@@ -45,6 +49,8 @@ public class StringPatternTest {
Assert.assertTrue(StringPattern.recognize("ÄÄÄÜÜÜÖÖÖÖ").isAllCapitalLetter());
Assert.assertFalse(StringPattern.recognize("ÄÄÄÜÜÜÖÖä").isAllCapitalLetter());
Assert.assertFalse(StringPattern.recognize("ÄÄÄÜÜdÜÖÖ").isAllCapitalLetter());
+ Assert.assertTrue(StringPattern.recognize("ABC").isAllCapitalLetter());
+ Assert.assertFalse(StringPattern.recognize("うウ宇").isAllCapitalLetter());
}
@Test
@@ -56,6 +62,8 @@ public class StringPatternTest {
Assert.assertFalse(StringPattern.recognize("TEST").isAllLowerCaseLetter());
Assert.assertFalse(StringPattern.recognize("testT").isAllLowerCaseLetter());
Assert.assertFalse(StringPattern.recognize("tesÖt").isAllLowerCaseLetter());
+ Assert.assertTrue(StringPattern.recognize("abc").isAllLowerCaseLetter());
+ Assert.assertFalse(StringPattern.recognize("えエ絵").isAllLowerCaseLetter());
}
@Test
@@ -63,6 +71,21 @@ public class StringPatternTest {
Assert.assertTrue(StringPattern.recognize("123456").isAllDigit());
Assert.assertFalse(StringPattern.recognize("123,56").isAllDigit());
Assert.assertFalse(StringPattern.recognize("12356f").isAllDigit());
+ Assert.assertTrue(StringPattern.recognize("123456").isAllDigit());
+ }
+
+ @Test
+ public void testIsAllHiragana() {
+ Assert.assertTrue(StringPattern.recognize("あぱっち・るしーん").isAllHiragana());
+ Assert.assertFalse(StringPattern.recognize("あぱっち・そふとうぇあ財団").isAllHiragana());
+ Assert.assertFalse(StringPattern.recognize("あぱっち・るしーんV1.0").isAllHiragana());
+ }
+
+ @Test
+ public void testIsAllKatakana() {
+ Assert.assertTrue(StringPattern.recognize("アパッチ・ルシーン").isAllKatakana());
+ Assert.assertFalse(StringPattern.recognize("アパッチ・ソフトウェア財団").isAllKatakana());
+ Assert.assertFalse(StringPattern.recognize("アパッチ・ルシーンV1.0").isAllKatakana());
}
@Test
@@ -70,6 +93,7 @@ public class StringPatternTest {
Assert.assertEquals(6, StringPattern.recognize("123456").digits());
Assert.assertEquals(3, StringPattern.recognize("123fff").digits());
Assert.assertEquals(0, StringPattern.recognize("test").digits());
+ Assert.assertEquals(3, StringPattern.recognize("123fff").digits());
}
@Test
@@ -98,6 +122,8 @@ public class StringPatternTest {
Assert.assertTrue(StringPattern.recognize("test1").containsDigit());
Assert.assertTrue(StringPattern.recognize("23,5").containsDigit());
Assert.assertFalse(StringPattern.recognize("test./-,").containsDigit());
+ Assert.assertTrue(StringPattern.recognize("テスト1").containsDigit());
+ Assert.assertFalse(StringPattern.recognize("テストTEST").containsDigit());
}
@Test
--
To stop receiving notification emails like this one, please contact
[email protected].