This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new a69184c1 OPENNLP-141 Tokenizers alphanumeric optimization only
recognizes a-z as alpha chars (#506)
a69184c1 is described below
commit a69184c1a8689ce804d24bce697ab5c0eba1c4cd
Author: Martin Wiesner <[email protected]>
AuthorDate: Thu Mar 2 07:09:45 2023 +0100
OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as
alpha chars (#506)
---
.../java/opennlp/tools/tokenize/TokenizerME.java | 7 ----
.../java/opennlp/tools/tokenize/lang/Factory.java | 39 +++++++++++++++++-----
.../tools/tokenize/TokenizerFactoryTest.java | 13 ++++----
3 files changed, 37 insertions(+), 22 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index a76f3a8c..88f73dde 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -85,13 +85,6 @@ public class TokenizerME extends AbstractTokenizer {
*/
public static final String NO_SPLIT = "F";
- /**
- * Alpha-Numeric Pattern
- * @deprecated As of release 1.5.2, replaced by {@link
Factory#getAlphanumeric(String)}
- */
- @Deprecated
- public static final Pattern alphaNumeric =
Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);
-
private final Pattern alphanumeric;
/*
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
index 171613a8..9ec267a7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
@@ -25,24 +25,45 @@ import opennlp.tools.tokenize.TokenContextGenerator;
public class Factory {
- public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$";
+ public static final Pattern DEFAULT_ALPHANUMERIC =
Pattern.compile("^[A-Za-z0-9]+$");
+
+ private static final Pattern PORTUGUESE =
Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+ private static final Pattern FRENCH =
Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$");
+
+ // For reference: https://www.sttmedia.com/characterfrequency-dutch
+ private static final Pattern DUTCH =
Pattern.compile("^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$");
+ private static final Pattern GERMAN =
Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$");
/**
- * Gets the alphanumeric pattern for the language. Please save the value
- * locally because this call is expensive.
+ * Gets the alphanumeric pattern for a language.
*
- * @param languageCode The language code. If {@code null}, or unknown,
- * the default pattern will be returned.
- * @return The alphanumeric pattern for the language or the default pattern.
+ * @param languageCode The ISO 639-1 (two-letter) or ISO 639-2 (three-letter) code.
+ * If {@code null}, or unknown, the {@link #DEFAULT_ALPHANUMERIC} pattern will be
returned.
+ * @return The alphanumeric {@link Pattern} for the language, or the default
pattern.
*/
public Pattern getAlphanumeric(String languageCode) {
+ // For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
if ("pt".equals(languageCode) || "por".equals(languageCode)) {
- return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+ return PORTUGUESE;
}
-
- return Pattern.compile(DEFAULT_ALPHANUMERIC);
+ if ("fr".equals(languageCode) || "fre".equals(languageCode) ||
"fra".equals(languageCode)) {
+ return FRENCH;
+ }
+ if ("nl".equals(languageCode) || "nld".equals(languageCode) ||
"dut".equals(languageCode)) {
+ return DUTCH;
+ }
+ if ("de".equals(languageCode) || "deu".equals(languageCode) ||
"ger".equals(languageCode)) {
+ return GERMAN;
+ }
+ return DEFAULT_ALPHANUMERIC;
}
+ /**
+ * Initializes a customized {@link TokenContextGenerator} via a set of
{@code abbreviations}.
+ *
+ * @param languageCode The ISO_639-1 code to be used.
+ * @param abbreviations The abbreviations to be used for the new instance.
+ */
public TokenContextGenerator createTokenContextGenerator(String
languageCode, Set<String> abbreviations) {
return new DefaultTokenContextGenerator(abbreviations);
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index e759c854..3a958229 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -42,8 +42,7 @@ import opennlp.tools.util.TrainingParameters;
*/
public class TokenizerFactoryTest {
- private static ObjectStream<TokenSample> createSampleStream()
- throws IOException {
+ private static ObjectStream<TokenSample> createSampleStream() throws
IOException {
InputStreamFactory in = new ResourceAsStreamFactory(
TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
@@ -74,7 +73,8 @@ public class TokenizerFactoryTest {
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof
DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC,
factory.getAlphaNumericPattern().pattern());
+ String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+ Assertions.assertEquals(defaultPattern,
factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -89,7 +89,7 @@ public class TokenizerFactoryTest {
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof
DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC,
factory.getAlphaNumericPattern().pattern());
+ Assertions.assertEquals(defaultPattern,
factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -107,7 +107,8 @@ public class TokenizerFactoryTest {
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof
DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC,
factory.getAlphaNumericPattern().pattern());
+ String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+ Assertions.assertEquals(defaultPattern,
factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -122,7 +123,7 @@ public class TokenizerFactoryTest {
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof
DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC,
factory.getAlphaNumericPattern().pattern());
+ Assertions.assertEquals(defaultPattern,
factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());