This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch OPENNLP-141_Tokenizers_alpha_numeric_optimization_only_recognizes_a-z_as_alpha_chars
in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit aa6a242dd53eb351956042c3f8a8fa3999aa1677 Author: Martin Wiesner <[email protected]> AuthorDate: Sun Feb 26 15:38:46 2023 +0100 OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars - adds pre-compiled alphanumeric patterns to `Factory` for French, Dutch, and German - removes long-time deprecated static field in `TokenizerME` - adjusts `TokenizerFactoryTest` to hold against pre-compiled default pattern - improves JavaDoc --- .../java/opennlp/tools/tokenize/TokenizerME.java | 7 ---- .../java/opennlp/tools/tokenize/lang/Factory.java | 39 +++++++++++++++++----- .../tools/tokenize/TokenizerFactoryTest.java | 13 ++++---- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java index a76f3a8c..88f73dde 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java @@ -85,13 +85,6 @@ public class TokenizerME extends AbstractTokenizer { */ public static final String NO_SPLIT = "F"; - /** - * Alpha-Numeric Pattern - * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)} - */ - @Deprecated - public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC); - private final Pattern alphanumeric; /* diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java index 171613a8..5de023f4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java @@ -25,24 +25,45 @@ import opennlp.tools.tokenize.TokenContextGenerator; public class Factory { - public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$"; + public static final Pattern DEFAULT_ALPHANUMERIC = 
Pattern.compile("^[A-Za-z0-9]+$"); + + private static final Pattern PORTOGUESE = Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$"); + private static final Pattern FRENCH = Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$"); + + // For reference: https://www.sttmedia.com/characterfrequency-dutch + private static final Pattern DUTCH = Pattern.compile("^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$"); + private static final Pattern GERMAN = Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$"); /** - * Gets the alphanumeric pattern for the language. Please save the value - * locally because this call is expensive. + * Gets the alphanumeric pattern for a language. * - * @param languageCode The language code. If {@code null}, or unknown, - * the default pattern will be returned. - * @return The alphanumeric pattern for the language or the default pattern. + * @param languageCode The ISO_639-1 code. If {@code null}, or unknown, the + * {@link #DEFAULT_ALPHANUMERIC} pattern will be returned. + * @return The alphanumeric {@link Pattern} for the language, or the default pattern. */ public Pattern getAlphanumeric(String languageCode) { + // For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes if ("pt".equals(languageCode) || "por".equals(languageCode)) { - return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$"); + return PORTOGUESE; } - - return Pattern.compile(DEFAULT_ALPHANUMERIC); + if ("fr".equals(languageCode) || "fre".equals(languageCode) || "fra".equals(languageCode)) { + return FRENCH; + } + if ("nl".equals(languageCode) || "nld".equals(languageCode) || "dut".equals(languageCode)) { + return DUTCH; + } + if ("de".equals(languageCode) || "deu".equals(languageCode) || "ger".equals(languageCode)) { + return GERMAN; + } + return DEFAULT_ALPHANUMERIC; } + /** + * Initializes a customized {@link TokenContextGenerator} via a set of {@code abbreviations}. + * + * @param languageCode The ISO_639-1 code to be used. 
+ * @param abbreviations The abbreviations to be used for new instance. + */ public TokenContextGenerator createTokenContextGenerator(String languageCode, Set<String> abbreviations) { return new DefaultTokenContextGenerator(abbreviations); } diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java index e759c854..3a958229 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java @@ -42,8 +42,7 @@ import opennlp.tools.util.TrainingParameters; */ public class TokenizerFactoryTest { - private static ObjectStream<TokenSample> createSampleStream() - throws IOException { + private static ObjectStream<TokenSample> createSampleStream() throws IOException { InputStreamFactory in = new ResourceAsStreamFactory( TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train"); @@ -74,7 +73,8 @@ public class TokenizerFactoryTest { Assertions.assertNotNull(factory.getAbbreviationDictionary()); Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); + String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern(); + Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern()); Assertions.assertEquals(lang, factory.getLanguageCode()); Assertions.assertEquals(lang, model.getLanguage()); Assertions.assertFalse(factory.isUseAlphaNumericOptimization()); @@ -89,7 +89,7 @@ public class TokenizerFactoryTest { Assertions.assertNotNull(factory.getAbbreviationDictionary()); Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); + Assertions.assertEquals(defaultPattern, 
factory.getAlphaNumericPattern().pattern()); Assertions.assertEquals(lang, factory.getLanguageCode()); Assertions.assertEquals(lang, model.getLanguage()); Assertions.assertFalse(factory.isUseAlphaNumericOptimization()); @@ -107,7 +107,8 @@ public class TokenizerFactoryTest { Assertions.assertNull(factory.getAbbreviationDictionary()); Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); + String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern(); + Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern()); Assertions.assertEquals(lang, factory.getLanguageCode()); Assertions.assertEquals(lang, model.getLanguage()); Assertions.assertFalse(factory.isUseAlphaNumericOptimization()); @@ -122,7 +123,7 @@ public class TokenizerFactoryTest { Assertions.assertNull(factory.getAbbreviationDictionary()); Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator); - Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern()); + Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern()); Assertions.assertEquals(lang, factory.getLanguageCode()); Assertions.assertEquals(lang, model.getLanguage()); Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
