This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch OPENNLP-1474_Create_tokenizer_factories_for_other_langs_(Spanish,_Italian,_.)
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit ec19211bda1fad70efd9590dd4d4c5bb7d191523 Author: Martin Wiesner <[email protected]> AuthorDate: Sat Mar 4 13:40:16 2023 +0100 OPENNLP-1474 Create tokenizer factories for other langs (Spanish, Italian, ...) - adds Spanish and Italian alphabet regex patterns to `...lang.Factory` - adjusts German pattern to include `é` and `É` to cover established loan words such as "Café" or "Cuvée" - adjusts `TokenizerFactoryTest` to use langCode "eng" instead of "spa" as Spanish will (now) return a specialized pattern --- .../java/opennlp/tools/tokenize/lang/Factory.java | 27 ++++++++++++++++++---- .../tools/tokenize/TokenizerFactoryTest.java | 10 ++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java index 9ec267a7..3099da73 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java @@ -30,9 +30,19 @@ public class Factory { private static final Pattern PORTUGUESE = Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$"); private static final Pattern FRENCH = Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$"); - // For reference: https://www.sttmedia.com/characterfrequency-dutch + // From: https://www.sttmedia.com/characterfrequency-dutch private static final Pattern DUTCH = Pattern.compile("^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$"); - private static final Pattern GERMAN = Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$"); + + // Note: The extra é and É are included to cover German "Lehnwörter" such as "Café" + private static final Pattern GERMAN = Pattern.compile("^[A-Za-z0-9äéöüÄÉÖÜß]+$"); + + // From: https://it.wikipedia.org/wiki/Alfabeto_italiano + private static final Pattern ITALIAN = Pattern.compile("^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$"); + + // From: https://en.wikiversity.org/wiki/Alphabet/Spanish_alphabet & + 
// https://en.wikipedia.org/wiki/Spanish_orthography#Alphabet_in_Spanish & + // https://www.fundeu.es/consulta/tilde-en-la-y-y-griega-o-ye-24786/ + private static final Pattern SPANISH = Pattern.compile("^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$"); /** * Gets the alphanumeric pattern for a language. @@ -43,18 +53,25 @@ public class Factory { */ public Pattern getAlphanumeric(String languageCode) { // For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes + if ("es".equals(languageCode) || "spa".equals(languageCode)) { + return SPANISH; + } + if ("it".equals(languageCode) || "ita".equals(languageCode)) { + return ITALIAN; + } if ("pt".equals(languageCode) || "por".equals(languageCode)) { return PORTUGUESE; } + if ("de".equals(languageCode) || "deu".equals(languageCode) || "ger".equals(languageCode)) { + return GERMAN; + } if ("fr".equals(languageCode) || "fre".equals(languageCode) || "fra".equals(languageCode)) { return FRENCH; } if ("nl".equals(languageCode) || "nld".equals(languageCode) || "dut".equals(languageCode)) { return DUTCH; } - if ("de".equals(languageCode) || "deu".equals(languageCode) || "ger".equals(languageCode)) { - return GERMAN; - } + return DEFAULT_ALPHANUMERIC; } diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java index 3a958229..930ab3a0 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java @@ -65,7 +65,7 @@ public class TokenizerFactoryTest { void testDefault() throws IOException { Dictionary dic = loadAbbDictionary(); - final String lang = "spa"; + final String lang = "eng"; TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null)); @@ -99,7 +99,7 @@ public class TokenizerFactoryTest { void testNullDict() throws IOException { Dictionary dic = null; - final String lang = "spa"; + final String 
lang = "eng"; TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null)); @@ -134,7 +134,7 @@ public class TokenizerFactoryTest { Dictionary dic = null; final String lang = "spa"; - String pattern = "^[0-9A-Za-z]+$"; + String pattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$"; TokenizerModel model = train(new TokenizerFactory(lang, dic, true, Pattern.compile(pattern))); @@ -167,7 +167,7 @@ public class TokenizerFactoryTest { void testDummyFactory() throws IOException { Dictionary dic = loadAbbDictionary(); - final String lang = "spa"; + final String lang = "eng"; String pattern = "^[0-9A-Za-z]+$"; TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true, @@ -199,7 +199,7 @@ public class TokenizerFactoryTest { @Test void testCreateDummyFactory() throws IOException { Dictionary dic = loadAbbDictionary(); - final String lang = "spa"; + final String lang = "eng"; String pattern = "^[0-9A-Za-z]+$"; TokenizerFactory factory = TokenizerFactory.create(
