This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 12ec3dba OPENNLP-1474 Create tokenizer factories for other langs (Spanish, Italian, ...) (#516)
12ec3dba is described below
commit 12ec3dba98688435b10fe585315b4def92f22328
Author: Martin Wiesner <[email protected]>
AuthorDate: Mon Mar 6 20:23:07 2023 +0100
OPENNLP-1474 Create tokenizer factories for other langs (Spanish, Italian, ...) (#516)
---
.../java/opennlp/tools/tokenize/lang/Factory.java | 40 +++++++++++++++++++---
.../tools/tokenize/TokenizerFactoryTest.java | 10 +++---
2 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
index 9ec267a7..ebb236cc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
@@ -30,10 +30,27 @@ public class Factory {
private static final Pattern PORTUGUESE =
Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
private static final Pattern FRENCH =
Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$");
- // For reference: https://www.sttmedia.com/characterfrequency-dutch
+ // From: https://www.sttmedia.com/characterfrequency-dutch
private static final Pattern DUTCH =
Pattern.compile("^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$");
- private static final Pattern GERMAN =
Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$");
+ // Note: The extra é and É are included to cover German "Lehnwörter" such as "Café"
+ private static final Pattern GERMAN =
Pattern.compile("^[A-Za-z0-9äéöüÄÉÖÜß]+$");
+
+ // From: https://en.wikipedia.org/wiki/Polish_alphabet
+ // https://pl.wikipedia.org/wiki/Alfabet_polski
+ private static final Pattern POLISH =
Pattern.compile("^[A-Za-z0-9żźćńółęąśŻŹĆĄŚĘŁÓŃ]+$");
+
+ // From: https://it.wikipedia.org/wiki/Alfabeto_italiano
+ private static final Pattern ITALIAN =
Pattern.compile("^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$");
+
+ // From: https://en.wikiversity.org/wiki/Alphabet/Spanish_alphabet &
+ // https://en.wikipedia.org/wiki/Spanish_orthography#Alphabet_in_Spanish &
+ // https://www.fundeu.es/consulta/tilde-en-la-y-y-griega-o-ye-24786/
+ private static final Pattern SPANISH =
Pattern.compile("^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$");
+
+ // From: https://en.wikipedia.org/wiki/Catalan_orthography#Spelling_patterns
+ private static final Pattern CATALAN =
Pattern.compile("^[0-9a-zàèéíïòóúüçA-ZÀÈÉÍÏÒÓÚÜÇ]+$");
+
/**
* Gets the alphanumeric pattern for a language.
*
@@ -43,18 +60,31 @@ public class Factory {
*/
public Pattern getAlphanumeric(String languageCode) {
// For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+ if ("es".equals(languageCode) || "spa".equals(languageCode)) {
+ return SPANISH;
+ }
+ if ("it".equals(languageCode) || "ita".equals(languageCode)) {
+ return ITALIAN;
+ }
if ("pt".equals(languageCode) || "por".equals(languageCode)) {
return PORTUGUESE;
}
+ if ("ca".equals(languageCode) || "cat".equals(languageCode)) {
+ return CATALAN;
+ }
+ if ("pl".equals(languageCode) || "pol".equals(languageCode)) {
+ return POLISH;
+ }
+ if ("de".equals(languageCode) || "deu".equals(languageCode) ||
"ger".equals(languageCode)) {
+ return GERMAN;
+ }
if ("fr".equals(languageCode) || "fre".equals(languageCode) ||
"fra".equals(languageCode)) {
return FRENCH;
}
if ("nl".equals(languageCode) || "nld".equals(languageCode) ||
"dut".equals(languageCode)) {
return DUTCH;
}
- if ("de".equals(languageCode) || "deu".equals(languageCode) ||
"ger".equals(languageCode)) {
- return GERMAN;
- }
+
return DEFAULT_ALPHANUMERIC;
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index 3a958229..930ab3a0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -65,7 +65,7 @@ public class TokenizerFactoryTest {
void testDefault() throws IOException {
Dictionary dic = loadAbbDictionary();
- final String lang = "spa";
+ final String lang = "eng";
TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
@@ -99,7 +99,7 @@ public class TokenizerFactoryTest {
void testNullDict() throws IOException {
Dictionary dic = null;
- final String lang = "spa";
+ final String lang = "eng";
TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
@@ -134,7 +134,7 @@ public class TokenizerFactoryTest {
Dictionary dic = null;
final String lang = "spa";
- String pattern = "^[0-9A-Za-z]+$";
+ String pattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$";
TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
Pattern.compile(pattern)));
@@ -167,7 +167,7 @@ public class TokenizerFactoryTest {
void testDummyFactory() throws IOException {
Dictionary dic = loadAbbDictionary();
- final String lang = "spa";
+ final String lang = "eng";
String pattern = "^[0-9A-Za-z]+$";
TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true,
@@ -199,7 +199,7 @@ public class TokenizerFactoryTest {
@Test
void testCreateDummyFactory() throws IOException {
Dictionary dic = loadAbbDictionary();
- final String lang = "spa";
+ final String lang = "eng";
String pattern = "^[0-9A-Za-z]+$";
TokenizerFactory factory = TokenizerFactory.create(