Bar Perach created OPENNLP-772:
----------------------------------
Summary: Japanese end of sentence fix
Key: OPENNLP-772
URL: https://issues.apache.org/jira/browse/OPENNLP-772
Project: OpenNLP
Issue Type: Improvement
Components: Sentence Detector
Affects Versions: tools-1.5.3
Reporter: Bar Perach
The list of end-of-sentence characters was wrong for Japanese.
Also removed duplicated code: createEndOfSentenceScanner(String) now delegates to getEOSCharacters(languageCode).
Index: opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
===================================================================
--- opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
(revision 1678426)
+++ opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
(local)
@@ -36,14 +36,12 @@
public static final char[] thEosCharacters = new char[] { ' ','\n' };
+ // TODO add more sentence enders
+ public static final char[] jpEosCharacters = new char[] {'。', '!', '?'};
+
public EndOfSentenceScanner createEndOfSentenceScanner(String languageCode) {
- if ("th".equals(languageCode)) {
- return new DefaultEndOfSentenceScanner(new char[]{' ','\n'});
- } else if("pt".equals(languageCode)) {
- return new DefaultEndOfSentenceScanner(ptEosCharacters);
- }
- return new DefaultEndOfSentenceScanner(defaultEosCharacters);
+ return new DefaultEndOfSentenceScanner(getEOSCharacters(languageCode));
}
public EndOfSentenceScanner createEndOfSentenceScanner(
@@ -76,6 +74,8 @@
return thEosCharacters;
} else if ("pt".equals(languageCode)) {
return ptEosCharacters;
+ } else if ("jp".equals(languageCode)) {
+ return jpEosCharacters;
}
return defaultEosCharacters;
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)