Bar Perach created OPENNLP-772:
----------------------------------

             Summary: Japanese end of sentence fix
                 Key: OPENNLP-772
                 URL: https://issues.apache.org/jira/browse/OPENNLP-772
             Project: OpenNLP
          Issue Type: Improvement
          Components: Sentence Detector
    Affects Versions: tools-1.5.3
            Reporter: Bar Perach


The end-of-sentence character list was wrong for Japanese.
Removed duplicate code.

Index: opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
===================================================================
--- opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java	(revision 1678426)
+++ opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java	(local)
@@ -36,14 +36,12 @@
 
   public static final char[] thEosCharacters = new char[] { ' ','\n' };
 
+  // TODO add more sentence enders
+  public static final char[] jpEosCharacters = new char[] {'。', '!', '?'};
+
   public EndOfSentenceScanner createEndOfSentenceScanner(String languageCode) {
-    if ("th".equals(languageCode)) {
-      return new DefaultEndOfSentenceScanner(new char[]{' ','\n'});
-    } else if("pt".equals(languageCode)) {
-      return new DefaultEndOfSentenceScanner(ptEosCharacters);
-    }
 
-    return new DefaultEndOfSentenceScanner(defaultEosCharacters);
+    return new DefaultEndOfSentenceScanner(getEOSCharacters(languageCode));
   }
 
   public EndOfSentenceScanner createEndOfSentenceScanner(
@@ -76,6 +74,8 @@
       return thEosCharacters;
     } else if ("pt".equals(languageCode)) {
       return ptEosCharacters;
+    } else if ("jp".equals(languageCode)) {
+      return jpEosCharacters;
     }
 
     return defaultEosCharacters;






--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to