[ 
https://issues.apache.org/jira/browse/OPENNLP-1214?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16636312#comment-16636312
 ] 

ASF GitHub Bot commented on OPENNLP-1214:
-----------------------------------------

kojisekig closed pull request #329: OPENNLP-1214: use hash to avoid linear 
search in DefaultEndOfSentence…
URL: https://github.com/apache/opennlp/pull/329
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
index 75d0ec00c..2b8c0bee9 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
@@ -19,7 +19,9 @@
 package opennlp.tools.sentdetect;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 /**
  * Default implementation of the {@link EndOfSentenceScanner}.
@@ -28,7 +30,9 @@
  */
 public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
 
-  private char[] eosCharacters;
+  private Set<Character> eosCharacters;
+  @Deprecated
+  private char[] eosChars;
 
   /**
    * Initializes the current instance.
@@ -36,7 +40,11 @@
    * @param eosCharacters
    */
   public DefaultEndOfSentenceScanner(char[] eosCharacters) {
-    this.eosCharacters = eosCharacters;
+    this.eosCharacters = new HashSet<>();
+    for (char eosChar: eosCharacters) {
+      this.eosCharacters.add(eosChar);
+    }
+    this.eosChars = eosCharacters;
   }
 
   public List<Integer> getPositions(String s) {
@@ -49,19 +57,21 @@ public DefaultEndOfSentenceScanner(char[] eosCharacters) {
 
   public List<Integer> getPositions(char[] cbuf) {
     List<Integer> l = new ArrayList<>();
-    char[] eosCharacters = getEndOfSentenceCharacters();
     for (int i = 0; i < cbuf.length; i++) {
-      for (char eosCharacter : eosCharacters) {
-        if (cbuf[i] == eosCharacter) {
-          l.add(i);
-          break;
-        }
+      if (eosCharacters.contains(cbuf[i])) {
+        l.add(i);
       }
     }
     return l;
   }
 
+  @Deprecated
   public char[] getEndOfSentenceCharacters() {
+    return eosChars;
+  }
+
+  @Override
+  public Set<Character> getEOSCharacters() {
     return eosCharacters;
   }
 }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
index a29119beb..8c2822bd1 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -42,7 +43,7 @@
 
   private Set<String> inducedAbbreviations;
 
-  private char[] eosCharacters;
+  private Set<Character> eosCharacters;
 
   /**
    * Creates a new <code>SDContextGenerator</code> instance with
@@ -66,7 +67,10 @@ public DefaultSDContextGenerator(char[] eosCharacters) {
    */
   public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] 
eosCharacters) {
     this.inducedAbbreviations = inducedAbbreviations;
-    this.eosCharacters = eosCharacters;
+    this.eosCharacters = new HashSet<>();
+    for (char eosChar: eosCharacters) {
+      this.eosCharacters.add(eosChar);
+    }
     buf = new StringBuffer();
     collectFeats = new ArrayList<>();
   }
@@ -121,12 +125,9 @@ private static String escapeChar(Character c) {
     int c = position;
     { ///assign prefix, stop if you run into a period though otherwise stop at 
space
       while (--c > prefixStart) {
-        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
-          if (sb.charAt(c) == eosCharacters[eci]) {
-            prefixStart = c;
-            c++; // this gets us out of while loop.
-            break;
-          }
+        if (eosCharacters.contains(sb.charAt(c))) {
+          prefixStart = c;
+          c++; // this gets us out of while loop.
         }
       }
       prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim();
@@ -138,12 +139,9 @@ private static String escapeChar(Character c) {
     {
       c = position;
       while (++c < suffixEnd) {
-        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
-          if (sb.charAt(c) == eosCharacters[eci]) {
-            suffixEnd = c;
-            c--; // this gets us out of while loop.
-            break;
-          }
+        if (eosCharacters.contains(sb.charAt(c))) {
+          suffixEnd = c;
+          c--; // this gets us out of while loop.
         }
       }
     }
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
index b48ad3f32..7963e377f 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
@@ -18,6 +18,7 @@
 package opennlp.tools.sentdetect;
 
 import java.util.List;
+import java.util.Set;
 
 /**
  * Scans Strings, StringBuffers, and char[] arrays for the offsets of
@@ -34,8 +35,15 @@
    * Returns an array of character which can indicate the end of a sentence.
    * @return an array of character which can indicate the end of a sentence.
    */
+  @Deprecated
   char[] getEndOfSentenceCharacters();
 
+  /**
+   * Returns a set of character which can indicate the end of a sentence.
+   * @return a set of character which can indicate the end of a sentence.
+   */
+  Set<Character> getEOSCharacters();
+
   /**
    * The receiver scans the specified string for sentence ending characters and
    * returns their offsets.
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
new file mode 100644
index 000000000..f0104982b
--- /dev/null
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.sentdetect.lang.Factory;
+
+public class DefaultSDContextGeneratorTest {
+
+  @Test
+  public void testGetContext() throws Exception {
+    SDContextGenerator sdContextGenerator =
+        new DefaultSDContextGenerator(Collections.<String>emptySet(), 
Factory.defaultEosCharacters);
+
+    String[] context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+    
Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/v=/s=/n=Smith/ncap".split("/"), 
context);
+
+    context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 
29);
+    
Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/v=RONDHUIT/vcap/s=/n=as".split("/"),
 context);
+  }
+
+  @Test
+  public void testGetContextWithAbbreviations() throws Exception {
+    SDContextGenerator sdContextGenerator =
+        new DefaultSDContextGenerator(new 
HashSet<>(Arrays.asList("Mr./Inc.".split("/"))),
+            Factory.defaultEosCharacters);
+
+    String[] context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+    
Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/xabbrev/v=/s=/n=Smith/ncap".split("/"),
 context);
+
+    context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 
29);
+    
Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/xabbrev/v=RONDHUIT/vcap/s=/n=as".split("/"),
 context);
+  }
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> use hash to avoid linear search in DefaultEndOfSentenceScanner
> --------------------------------------------------------------
>
>                 Key: OPENNLP-1214
>                 URL: https://issues.apache.org/jira/browse/OPENNLP-1214
>             Project: OpenNLP
>          Issue Type: Improvement
>    Affects Versions: 1.9.0
>            Reporter: Koji Sekiguchi
>            Priority: Minor
>             Fix For: 1.9.1
>
>
> When DefaultEndOfSentenceScanner scans a sentence, it uses linear search to 
> check if each characters in the sentence is one of eos characters. I think 
> we'd better use HashSet to keep eosCharacters instead of char[].
> In accordance with this replacement, I'd like to make 
> getEndOfSentenceCharacters() deprecated because it returns char[] and nobody 
> in OpenNLP calls it at present, and I'd like to add the equivalent method 
> which returns Set<Character> of eos chars. Though it cannot keep the order of 
> eos chars but I don't think it can be a problem anyway.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to