Re: [PR] OPENNLP-912: Rule based sentence detector (opennlp)

via GitHub Wed, 29 Nov 2023 00:13:05 -0800


rzo1 commented on code in PR #390:
URL: https://github.com/apache/opennlp/pull/390#discussion_r1408871740



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+  private String sentence;
+
+  private int start;
+
+  private int end;
+
+  private CharSequence text;
+
+  private Reader reader;
+
+  private int bufferLength;
+
+  private LanguageTool languageTool;
+
+  private Matcher beforeMatcher;
+
+  private Matcher afterMatcher;
+
+  boolean found;

Review Comment:
   Should be private.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {
+      cleanList.clear();
+    }
+  }
+
+  /**
+   * TODO: Move rules into profiles

Review Comment:
   Is this still valid? What is meant by this TODO?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Clean {
+
+  String regex;

Review Comment:
   Might be worth using a precompiled `Pattern` here to avoid recompiling the regex on every 
`replaceAll(...)` call.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {
+      cleanList.clear();
+    }
+  }
+
+  /**
+   * TODO: Move rules into profiles
+   */
+  public void rules() {
+
+    cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+    cleanList.add(new Clean("\\n \\n", "\n"));
+
+    cleanList.add(new Clean("\\n\\n", "\n"));
+
+    cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+    cleanList.add(new Clean("(?<=\\s)\\n", ""));
+    cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+    cleanList.add(new Clean("\\n", "\n"));
+    cleanList.add(new Clean("\\\\n", "\n"));
+    cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+    cleanList.add(new 
Clean("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+    cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+//    cleanList.add(new Clean("\\.{5,}", " "));

Review Comment:
   Can we remove the commented-out lines?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {

Review Comment:
   The list is never null, so this check is redundant.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Rule.java:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+/**
+ * Represents break or exception rule. Contains after break and before
+ * break patterns,
+ *
+ */
+public class Rule {

Review Comment:
   Could this be a `record`?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Section.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Section {
+
+  int left;

Review Comment:
   These should be private.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+  private static final Pattern STAR_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+  private static final Pattern PLUS_PATTERN = Pattern
+      
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+  private static final Pattern RANGE_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+  private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+  /**
+   * Replaces block quotes in regular expressions with normal quotes. For
+   * example "\Qabc\E" will be replace with "\a\b\c".
+   *
+   * @param pattern
+   * @return pattern with replaced block quotes
+   */
+  public static String removeBlockQuotes(String pattern) {

Review Comment:
   Might be worth adding a null check; otherwise this might throw a NullPointerException.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+  private String sentence;
+
+  private int start;
+
+  private int end;
+
+  private CharSequence text;
+
+  private Reader reader;
+
+  private int bufferLength;
+
+  private LanguageTool languageTool;
+
+  private Matcher beforeMatcher;
+
+  private Matcher afterMatcher;
+
+  boolean found;
+
+  private Set<Integer> breakSections;
+
+  private List<Section> noBreakSections;
+
+  public SentenceTokenizerME(LanguageTool languageTool, CharSequence text) {
+    this.text = text;
+    this.reader = null;
+    this.bufferLength = text.length();
+    this.languageTool = languageTool;
+    this.sentence = null;
+    this.start = 0;
+    this.end = 0;
+  }
+
+  public SentenceTokenizerME(LanguageTool languageTool, Reader reader, int 
bufferLength) {
+    if (bufferLength <= 0) {
+      throw new IllegalArgumentException("Buffer size: " + bufferLength +
+          " must be positive.");
+    }
+    this.text = null;
+    this.reader = reader;
+    this.bufferLength = bufferLength;
+    this.languageTool = languageTool;
+    this.sentence = null;
+    this.start = 0;
+    this.end = 0;
+  }
+
+  public List<String> sentenceTokenizer() {
+
+    List<String> sentenceList = new ArrayList<>();
+    CharSequence text = getText();
+    if (breakSections == null) {
+      getBreakSections();
+    }
+    for (Integer breakSection : breakSections) {
+      if (breakSection == 0) {
+        continue;
+      }
+      if (breakSection >= text.length()) {
+        break;
+      }
+      end = breakSection;
+      if (!isBreak()) {
+        continue;
+      }
+      sentence = text.subSequence(start, end).toString();
+      start = end;
+
+      sentence = removeSpace(sentence);
+      if (sentence != null) {
+        sentenceList.add(sentence);
+      }
+    }
+    if (end < text.length()) {
+      end = text.length();
+      sentence = text.subSequence(start, end).toString();
+      sentence = removeSpace(sentence);
+      if (sentence != null) {
+        sentenceList.add(sentence);
+      }
+    }
+    return sentenceList;
+  }
+
+  public String removeSpace(String segment) {
+    if (segment != null) {
+      int first = 0;
+      int last = segment.length();
+      while (first < segment.length() && 
StringUtil.isWhitespace(segment.charAt(first))) {
+        first++;
+      }
+      while (last > 0 && StringUtil.isWhitespace(segment.charAt(last - 1))) {
+        last--;
+      }
+      if (last - first > 0) {
+        return segment.substring(first, last);
+      }
+    }
+    return null;
+  }
+
+  public Set<Integer> getBreakSections() {
+    if (breakSections == null) {
+      breakSections = new TreeSet<Integer>();
+      for (Rule rule : languageTool.getBreakRuleList()) {
+
+        Pattern beforePattern = languageTool.compile(rule.getBeforePattern());
+        Pattern afterPattern = languageTool.compile(rule.getAfterPattern());
+        this.beforeMatcher = beforePattern.matcher(text);
+        this.afterMatcher = afterPattern.matcher(text);
+        this.found = true;
+        while (find()) {
+          breakSections.add(getBreakPosition());
+        }
+      }
+    }
+    return breakSections;
+  }
+
+  private boolean find() {
+    found = false;
+    while ((!found) && beforeMatcher.find()) {
+      afterMatcher.region(beforeMatcher.end(), text.length());
+      found = afterMatcher.lookingAt();
+    }
+    return found;
+  }
+
+  private int getBreakPosition() {
+    return afterMatcher.start();
+  }
+
+  public List<Section> getNoBreakSections() {
+    if (noBreakSections == null) {
+      noBreakSections = new ArrayList<Section>();
+      Pattern pattern = languageTool.getNoBreakPattern();
+      Matcher matcher = pattern.matcher(getText());
+      while (matcher.find()) {
+        noBreakSections.add(new Section(matcher.start(), matcher.end()));
+      }
+    }
+    return noBreakSections;
+  }
+
+  public CharSequence getText() {
+    if (text == null) {
+      text = read(bufferLength + 1);
+    }
+    return text;
+  }
+
+  private String read(int amount) {
+    char[] charBuffer = new char[amount];
+    int count = read(reader, charBuffer);
+
+    String result;
+    if (count == amount) {
+      result = new String(charBuffer, 0, count - 1);
+    } else if (count > 0 && count < amount) {
+      result = new String(charBuffer, 0, count);
+    } else {
+      result = "";
+    }
+
+    return result;
+  }
+
+  private int read(Reader reader, char[] buffer) {
+
+    int start = 0;
+    int count;
+
+    try {
+      while (true) {
+        if (!(((count = reader.read(buffer, start, buffer.length - start)) != 
-1)
+            && start < buffer.length)) {
+          break;
+        }
+        start += count;
+      }
+    } catch (IOException e) {
+      e.printStackTrace();

Review Comment:
   We shouldn't just print the stack trace: either rethrow as a runtime exception 
or at least log it.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {

Review Comment:
   +1



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Clean {

Review Comment:
   Can this be a `record`?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();

Review Comment:
   Why is this `public`?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Represents rule for segmenting text in some language. Contains {@link Rule}
+ * list.
+ *
+ */
+public class LanguageRule {

Review Comment:
   Could this be a `record`?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {
+      cleanList.clear();
+    }
+  }
+
+  /**
+   * TODO: Move rules into profiles
+   */
+  public void rules() {
+
+    cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+    cleanList.add(new Clean("\\n \\n", "\n"));
+
+    cleanList.add(new Clean("\\n\\n", "\n"));
+
+    cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+    cleanList.add(new Clean("(?<=\\s)\\n", ""));
+    cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+    cleanList.add(new Clean("\\n", "\n"));
+    cleanList.add(new Clean("\\\\n", "\n"));
+    cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+    cleanList.add(new 
Clean("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+    cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+//    cleanList.add(new Clean("\\.{5,}", " "));
+    cleanList.add(new Clean("\\/{3}", ""));
+
+//    cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
+//    cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
+
+    cleanList.add(new Clean("\\n(?=•')", "\r"));
+    cleanList.add(new Clean("''", "\""));
+    cleanList.add(new Clean("``", "\""));
+
+  }
+
+  public void html() {

Review Comment:
   Maybe we can replace this pattern with a builder pattern (instead of 
profiles), so one can do something like:
   
   `new Cleaner.Builder().withDefaults().withHTML().withPDF().build()` 



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageTool.java:
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
+
+public class LanguageTool {
+
+  private LanguageRule languageRule;
+
+  private String languageName;
+
+  private Map<String, Object> cache;
+
+  public static final String MAX_LOOKBEHIND_LENGTH_PARAM = 
"maxLookbehindLength";
+
+  public static final int DEFAULT_MAX_LOOKBEHIND_LENGTH = 100;
+
+  private int maxLookbehindLength;
+
+  private Map<String, Object> parameterMap;
+
+  private List<Rule> breakRuleList;
+
+  private Pattern noBreakPattern;
+
+  public LanguageTool(String languageName, LanguageRule languageRule) {
+    this(languageName, languageRule, Collections.emptyMap());
+  }
+
+  public LanguageTool(String languageName, LanguageRule languageRule, 
Map<String, Object> paramMap) {
+    this.languageRule = languageRule;
+    this.languageName = languageName;
+    parameterMap = new HashMap<String, Object>(paramMap);
+    if (parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM) != null) {
+      this.maxLookbehindLength = (int) 
parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM);
+    } else {
+      this.maxLookbehindLength = DEFAULT_MAX_LOOKBEHIND_LENGTH;
+    }
+    init();
+  }
+
+  private void init() {
+
+    this.cache = new ConcurrentHashMap<String, Object>();
+    this.breakRuleList = new ArrayList<Rule>();
+    StringBuilder noBreakPatternBuilder = new StringBuilder();
+
+    for (Rule rule : languageRule.getRuleList()) {
+
+      if (rule.isBreak()) {
+        breakRuleList.add(rule);
+      } else {
+        if (noBreakPatternBuilder.length() > 0) {
+          noBreakPatternBuilder.append('|');
+        }
+        String patternString = createNoBreakPatternString(rule);
+        noBreakPatternBuilder.append(patternString);
+      }
+    }
+
+    if (noBreakPatternBuilder.length() > 0) {
+      String noBreakPatternString = noBreakPatternBuilder.toString();
+      noBreakPattern = compile(noBreakPatternString);
+    } else {
+      noBreakPattern = null;
+    }
+
+  }
+
+  public Map<String, Object> getParameterMap() {
+    return parameterMap;
+  }
+
+  public LanguageRule getLanguageRule() {
+    return languageRule;
+  }
+
+  public String getLanguageName() {
+    return languageName;
+  }
+
+  public Map<String, Object> getCache() {
+    return cache;
+  }
+
+  public Pattern compile(String regex) {

Review Comment:
   We might be able to simplify by using `computeIfAbsent`.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+  private static final Pattern STAR_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+  private static final Pattern PLUS_PATTERN = Pattern
+      
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+  private static final Pattern RANGE_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+  private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+  /**
+   * Replaces block quotes in regular expressions with normal quotes. For
+   * example "\Qabc\E" will be replace with "\a\b\c".
+   *
+   * @param pattern

Review Comment:
   The `@param` description is missing.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment";>segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+  private static final Pattern STAR_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+  private static final Pattern PLUS_PATTERN = Pattern
+      
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+  private static final Pattern RANGE_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+  private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+  /**
+   * Replaces block quotes in regular expressions with normal quotes. For
+   * example "\Qabc\E" will be replace with "\a\b\c".

Review Comment:
   Typo: "replace" should be "replaced".



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+  private String sentence;
+
+  private int start;
+
+  private int end;
+
+  private CharSequence text;
+
+  private Reader reader;
+
+  private int bufferLength;
+
+  private LanguageTool languageTool;
+
+  private Matcher beforeMatcher;
+
+  private Matcher afterMatcher;
+
+  boolean found;
+
+  private Set<Integer> breakSections;
+
+  private List<Section> noBreakSections;
+
+  public SentenceTokenizerME(LanguageTool languageTool, CharSequence text) {

Review Comment:
   I wonder whether we can restructure this to avoid creating a new tokenizer 
for every piece of text. Wouldn't it be more valuable to pass the text as a 
method parameter and compute the results on the fly? That would also allow us 
to make it thread-safe in the future.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Section.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Section {

Review Comment:
   record?



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizer.java:
##########
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.List;
+
+/**
+ * The interface for rule based sentence detector
+ */
+public interface SentenceTokenizer {

Review Comment:
   +1 (and the method would require a proper description). From an 
implementor's perspective, it is totally unclear what the provided method does 
or is supposed to do.



##########
opennlp-tools/src/test/java/opennlp/tools/sentdetect/segment/GoldenRulesTest.java:
##########
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/**
+ * Thanks for the GoldenRules of
+ * <a 
href="https://github.com/diasks2/pragmatic_segmenter";>pragmatic_segmenter</a>
+ */
+public class GoldenRulesTest {
+
+  public Cleaner cleaner = new Cleaner();
+
+  public List<String> segment(String text) {
+    if (cleaner != null) {
+      text = cleaner.clean(text);
+    }
+
+    InputStream inputStream = getClass().getResourceAsStream(

Review Comment:
   We should close the stream, and also read it only once and reuse the cached 
result for every test run.



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment";>segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+  private static final Pattern STAR_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+  private static final Pattern PLUS_PATTERN = Pattern
+      
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+  private static final Pattern RANGE_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+  private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+      .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+  /**
+   * Replaces block quotes in regular expressions with normal quotes. For
+   * example "\Qabc\E" will be replace with "\a\b\c".
+   *
+   * @param pattern
+   * @return pattern with replaced block quotes
+   */
+  public static String removeBlockQuotes(String pattern) {
+    StringBuilder patternBuilder = new StringBuilder();
+    boolean quote = false;
+    char previousChar = 0;
+
+    for (int i = 0; i < pattern.length(); ++i) {
+      char currentChar = pattern.charAt(i);
+
+      if (quote) {
+        if (previousChar == '\\' && currentChar == 'E') {
+          quote = false;
+          // Need to remove "\\" at the end as it has been added
+          // in previous iteration.
+          patternBuilder.delete(patternBuilder.length() - 2,
+              patternBuilder.length());
+        } else {
+          patternBuilder.append('\\');
+          patternBuilder.append(currentChar);
+        }
+      } else {
+        if (previousChar == '\\' && currentChar == 'Q') {
+          quote = true;
+          // Need to remove "\" at the end as it has been added
+          // in previous iteration.
+          patternBuilder.deleteCharAt(patternBuilder.length() - 1);
+        } else {
+          patternBuilder.append(currentChar);
+        }
+      }
+
+      previousChar = currentChar;
+    }
+
+    return patternBuilder.toString();
+  }
+
+  /**
+   * Changes unlimited length pattern to limited length pattern. It is done by
+   * replacing constructs with "*" and "+" symbols with their finite
+   * counterparts - "{0,n}" and {1,n}.
+   * As a side effect block quotes are replaced with normal quotes
+   * by using {@link #removeBlockQuotes(String)}.
+   *
+   * @param pattern  pattern to be finitized
+   * @param infinity "n" number
+   * @return limited length pattern
+   */
+  public static String finitize(String pattern, int infinity) {
+    String finitePattern = removeBlockQuotes(pattern);
+
+    Matcher starMatcher = STAR_PATTERN.matcher(finitePattern);
+    finitePattern = starMatcher.replaceAll("{0," + infinity + "}");
+
+    Matcher plusMatcher = PLUS_PATTERN.matcher(finitePattern);
+    finitePattern = plusMatcher.replaceAll("{1," + infinity + "}");
+
+    Matcher rangeMatcher = RANGE_PATTERN.matcher(finitePattern);
+    finitePattern = rangeMatcher.replaceAll("{$1," + infinity + "}");
+
+    return finitePattern;
+  }
+
+  /**
+   * Replaces capturing groups with non-capturing groups in the given regular
+   * expression. As a side effect block quotes are replaced with normal quotes
+   * by using {@link #removeBlockQuotes(String)}.
+   *
+   * @param pattern

Review Comment:
   missing description, see above regarding parameter validation



##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {
+      cleanList.clear();
+    }
+  }
+
+  /**
+   * TODO: Move rules into profiles
+   */
+  public void rules() {
+
+    cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+    cleanList.add(new Clean("\\n \\n", "\n"));
+
+    cleanList.add(new Clean("\\n\\n", "\n"));
+
+    cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+    cleanList.add(new Clean("(?<=\\s)\\n", ""));
+    cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+    cleanList.add(new Clean("\\n", "\n"));
+    cleanList.add(new Clean("\\\\n", "\n"));
+    cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+    cleanList.add(new 
Clean("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+    cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+//    cleanList.add(new Clean("\\.{5,}", " "));
+    cleanList.add(new Clean("\\/{3}", ""));
+
+//    cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
+//    cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
+
+    cleanList.add(new Clean("\\n(?=•')", "\r"));
+    cleanList.add(new Clean("''", "\""));
+    cleanList.add(new Clean("``", "\""));
+
+  }
+
+  public void html() {

Review Comment:
   I guess we should also allow users to add their own custom rules, e.g.:
   
   new Cleaner.Builder().withDefaults().withHTML().withPDF().withCustomRules(rules).build()
   
   



##########
opennlp-tools/src/main/resources/opennlp/tools/sentdetect/segment/rules.xml:
##########
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>

Review Comment:
   How did you generate this XML file? It does not seem to originate from 
the [pragmatic_segmenter](https://github.com/diasks2/pragmatic_segmenter) project.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@opennlp.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] OPENNLP-912: Rule based sentence detector (opennlp)

Reply via email to