[
https://issues.apache.org/jira/browse/OPENNLP-912?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17790966#comment-17790966
]
ASF GitHub Bot commented on OPENNLP-912:
----------------------------------------
rzo1 commented on code in PR #390:
URL: https://github.com/apache/opennlp/pull/390#discussion_r1408871740
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+ private String sentence;
+
+ private int start;
+
+ private int end;
+
+ private CharSequence text;
+
+ private Reader reader;
+
+ private int bufferLength;
+
+ private LanguageTool languageTool;
+
+ private Matcher beforeMatcher;
+
+ private Matcher afterMatcher;
+
+ boolean found;
Review Comment:
Should be private.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
+
+ public String clean(String text) {
+ for (Clean clean : cleanList) {
+ text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+ }
+ return text;
+ }
+
+ public void clear() {
+ if (cleanList != null) {
+ cleanList.clear();
+ }
+ }
+
+ /**
+ * TODO: Move rules into profiles
Review Comment:
Is this still valid? What is meant by this TODO?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Clean {
+
+ String regex;
Review Comment:
It might be worth using a precompiled `Pattern` here to avoid recompiling the regex on every
`replaceAll(...)` call.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
+
+ public String clean(String text) {
+ for (Clean clean : cleanList) {
+ text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+ }
+ return text;
+ }
+
+ public void clear() {
+ if (cleanList != null) {
+ cleanList.clear();
+ }
+ }
+
+ /**
+ * TODO: Move rules into profiles
+ */
+ public void rules() {
+
+ cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+ cleanList.add(new Clean("\\n \\n", "\n"));
+
+ cleanList.add(new Clean("\\n\\n", "\n"));
+
+ cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+ cleanList.add(new Clean("(?<=\\s)\\n", ""));
+ cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+ cleanList.add(new Clean("\\n", "\n"));
+ cleanList.add(new Clean("\\\\n", "\n"));
+ cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+ cleanList.add(new
Clean("\\{b\\^>\\d*<b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+ cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+// cleanList.add(new Clean("\\.{5,}", " "));
Review Comment:
Can we remove the commented-out lines?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
+
+ public String clean(String text) {
+ for (Clean clean : cleanList) {
+ text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+ }
+ return text;
+ }
+
+ public void clear() {
+ if (cleanList != null) {
Review Comment:
The list is never null
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Rule.java:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+/**
+ * Represents break or exception rule. Contains after break and before
+ * break patterns,
+ *
+ */
+public class Rule {
Review Comment:
Record?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Section.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Section {
+
+ int left;
Review Comment:
these should be private
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+ private static final Pattern STAR_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+ private static final Pattern PLUS_PATTERN = Pattern
+
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+ private static final Pattern RANGE_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+ private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+ /**
+ * Replaces block quotes in regular expressions with normal quotes. For
+ * example "\Qabc\E" will be replace with "\a\b\c".
+ *
+ * @param pattern
+ * @return pattern with replaced block quotes
+ */
+ public static String removeBlockQuotes(String pattern) {
Review Comment:
It might be worth adding a null check here; otherwise this might throw a NullPointerException.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+ private String sentence;
+
+ private int start;
+
+ private int end;
+
+ private CharSequence text;
+
+ private Reader reader;
+
+ private int bufferLength;
+
+ private LanguageTool languageTool;
+
+ private Matcher beforeMatcher;
+
+ private Matcher afterMatcher;
+
+ boolean found;
+
+ private Set<Integer> breakSections;
+
+ private List<Section> noBreakSections;
+
+ public SentenceTokenizerME(LanguageTool languageTool, CharSequence text) {
+ this.text = text;
+ this.reader = null;
+ this.bufferLength = text.length();
+ this.languageTool = languageTool;
+ this.sentence = null;
+ this.start = 0;
+ this.end = 0;
+ }
+
+ public SentenceTokenizerME(LanguageTool languageTool, Reader reader, int
bufferLength) {
+ if (bufferLength <= 0) {
+ throw new IllegalArgumentException("Buffer size: " + bufferLength +
+ " must be positive.");
+ }
+ this.text = null;
+ this.reader = reader;
+ this.bufferLength = bufferLength;
+ this.languageTool = languageTool;
+ this.sentence = null;
+ this.start = 0;
+ this.end = 0;
+ }
+
+ public List<String> sentenceTokenizer() {
+
+ List<String> sentenceList = new ArrayList<>();
+ CharSequence text = getText();
+ if (breakSections == null) {
+ getBreakSections();
+ }
+ for (Integer breakSection : breakSections) {
+ if (breakSection == 0) {
+ continue;
+ }
+ if (breakSection >= text.length()) {
+ break;
+ }
+ end = breakSection;
+ if (!isBreak()) {
+ continue;
+ }
+ sentence = text.subSequence(start, end).toString();
+ start = end;
+
+ sentence = removeSpace(sentence);
+ if (sentence != null) {
+ sentenceList.add(sentence);
+ }
+ }
+ if (end < text.length()) {
+ end = text.length();
+ sentence = text.subSequence(start, end).toString();
+ sentence = removeSpace(sentence);
+ if (sentence != null) {
+ sentenceList.add(sentence);
+ }
+ }
+ return sentenceList;
+ }
+
+ public String removeSpace(String segment) {
+ if (segment != null) {
+ int first = 0;
+ int last = segment.length();
+ while (first < segment.length() &&
StringUtil.isWhitespace(segment.charAt(first))) {
+ first++;
+ }
+ while (last > 0 && StringUtil.isWhitespace(segment.charAt(last - 1))) {
+ last--;
+ }
+ if (last - first > 0) {
+ return segment.substring(first, last);
+ }
+ }
+ return null;
+ }
+
+ public Set<Integer> getBreakSections() {
+ if (breakSections == null) {
+ breakSections = new TreeSet<Integer>();
+ for (Rule rule : languageTool.getBreakRuleList()) {
+
+ Pattern beforePattern = languageTool.compile(rule.getBeforePattern());
+ Pattern afterPattern = languageTool.compile(rule.getAfterPattern());
+ this.beforeMatcher = beforePattern.matcher(text);
+ this.afterMatcher = afterPattern.matcher(text);
+ this.found = true;
+ while (find()) {
+ breakSections.add(getBreakPosition());
+ }
+ }
+ }
+ return breakSections;
+ }
+
+ private boolean find() {
+ found = false;
+ while ((!found) && beforeMatcher.find()) {
+ afterMatcher.region(beforeMatcher.end(), text.length());
+ found = afterMatcher.lookingAt();
+ }
+ return found;
+ }
+
+ private int getBreakPosition() {
+ return afterMatcher.start();
+ }
+
+ public List<Section> getNoBreakSections() {
+ if (noBreakSections == null) {
+ noBreakSections = new ArrayList<Section>();
+ Pattern pattern = languageTool.getNoBreakPattern();
+ Matcher matcher = pattern.matcher(getText());
+ while (matcher.find()) {
+ noBreakSections.add(new Section(matcher.start(), matcher.end()));
+ }
+ }
+ return noBreakSections;
+ }
+
+ public CharSequence getText() {
+ if (text == null) {
+ text = read(bufferLength + 1);
+ }
+ return text;
+ }
+
+ private String read(int amount) {
+ char[] charBuffer = new char[amount];
+ int count = read(reader, charBuffer);
+
+ String result;
+ if (count == amount) {
+ result = new String(charBuffer, 0, count - 1);
+ } else if (count > 0 && count < amount) {
+ result = new String(charBuffer, 0, count);
+ } else {
+ result = "";
+ }
+
+ return result;
+ }
+
+ private int read(Reader reader, char[] buffer) {
+
+ int start = 0;
+ int count;
+
+ try {
+ while (true) {
+ if (!(((count = reader.read(buffer, start, buffer.length - start)) !=
-1)
+ && start < buffer.length)) {
+ break;
+ }
+ start += count;
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
Review Comment:
We shouldn't just print the stack trace: either rethrow it as a runtime exception
or at least log it.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
Review Comment:
+1
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Clean {
Review Comment:
Can this be a `record`?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
Review Comment:
Why is this `public`?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Represents rule for segmenting text in some language. Contains {@link Rule}
+ * list.
+ *
+ */
+public class LanguageRule {
Review Comment:
Record?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
+
+ public String clean(String text) {
+ for (Clean clean : cleanList) {
+ text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+ }
+ return text;
+ }
+
+ public void clear() {
+ if (cleanList != null) {
+ cleanList.clear();
+ }
+ }
+
+ /**
+ * TODO: Move rules into profiles
+ */
+ public void rules() {
+
+ cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+ cleanList.add(new Clean("\\n \\n", "\n"));
+
+ cleanList.add(new Clean("\\n\\n", "\n"));
+
+ cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+ cleanList.add(new Clean("(?<=\\s)\\n", ""));
+ cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+ cleanList.add(new Clean("\\n", "\n"));
+ cleanList.add(new Clean("\\\\n", "\n"));
+ cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+ cleanList.add(new
Clean("\\{b\\^>\\d*<b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+ cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+// cleanList.add(new Clean("\\.{5,}", " "));
+ cleanList.add(new Clean("\\/{3}", ""));
+
+// cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
+// cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
+
+ cleanList.add(new Clean("\\n(?=•')", "\r"));
+ cleanList.add(new Clean("''", "\""));
+ cleanList.add(new Clean("``", "\""));
+
+ }
+
+ public void html() {
Review Comment:
Maybe we can replace this pattern with a builder pattern (instead of
profiles), so one can do something like:
`new Cleaner.Builder().withDefaults().withHTML().withPDF().build()`
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageTool.java:
##########
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
+
+public class LanguageTool {
+
+ private LanguageRule languageRule;
+
+ private String languageName;
+
+ private Map<String, Object> cache;
+
+ public static final String MAX_LOOKBEHIND_LENGTH_PARAM =
"maxLookbehindLength";
+
+ public static final int DEFAULT_MAX_LOOKBEHIND_LENGTH = 100;
+
+ private int maxLookbehindLength;
+
+ private Map<String, Object> parameterMap;
+
+ private List<Rule> breakRuleList;
+
+ private Pattern noBreakPattern;
+
+ public LanguageTool(String languageName, LanguageRule languageRule) {
+ this(languageName, languageRule, Collections.emptyMap());
+ }
+
+ public LanguageTool(String languageName, LanguageRule languageRule,
Map<String, Object> paramMap) {
+ this.languageRule = languageRule;
+ this.languageName = languageName;
+ parameterMap = new HashMap<String, Object>(paramMap);
+ if (parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM) != null) {
+ this.maxLookbehindLength = (int)
parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM);
+ } else {
+ this.maxLookbehindLength = DEFAULT_MAX_LOOKBEHIND_LENGTH;
+ }
+ init();
+ }
+
+ private void init() {
+
+ this.cache = new ConcurrentHashMap<String, Object>();
+ this.breakRuleList = new ArrayList<Rule>();
+ StringBuilder noBreakPatternBuilder = new StringBuilder();
+
+ for (Rule rule : languageRule.getRuleList()) {
+
+ if (rule.isBreak()) {
+ breakRuleList.add(rule);
+ } else {
+ if (noBreakPatternBuilder.length() > 0) {
+ noBreakPatternBuilder.append('|');
+ }
+ String patternString = createNoBreakPatternString(rule);
+ noBreakPatternBuilder.append(patternString);
+ }
+ }
+
+ if (noBreakPatternBuilder.length() > 0) {
+ String noBreakPatternString = noBreakPatternBuilder.toString();
+ noBreakPattern = compile(noBreakPatternString);
+ } else {
+ noBreakPattern = null;
+ }
+
+ }
+
+ public Map<String, Object> getParameterMap() {
+ return parameterMap;
+ }
+
+ public LanguageRule getLanguageRule() {
+ return languageRule;
+ }
+
+ public String getLanguageName() {
+ return languageName;
+ }
+
+ public Map<String, Object> getCache() {
+ return cache;
+ }
+
+ public Pattern compile(String regex) {
Review Comment:
We might be able to simplify by using `computeIfAbsent`.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+ private static final Pattern STAR_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+ private static final Pattern PLUS_PATTERN = Pattern
+
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+ private static final Pattern RANGE_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+ private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+ /**
+ * Replaces block quotes in regular expressions with normal quotes. For
+ * example "\Qabc\E" will be replace with "\a\b\c".
+ *
+ * @param pattern
Review Comment:
missing description
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+ private static final Pattern STAR_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+ private static final Pattern PLUS_PATTERN = Pattern
+
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+ private static final Pattern RANGE_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+ private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+ /**
+ * Replaces block quotes in regular expressions with normal quotes. For
+ * example "\Qabc\E" will be replace with "\a\b\c".
Review Comment:
replaced
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizerME.java:
##########
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import opennlp.tools.util.StringUtil;
+
+
+public class SentenceTokenizerME implements SentenceTokenizer {
+
+ private String sentence;
+
+ private int start;
+
+ private int end;
+
+ private CharSequence text;
+
+ private Reader reader;
+
+ private int bufferLength;
+
+ private LanguageTool languageTool;
+
+ private Matcher beforeMatcher;
+
+ private Matcher afterMatcher;
+
+ boolean found;
+
+ private Set<Integer> breakSections;
+
+ private List<Section> noBreakSections;
+
+ public SentenceTokenizerME(LanguageTool languageTool, CharSequence text) {
Review Comment:
I wonder if we can rebuild this to avoid creating a Tokenizer for every
piece of text? Wouldn't it be of more value to provide the text as a method
parameter and compute the stuff on the fly? It would also allow us to make it
thread-safe in the future.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Section.java:
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+public class Section {
Review Comment:
record?
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/SentenceTokenizer.java:
##########
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.List;
+
+/**
+ * The interface for rule based sentence detector
+ */
+public interface SentenceTokenizer {
Review Comment:
+1 (and the method would require a proper description). It is totally
unclear what the provided method does/shall do from an implementor's perspective.
##########
opennlp-tools/src/test/java/opennlp/tools/sentdetect/segment/GoldenRulesTest.java:
##########
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/**
+ * Thanks for the GoldenRules of
+ * <a
href="https://github.com/diasks2/pragmatic_segmenter">pragmatic_segmenter</a>
+ */
+public class GoldenRulesTest {
+
+ public Cleaner cleaner = new Cleaner();
+
+ public List<String> segment(String text) {
+ if (cleaner != null) {
+ text = cleaner.clean(text);
+ }
+
+ InputStream inputStream = getClass().getResourceAsStream(
Review Comment:
We should close the stream, read it only once, and reuse the cached result for
every test run.
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/RuleUtil.java:
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Thank Jarek Lipski and
+ * <a href="https://github.com/loomchild/segment">segment</a>}
+ * for the inspiration for many of the design
+ * components of this detector.
+ */
+public class RuleUtil {
+
+ private static final Pattern STAR_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\*");
+
+ private static final Pattern PLUS_PATTERN = Pattern
+
.compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})(?<![\\?\\*\\+]|\\{[0-9],?[0-9]?\\}?\\})\\+");
+
+ private static final Pattern RANGE_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\{\\s*([0-9]+)\\s*,\\s*\\}");
+
+ private static final Pattern CAPTURING_GROUP_PATTERN = Pattern
+ .compile("(?<=(?<!\\\\)(?:\\\\\\\\){0,100})\\((?!\\?)");
+
+ /**
+ * Replaces block quotes in regular expressions with normal quotes. For
+ * example "\Qabc\E" will be replace with "\a\b\c".
+ *
+ * @param pattern
+ * @return pattern with replaced block quotes
+ */
+ public static String removeBlockQuotes(String pattern) {
+ StringBuilder patternBuilder = new StringBuilder();
+ boolean quote = false;
+ char previousChar = 0;
+
+ for (int i = 0; i < pattern.length(); ++i) {
+ char currentChar = pattern.charAt(i);
+
+ if (quote) {
+ if (previousChar == '\\' && currentChar == 'E') {
+ quote = false;
+ // Need to remove "\\" at the end as it has been added
+ // in previous iteration.
+ patternBuilder.delete(patternBuilder.length() - 2,
+ patternBuilder.length());
+ } else {
+ patternBuilder.append('\\');
+ patternBuilder.append(currentChar);
+ }
+ } else {
+ if (previousChar == '\\' && currentChar == 'Q') {
+ quote = true;
+ // Need to remove "\" at the end as it has been added
+ // in previous iteration.
+ patternBuilder.deleteCharAt(patternBuilder.length() - 1);
+ } else {
+ patternBuilder.append(currentChar);
+ }
+ }
+
+ previousChar = currentChar;
+ }
+
+ return patternBuilder.toString();
+ }
+
+ /**
+ * Changes unlimited length pattern to limited length pattern. It is done by
+ * replacing constructs with "*" and "+" symbols with their finite
+ * counterparts - "{0,n}" and {1,n}.
+ * As a side effect block quotes are replaced with normal quotes
+ * by using {@link #removeBlockQuotes(String)}.
+ *
+ * @param pattern pattern to be finitized
+ * @param infinity "n" number
+ * @return limited length pattern
+ */
+ public static String finitize(String pattern, int infinity) {
+ String finitePattern = removeBlockQuotes(pattern);
+
+ Matcher starMatcher = STAR_PATTERN.matcher(finitePattern);
+ finitePattern = starMatcher.replaceAll("{0," + infinity + "}");
+
+ Matcher plusMatcher = PLUS_PATTERN.matcher(finitePattern);
+ finitePattern = plusMatcher.replaceAll("{1," + infinity + "}");
+
+ Matcher rangeMatcher = RANGE_PATTERN.matcher(finitePattern);
+ finitePattern = rangeMatcher.replaceAll("{$1," + infinity + "}");
+
+ return finitePattern;
+ }
+
+ /**
+ * Replaces capturing groups with non-capturing groups in the given regular
+ * expression. As a side effect block quotes are replaced with normal quotes
+ * by using {@link #removeBlockQuotes(String)}.
+ *
+ * @param pattern
Review Comment:
missing description, see above regarding parameter validation
##########
opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+ public List<Clean> cleanList = new ArrayList<Clean>();
+
+ public String clean(String text) {
+ for (Clean clean : cleanList) {
+ text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+ }
+ return text;
+ }
+
+ public void clear() {
+ if (cleanList != null) {
+ cleanList.clear();
+ }
+ }
+
+ /**
+ * TODO: Move rules into profiles
+ */
+ public void rules() {
+
+ cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+ cleanList.add(new Clean("\\n \\n", "\n"));
+
+ cleanList.add(new Clean("\\n\\n", "\n"));
+
+ cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+ cleanList.add(new Clean("(?<=\\s)\\n", ""));
+ cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+ cleanList.add(new Clean("\\n", "\n"));
+ cleanList.add(new Clean("\\\\n", "\n"));
+ cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+ cleanList.add(new
Clean("\\{b\\^>\\d*<b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+ cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+// cleanList.add(new Clean("\\.{5,}", " "));
+ cleanList.add(new Clean("\\/{3}", ""));
+
+// cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
+// cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
+
+ cleanList.add(new Clean("\\n(?=•')", "\r"));
+ cleanList.add(new Clean("''", "\""));
+ cleanList.add(new Clean("``", "\""));
+
+ }
+
+ public void html() {
Review Comment:
I guess we should also allow users to add their own custom rules:
new
Cleaner.Builder().withDefaults().withHTML().withPDF().withCustomRules(rules).build()
##########
opennlp-tools/src/main/resources/opennlp/tools/sentdetect/segment/rules.xml:
##########
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="UTF-8"?>
Review Comment:
How did you generate this xml file? It does not seem to originate from
[the](https://github.com/diasks2/pragmatic_segmenter)
> Add a rule based sentence detector
> ----------------------------------
>
> Key: OPENNLP-912
> URL: https://issues.apache.org/jira/browse/OPENNLP-912
> Project: OpenNLP
> Issue Type: Improvement
> Reporter: Jörn Kottmann
> Priority: Major
> Labels: help-wanted
>
> It would be nice to offer a simpler rule based sentence detector, in some
> languages this might work rather well.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)