This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1729-Switch-to-easier-loading-of-Models in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit ba88e6b036727b3f8cce685a32d5bd4276aa08c2 Author: Martin Wiesner <[email protected]> AuthorDate: Mon Apr 28 22:44:09 2025 +0200 OPENNLP-1729: Switch to easier loading of Models - adapts code in WSDHelper to use ClassPathModelProvider - adapts code in ParserChunker2MatcherProcessor to use ClassPathModelProvider - adapts code in NounPOSTagger and DefaultDocProcessor to use ClassPathModelProvider --- .../opennlp/tools/coref/AbstractCorefTest.java | 8 ++--- .../Muc6FullParseCorefSampleStreamFactoryTest.java | 3 +- opennlp-similarity/pom.xml | 35 ++++++++++++++++++---- .../ParserChunker2MatcherProcessor.java | 31 +++++++++---------- .../ParserPure2MatcherProcessor.java | 29 ++++++------------ opennlp-wsd/pom.xml | 25 ++++++++++++++++ .../opennlp/tools/disambiguator/WSDHelper.java | 24 ++++++++++----- pom.xml | 8 ++++- summarizer/pom.xml | 19 ++++++++++++ .../lexicalchaining/NounPOSTagger.java | 19 ++++++++---- .../preprocess/DefaultDocProcessor.java | 14 +++++---- 11 files changed, 149 insertions(+), 66 deletions(-) diff --git a/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java b/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java index 11869c8..bebcfdb 100644 --- a/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java +++ b/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java @@ -17,9 +17,6 @@ package opennlp.tools.coref; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; @@ -29,10 +26,13 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public abstract class AbstractCorefTest { private static final Logger logger = LoggerFactory.getLogger(AbstractCorefTest.class); - + private static final String BASE_URL_MODELS_V15 = "https://opennlp.sourceforge.net/models-1.5/"; protected static final Path OPENNLP_DIR = Paths.get(System.getProperty("user.home") + "/.opennlp/"); protected static final String MODEL_DIR = "/models/coref/en"; diff --git a/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java b/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java index 7f426e0..5f852b8 100644 --- a/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java +++ b/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java @@ -27,6 +27,7 @@ import org.junit.jupiter.api.Test; import opennlp.tools.coref.AbstractCorefTest; import opennlp.tools.coref.CorefSample; import opennlp.tools.coref.mention.Parse; +import opennlp.tools.models.ModelType; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.DownloadUtil; import opennlp.tools.util.ObjectStream; @@ -49,7 +50,7 @@ public class Muc6FullParseCorefSampleStreamFactoryTest extends AbstractCorefTest @BeforeAll public static void initEnv() throws IOException { Muc6FullParseCorefSampleStreamFactory.registerFactory(); - DownloadUtil.downloadModel("en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class); + DownloadUtil.downloadModel("en", ModelType.TOKENIZER, TokenizerModel.class); downloadVersion15Model(MODEL_PARSER); downloadVersion15Model(MODEL_NER_PER); downloadVersion15Model(MODEL_NER_ORG); diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index 8d774ee..2de39e5 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -37,6 +37,7 @@ <hdf5.version>1.14.3-1.5.10</hdf5.version> <javacpp.version>1.5.11</javacpp.version> <openblas.version>0.3.28-1.5.11</openblas.version> + <httpclient.version>4.5.14</httpclient.version> </properties> <repositories> @@ -59,12 +60,12 @@ <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> - <version>4.5.14</version> + <version>${httpclient.version}</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient-cache</artifactId> - <version>4.5.14</version> + <version>${httpclient.version}</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> @@ -74,12 +75,12 @@ <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpmime</artifactId> - <version>4.5.14</version> + <version>${httpclient.version}</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>fluent-hc</artifactId> - <version>4.5.14</version> + <version>${httpclient.version}</version> </dependency> </dependencies> </dependencyManagement> @@ -90,9 +91,31 @@ <artifactId>opennlp-tools</artifactId> </dependency> <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools-models</artifactId> + </dependency> + + <!-- Required English model resources at runtime --> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-tokenizer-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-pos-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sentdetect-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> </dependency> + <!-- End model resources --> + <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java index 97eda63..818c4ab 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java @@ -25,9 +25,15 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.cmdline.parser.ParserTool; +import opennlp.tools.models.ClassPathModelProvider; +import opennlp.tools.models.DefaultClassPathModelProvider; +import opennlp.tools.models.ModelType; import opennlp.tools.parser.AbstractBottomUpParser; import opennlp.tools.parser.Parse; import opennlp.tools.parser.Parser; @@ -49,13 +55,13 @@ import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.DownloadUtil; import opennlp.tools.util.Span; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class ParserChunker2MatcherProcessor { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider(); + static final int MIN_SENTENCE_LENGTH = 10; protected static ParserChunker2MatcherProcessor instance; @@ -89,32 +95,27 @@ public class ParserChunker2MatcherProcessor { } protected void initializeSentenceDetector() throws IOException { - SentenceModel model = DownloadUtil.downloadModel( - "en", DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); - sentenceDetector = new ThreadSafeSentenceDetectorME(model); + final SentenceModel sm = MODEL_PROVIDER.load("en", ModelType.SENTENCE_DETECTOR, SentenceModel.class); + sentenceDetector = new ThreadSafeSentenceDetectorME(sm); } protected void initializeTokenizer() throws IOException { - TokenizerModel model = DownloadUtil.downloadModel( - "en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class); - tokenizer = new ThreadSafeTokenizerME(model); + final TokenizerModel tm = MODEL_PROVIDER.load("en", ModelType.TOKENIZER, TokenizerModel.class); + tokenizer = new ThreadSafeTokenizerME(tm); } protected void initializePosTagger() throws IOException { - POSModel model = DownloadUtil.downloadModel( - "en", DownloadUtil.ModelType.POS, POSModel.class); - posTagger = new ThreadSafePOSTaggerME(model); + final POSModel pm = MODEL_PROVIDER.load("en", ModelType.POS_GENERIC, POSModel.class); + posTagger = new ThreadSafePOSTaggerME(pm); } protected void initializeParser() throws IOException { - ParserModel model = DownloadUtil.downloadModel( - "en", DownloadUtil.ModelType.PARSER, ParserModel.class); + ParserModel model = DownloadUtil.downloadModel("en", ModelType.PARSER, ParserModel.class); parser = ParserFactory.create(model); } private void initializeChunker() throws IOException { - ChunkerModel model = DownloadUtil.downloadModel( - "en", DownloadUtil.ModelType.CHUNKER, ChunkerModel.class); + ChunkerModel model = DownloadUtil.downloadModel("en", ModelType.CHUNKER, ChunkerModel.class); chunker = new ChunkerME(model); } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java index c5e5dca..c5db267 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java @@ -41,9 +41,7 @@ import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import opennlp.tools.textsimilarity.LemmaPair; import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; import opennlp.tools.textsimilarity.SentencePairMatchResult; import opennlp.tools.textsimilarity.TextProcessor; @@ -51,13 +49,13 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - protected static ParserPure2MatcherProcessor pinstance; + private static ParserPure2MatcherProcessor pInstance; public synchronized static ParserPure2MatcherProcessor getInstance() { - if (pinstance == null) - pinstance = new ParserPure2MatcherProcessor(); + if (pInstance == null) + pInstance = new ParserPure2MatcherProcessor(); - return pinstance; + return pInstance; } private ParserPure2MatcherProcessor() { @@ -71,8 +69,8 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor } } - public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence( - String sentence) { + @Override + public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(String sentence) { if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH) return null; @@ -118,25 +116,16 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor return listOfChunks; } + @Override public SentencePairMatchResult assessRelevance(String para1, String para2) { - - List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2); - - List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); - - ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); - List<List<ParseTreeChunk>> res = md - .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst); - return new SentencePairMatchResult(res, origChunks1); - + return super.assessRelevance(para1, para2); } public static void main(String[] args) throws Exception { ParserPure2MatcherProcessor parser = ParserPure2MatcherProcessor.getInstance(); String text = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "; - List<List<ParseTreeChunk>> res = parser - .formGroupedPhrasesFromChunksForPara(text); + List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text); System.out.println(res); String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. " diff --git a/opennlp-wsd/pom.xml b/opennlp-wsd/pom.xml index da3a81d..c8c3d2f 100644 --- a/opennlp-wsd/pom.xml +++ b/opennlp-wsd/pom.xml @@ -37,6 +37,31 @@ <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools-models</artifactId> + </dependency> + + <!-- Required English model resources at runtime --> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-tokenizer-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-pos-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-lemmatizer-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <!-- End model resources --> <dependency> <groupId>net.sf.extjwnl</groupId> diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java index 3613c0d..7ae0d26 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java @@ -35,12 +35,16 @@ import org.slf4j.LoggerFactory; import opennlp.tools.lemmatizer.Lemmatizer; import opennlp.tools.lemmatizer.LemmatizerModel; import opennlp.tools.lemmatizer.ThreadSafeLemmatizerME; +import opennlp.tools.models.ModelType; +import opennlp.tools.models.ClassPathModelProvider; +import opennlp.tools.models.DefaultClassPathModelProvider; +import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTagFormat; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.ThreadSafePOSTaggerME; import opennlp.tools.tokenize.ThreadSafeTokenizerME; import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.util.DownloadUtil; +import opennlp.tools.tokenize.TokenizerModel; /** * A helper class that loads and organizes resources, and provides helper methods @@ -52,6 +56,8 @@ public class WSDHelper { private static final Pattern NUMBERS_PATTERN = Pattern.compile(".*[0-9].*"); + private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider(); + private static Tokenizer tokenizer; private static POSTagger tagger; private static Lemmatizer lemmatizer; @@ -274,9 +280,9 @@ public class WSDHelper { private static Lemmatizer getLemmatizer(String lang) { if (lemmatizer == null) { try { - LemmatizerModel lm = DownloadUtil.downloadModel(lang, - DownloadUtil.ModelType.LEMMATIZER, LemmatizerModel.class); - lemmatizer = new ThreadSafeLemmatizerME(lm); + final LemmatizerModel lm = MODEL_PROVIDER.load( + lang, ModelType.LEMMATIZER, LemmatizerModel.class); + lemmatizer = new ThreadSafeLemmatizerME(lm); } catch (IOException e) { throw new RuntimeException("Error opening or loading a Lemmatizer from specified resource file!", e); } @@ -288,10 +294,11 @@ public class WSDHelper { return getTagger("en"); } - private static POSTagger getTagger(String language) { + private static POSTagger getTagger(String lang) { if (tagger == null) { try { - tagger = new ThreadSafePOSTaggerME(language, POSTagFormat.PENN); + final POSModel pm = MODEL_PROVIDER.load(lang, ModelType.POS_GENERIC, POSModel.class); + tagger = new ThreadSafePOSTaggerME(pm, POSTagFormat.PENN); } catch (IOException e) { throw new RuntimeException("Error opening or loading a Tokenizer for specified language!", e); } @@ -303,10 +310,11 @@ public class WSDHelper { return getTokenizer("en"); } - private static Tokenizer getTokenizer(String language) { + private static Tokenizer getTokenizer(String lang) { if (tokenizer == null) { try { - tokenizer = new ThreadSafeTokenizerME(language); + final TokenizerModel tm = MODEL_PROVIDER.load(lang, ModelType.TOKENIZER, TokenizerModel.class); + tokenizer = new ThreadSafeTokenizerME(tm); } catch (IOException e) { throw new RuntimeException("Error opening or loading a Tokenizer for specified language!", e); } diff --git a/pom.xml b/pom.xml index 750c8d2..b808ac8 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,8 @@ <maven.compiler.target>${java.version}</maven.compiler.target> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - <opennlp.tools.version>2.5.4</opennlp.tools.version> + <opennlp.tools.version>2.5.5-SNAPSHOT</opennlp.tools.version> + <opennlp.models.version>1.2.0</opennlp.models.version> <opennlp.forkCount>1.0C</opennlp.forkCount> <commons-beanutils.version>1.10.1</commons-beanutils.version> @@ -163,6 +164,11 @@ <groupId>${project.groupId}</groupId> <version>${opennlp.tools.version}</version> </dependency> + <dependency> + <artifactId>opennlp-tools-models</artifactId> + <groupId>${project.groupId}</groupId> + <version>${opennlp.tools.version}</version> + </dependency> <dependency> <artifactId>opennlp-tools</artifactId> diff --git a/summarizer/pom.xml b/summarizer/pom.xml index cab7aaf..4aeb1a2 100644 --- a/summarizer/pom.xml +++ b/summarizer/pom.xml @@ -49,6 +49,25 @@ <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools-models</artifactId> + </dependency> + + <!-- Required English model resources at runtime --> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-pos-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sentdetect-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>runtime</scope> + </dependency> + <!-- End model resources --> <dependency> <groupId>edu.mit</groupId> diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java index 2acc60b..63e5844 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java @@ -24,10 +24,13 @@ import java.util.List; import java.util.Map; import java.util.Set; +import opennlp.tools.models.ClassPathModelProvider; +import opennlp.tools.models.DefaultClassPathModelProvider; +import opennlp.tools.models.ModelType; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.postag.ThreadSafePOSTaggerME; import opennlp.tools.tokenize.WhitespaceTokenizer; -import opennlp.tools.util.DownloadUtil; /** * A {@link POSTagger} wrapper implementation that relies on an OpenNLP {@link POSTaggerME}. @@ -40,7 +43,9 @@ public class NounPOSTagger implements POSTagger { public static final String[] TAGS_NOUNS = {"NOUN", "NN", "NNS", "NNP", "NNPS"}; private static final Set<String> EOS_CHARS = Set.of(".", "?", "!"); - private final POSTaggerME tagger; + private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider(); + + private final ThreadSafePOSTaggerME tagger; private final Map<Integer, String[]> tagMap = new Hashtable<>(); /** @@ -56,8 +61,8 @@ public class NounPOSTagger implements POSTagger { throw new IllegalArgumentException("Parameter 'languageCode' must not be null"); // init Tag map tagMap.put(POSTagger.NOUN, TAGS_NOUNS); - POSModel posModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.POS, POSModel.class); - tagger = new POSTaggerME(posModel); + final POSModel pm = MODEL_PROVIDER.load(languageCode, ModelType.POS_GENERIC, POSModel.class); + tagger = new ThreadSafePOSTaggerME(pm); } /** @@ -105,8 +110,10 @@ public class NounPOSTagger implements POSTagger { */ @Override public List<String> getWordsOfType(String[] tokens, int type) { - if (tokens == null) throw new IllegalArgumentException("Parameter 'tokens' must not be null"); - if (type < 0 || type > PRONOUN) throw new IllegalArgumentException("Parameter 'type' must be in range [0, 4]"); + if (tokens == null) + throw new IllegalArgumentException("Parameter 'tokens' must not be null"); + if (type < 0 || type > PRONOUN) + throw new IllegalArgumentException("Parameter 'type' must be in range [0, 4]"); List<String> ret = new ArrayList<>(); for (String t : tokens) { diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java index a638d68..6b50a9a 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java @@ -31,19 +31,24 @@ import java.util.regex.Pattern; import opennlp.summarization.Sentence; import opennlp.summarization.DocProcessor; +import opennlp.tools.models.ClassPathModelProvider; +import opennlp.tools.models.DefaultClassPathModelProvider; +import opennlp.tools.models.ModelType; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; import opennlp.tools.stemmer.Stemmer; -import opennlp.tools.util.DownloadUtil; /** * Parses a document to sentences. */ public class DefaultDocProcessor implements DocProcessor { + + private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider(); + private static final String REGEX = "\"|'"; private final static Pattern REPLACEMENT_PATTERN = - Pattern.compile("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;"); + Pattern.compile("&#?[0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]?;"); // Sentence fragmentation to use.. private static final int OPEN_NLP = 1; @@ -65,10 +70,10 @@ public class DefaultDocProcessor implements DocProcessor { if (languageCode == null || languageCode.isBlank()) throw new IllegalArgumentException("Parameter 'languageCode' must not be null or blank"); stemmer = new PorterStemmer(); - sentModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); + sentModel = MODEL_PROVIDER.load(languageCode, ModelType.SENTENCE_DETECTOR, SentenceModel.class); } - // Str - Document or para + // Str - Document or paragraph // sentences - List containing returned sentences // iidx - if not null update with the words in the sentence + sent id // processedSent - Sentences after stemming and stopword removal.. @@ -123,7 +128,6 @@ public class DefaultDocProcessor implements DocProcessor { } } - /** * Reads a document's content from a file. *
