This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch opennlp-models in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 5d983820d6ba585a9b8d4a91167addb30d00913f Author: Richard Zowalla <[email protected]> AuthorDate: Tue Jun 11 11:32:20 2024 +0200 OPENNLP-1567 - OpenNLP Models: Provide a Finder / Loader Implementation --- NOTICE | 27 +++++ opennlp-tools-models/pom.xml | 95 ++++++++++++++++ .../java/opennlp/tools/models/ClassPathModel.java | 35 ++++++ .../opennlp/tools/models/ClassPathModelEntry.java | 24 ++++ .../opennlp/tools/models/ClassPathModelLoader.java | 56 ++++++++++ .../opennlp/tools/models/ClasspathModelFinder.java | 121 +++++++++++++++++++++ .../tools/models/ClassPathModelFinderTest.java | 63 +++++++++++ .../tools/models/ClassPathModelLoaderTest.java | 52 +++++++++ .../tools/models/ClassPathModelUsageTest.java | 80 ++++++++++++++ pom.xml | 5 +- 10 files changed, 557 insertions(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index f5f6c4fc..e4b36a21 100644 --- a/NOTICE +++ b/NOTICE @@ -99,3 +99,30 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. jackson-databind https://github.com/FasterXML/jackson-databind The Apache Software License, Version 2.0 + +=================================================================== + +classgraph +https://github.com/classgraph/classgraph + +The MIT License (MIT) + +Copyright (c) 2019 Luke Hutchison + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/opennlp-tools-models/pom.xml b/opennlp-tools-models/pom.xml new file mode 100644 index 00000000..86850cef --- /dev/null +++ b/opennlp-tools-models/pom.xml @@ -0,0 +1,95 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp</artifactId> + <version>2.3.4-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-tools-models</artifactId> + <packaging>jar</packaging> + <name>Apache OpenNLP Tools Models</name> + + <dependencies> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>io.github.classgraph</groupId> + <artifactId>classgraph</artifactId> + <version>${classgraph.version}</version> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-api</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-engine</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-params</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-simple</artifactId> + <scope>test</scope> + </dependency> + + <!-- models --> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-sentdetect-en</artifactId> + <version>${opennlp.models.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-langdetect</artifactId> + <version>${opennlp.models.version}</version> + <scope>test</scope> + </dependency> + + </dependencies> + +</project> \ No newline at end of file diff --git 
a/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModel.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModel.java new file mode 100644 index 00000000..39f9df2a --- /dev/null +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModel.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models; + +import java.util.Properties; + +public record ClassPathModel(Properties properties, byte[] model) { + + public String getModelVersion() { + return properties != null ? properties.getProperty("model.version", "unknown") : "unknown"; + } + + public String getModelName() { + return properties != null ? properties.getProperty("model.name", "unknown") : "unknown"; + } + + public String getModelSHA256() { + return properties != null ? 
properties.getProperty("model.sha256", "unknown") : "unknown"; + } + +} diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelEntry.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelEntry.java new file mode 100644 index 00000000..ef09f414 --- /dev/null +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelEntry.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models; + +import java.net.URI; +import java.util.Optional; + +public record ClassPathModelEntry(URI model, Optional<URI> properties) { + +} diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelLoader.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelLoader.java new file mode 100644 index 00000000..a236fa34 --- /dev/null +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClassPathModelLoader.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Objects; +import java.util.Properties; + +/** + * Responsible for loading OpenNLP models from the classpath. + */ +public class ClassPathModelLoader { + + /** + * Loads the model bytes and, if present, the properties referenced by a {@link ClassPathModelEntry}. + * + * @param entry must not be {@code null}; the same holds for its model and properties. + * @return a {@link ClassPathModel} containing the model resources. + * @throws IOException if a resource of the entry cannot be opened or read. 
+ */ + public ClassPathModel load(ClassPathModelEntry entry) throws IOException { + Objects.requireNonNull(entry, "entry must not be null"); + Objects.requireNonNull(entry.properties(), "entry.properties() must not be null"); + Objects.requireNonNull(entry.model(), "entry.model() must not be null"); + + final Properties properties = new Properties(); + + if (entry.properties().isPresent()) { + try (InputStream inputStream = entry.properties().get().toURL().openStream()) { + properties.load(inputStream); + } + } + + final byte[] model; + try (InputStream inputStream = entry.model().toURL().openStream()) { + model = inputStream.readAllBytes(); + } + + return new ClassPathModel(properties, model); + } +} diff --git a/opennlp-tools-models/src/main/java/opennlp/tools/models/ClasspathModelFinder.java b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClasspathModelFinder.java new file mode 100644 index 00000000..61d091f9 --- /dev/null +++ b/opennlp-tools-models/src/main/java/opennlp/tools/models/ClasspathModelFinder.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.models; + +import java.net.URI; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +import io.github.classgraph.ClassGraph; +import io.github.classgraph.ResourceList; +import io.github.classgraph.ScanResult; + + +/** + * Enables the detection of OpenNLP models in the classpath. + */ +public class ClasspathModelFinder { + + private static final String OPENNLP_MODEL_JAR_PREFIX = "opennlp-models-*.jar"; + private final String jarModelPrefix; + private Set<ClassPathModelEntry> models; + + /** + * By default, it scans for "opennlp-models-*.jar". + */ + public ClasspathModelFinder() { + this(OPENNLP_MODEL_JAR_PREFIX); + } + + /** + * @param modelJarPrefix The leaf names of the jars that should be scanned (e.g. "opennlp.jar"). + * May contain a wildcard glob ("opennlp-*.jar"). It must not be {@code null}. + */ + public ClasspathModelFinder(String modelJarPrefix) { + Objects.requireNonNull(modelJarPrefix, "modelJarPrefix must not be null"); + this.jarModelPrefix = modelJarPrefix; + } + + /** + * Finds OpenNLP models within the classpath; the result is cached until a reload is requested. + * + * @param reloadCache {@code true}, if the internal cache should explicitly be reloaded + * @return A Set of {@link ClassPathModelEntry ClassPathModelEntries}. It might be empty. 
+ */ + public Set<ClassPathModelEntry> findModels(boolean reloadCache) { + + if (this.models == null || reloadCache) { + try (ScanResult sr = new ClassGraph().acceptJars(jarModelPrefix).disableDirScanning().scan()) { + + final List<URI> classpathModels = getResourcesMatchingWildcard(sr, "*.bin"); + final List<URI> classPathProperties = getResourcesMatchingWildcard(sr, "model.properties"); + + this.models = new HashSet<>(); + + for (URI model : classpathModels) { + URI m = null; + for (URI prop : classPathProperties) { + if (jarPathsMatch(model, prop)) { + m = prop; + break; + } + } + this.models.add(new ClassPathModelEntry(model, Optional.ofNullable(m))); + + } + } + } + return this.models; + } + + private List<URI> getResourcesMatchingWildcard(final ScanResult sr, final String resourceWildcard) { + try (final ResourceList resources = sr.getResourcesMatchingWildcard(resourceWildcard)) { + return resources.getURIs(); + } + } + + private boolean jarPathsMatch(URI uri1, URI uri2) { + final String[] parts1 = parseJarURI(uri1); + final String[] parts2 = parseJarURI(uri2); + + if (parts1 == null || parts2 == null) { + return false; + } + + return parts1[0].equals(parts2[0]); + } + + private String[] parseJarURI(URI uri) { + try { + if ("jar".equals(uri.getScheme())) { + final String ssp = uri.getSchemeSpecificPart(); + final int separatorIndex = ssp.indexOf("!/"); + if (separatorIndex > 0) { + final String jarFileUri = ssp.substring(0, separatorIndex); + final String entryPath = ssp.substring(separatorIndex + 2); + return new String[] {jarFileUri, entryPath}; + } + } + } catch (Exception ignored) { /* best effort: treated like a non-jar URI, null is returned below */ + } + return null; + } + +} diff --git a/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelFinderTest.java b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelFinderTest.java new file mode 100644 index 00000000..683edcaf --- /dev/null +++ 
b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelFinderTest.java @@ -0,0 
+1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ClassPathModelFinderTest { + + @Test + public void testFindOpenNLPModels() { + final ClasspathModelFinder finder = new ClasspathModelFinder(); + + final Set<ClassPathModelEntry> models = finder.findModels(false); + assertNotNull(models); + assertEquals(2, models.size()); + + for (ClassPathModelEntry entry : models) { + assertNotNull(entry.model()); + assertNotNull(entry.properties()); + assertFalse(entry.properties().isEmpty()); + } + + // a second call without reload yields the same results + final Set<ClassPathModelEntry> reloadedModels = finder.findModels(false); + assertNotNull(reloadedModels); + assertEquals(models, reloadedModels); + + // a call that reloads the cache yields the same results as well + final Set<ClassPathModelEntry> cacheReloadedModels = finder.findModels(true); + assertNotNull(cacheReloadedModels); + 
assertEquals(models, cacheReloadedModels); + assertEquals(reloadedModels, 
cacheReloadedModels); + } + + @Test + public void testFindOpenNLPModelsCustomPrefix() { + final ClasspathModelFinder finder = new ClasspathModelFinder("wont-find-anything*"); + + final Set<ClassPathModelEntry> models = finder.findModels(false); + assertNotNull(models); + assertEquals(0, models.size()); + } +} diff --git a/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelLoaderTest.java b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelLoaderTest.java new file mode 100644 index 00000000..864fd293 --- /dev/null +++ b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelLoaderTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.models; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ClassPathModelLoaderTest { + + @Test + public void testLoadOpenNLPModel() throws Exception { + final ClasspathModelFinder finder = new ClasspathModelFinder("opennlp-models-langdetect-*.jar"); + + final Set<ClassPathModelEntry> models = finder.findModels(false); + assertNotNull(models); + assertEquals(1, models.size()); + + final ClassPathModelEntry entry = models.iterator().next(); + assertNotNull(entry); + assertNotNull(entry.model()); + assertNotNull(entry.properties()); + + // load the resources of the entry found above + final ClassPathModelLoader loader = new ClassPathModelLoader(); + + final ClassPathModel model = loader.load(entry); + assertNotNull(model); + assertNotNull(model.model()); + assertNotNull(model.properties()); + assertNotNull(model.getModelSHA256()); + assertNotNull(model.getModelName()); + assertNotNull(model.getModelVersion()); + } +} diff --git a/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelUsageTest.java b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelUsageTest.java new file mode 100644 index 00000000..d4c284aa --- /dev/null +++ b/opennlp-tools-models/src/test/java/opennlp/tools/models/ClassPathModelUsageTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.models; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import opennlp.tools.langdetect.LanguageDetector; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; +import opennlp.tools.sentdetect.SentenceDetector; +import opennlp.tools.sentdetect.SentenceDetectorME; +import opennlp.tools.sentdetect.SentenceModel; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class ClassPathModelUsageTest { + + @Test + public void testLanguageDetection() throws IOException { + + final ClassPathModel model = getClassPathModel("opennlp-models-langdetect-*.jar"); + final LanguageDetectorModel ldModel = new LanguageDetectorModel(new ByteArrayInputStream(model.model())); + assertNotNull(ldModel); + final LanguageDetector languageDetector = new LanguageDetectorME(ldModel); + assertNotNull(languageDetector); + + assertEquals("eng", + languageDetector.predictLanguage("The English language is pretty impressive.").getLang()); + + } + + @Test + public void testSentenceDetector() throws IOException { + + final ClassPathModel model = getClassPathModel("opennlp-models-sentdetect-*.jar"); + final SentenceModel sentenceModel = new SentenceModel(new ByteArrayInputStream(model.model())); + assertNotNull(sentenceModel); + final SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel); + 
assertNotNull(sentenceDetector); + 
+ + assertEquals(2, sentenceDetector.sentDetect("Pretty impressive stuff. I like it!").length); + + } + + private ClassPathModel getClassPathModel(String modelJarPrefix) throws IOException { + final ClasspathModelFinder finder = new ClasspathModelFinder(modelJarPrefix); + + final Set<ClassPathModelEntry> models = finder.findModels(false); + assertNotNull(models); + assertEquals(1, models.size()); /* the jar pattern must yield exactly one model entry */ + + final ClassPathModelEntry entry = models.iterator().next(); + assertNotNull(entry); + final ClassPathModelLoader loader = new ClassPathModelLoader(); + final ClassPathModel model = loader.load(entry); + assertNotNull(model); + assertNotNull(model.model()); + assertNotNull(model.properties()); + return model; + } +} diff --git a/pom.xml b/pom.xml index a406164e..4c8ae881 100644 --- a/pom.xml +++ b/pom.xml @@ -177,6 +177,8 @@ <slf4j.version>1.7.36</slf4j.version> <log4j2.version>2.23.1</log4j2.version> <jmh.version>1.37</jmh.version> + <classgraph.version>4.8.173</classgraph.version> + <opennlp.models.version>1.0.0-SNAPSHOT</opennlp.models.version> <opennlp.forkCount>1.0C</opennlp.forkCount> <coveralls.maven.plugin>4.3.0</coveralls.maven.plugin> <jacoco.maven.plugin>0.8.11</jacoco.maven.plugin> @@ -555,6 +557,7 @@ <module>opennlp-distr</module> <module>opennlp-dl</module> <module>opennlp-dl-gpu</module> - </modules> + <module>opennlp-tools-models</module> + </modules> </project>
