This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1661-Fix-custom-models-being-wiped-from-OpenNLP-user.home-directory in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 25fa67aaf9a71ceff6932e095f89f86e5f0d5106 Author: Martin Wiesner <[email protected]> AuthorDate: Mon Dec 2 20:30:19 2024 +0100 OPENNLP-1661 Fix custom models being wiped from OpenNLP user.home directory - deletes AbstractDownloadUtilTest.java removing historical code that wiped models - adds package-private DownloadUtil#existsModel(..) method to check models for certain language exist locally - adds to and adjusts related test classes --- .../main/java/opennlp/tools/util/DownloadUtil.java | 50 +++++++++++--- .../tools/util/AbstractDownloadUtilTest.java | 79 ---------------------- .../tools/util/DownloadUtilDownloadTwiceTest.java | 30 ++++---- .../java/opennlp/tools/util/DownloadUtilTest.java | 25 ++++++- 4 files changed, 81 insertions(+), 103 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java index 11b328da..bbfa2426 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java @@ -25,6 +25,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.NoSuchFileException; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; @@ -80,6 +81,41 @@ public class DownloadUtil { private static Map<String, Map<ModelType, String>> availableModels; + /** + * Checks if a model of the specified {@code modelType} has been downloaded already + * for a particular {@code language}. + * + * @param language The ISO language code of the requested model. + * @param modelType The {@link DownloadUtil.ModelType type} of model. + * @return {@code true} if a model exists locally, {@code false} otherwise. + * @throws IOException Thrown if IO errors occurred or the model is invalid. + */ + static boolean existsModel(String language, ModelType modelType) throws IOException { + Map<ModelType, String> modelsByLanguage = getAvailableModels().get(language); + if (modelsByLanguage == null) { + return false; + } else { + final String url = modelsByLanguage.get(modelType); + if (url != null) { + final Path homeDirectory = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME", + System.getProperty("user.home"))).resolve(".opennlp"); + final String filename = url.substring(url.lastIndexOf("/") + 1); + final Path localFile = Paths.get(homeDirectory.toString(), filename); + boolean exists; + try { + // if this does not throw the requested model exists AND is valid! + validateModel(new URL(url + ".sha512"), localFile); + exists = true; + } catch (NoSuchFileException e) { + exists = false; + } + return exists; + } else { + return false; + } + } + } + /** * Triggers a download for the specified {@link DownloadUtil.ModelType}. * @@ -94,7 +130,7 @@ public class DownloadUtil { Class<T> type) throws IOException { if (getAvailableModels().containsKey(language)) { - final String url = (getAvailableModels().get(language).get(modelType)); + final String url = getAvailableModels().get(language).get(modelType); if (url != null) { return downloadModel(new URL(url), type); } @@ -134,17 +170,14 @@ public class DownloadUtil { final Path localFile = Paths.get(homeDirectory.toString(), filename); if (!Files.exists(localFile)) { - logger.debug("Downloading model from {} to {}.", url, localFile); + logger.debug("Downloading model to {}.", localFile); try (final InputStream in = url.openStream()) { Files.copy(in, localFile, StandardCopyOption.REPLACE_EXISTING); } - validateModel(new URL(url + ".sha512"), localFile); - logger.debug("Download complete."); } else { - System.out.println("Model file already exists. Skipping download."); logger.debug("Model file '{}' already exists. Skipping download.", filename); } @@ -167,7 +200,7 @@ public class DownloadUtil { } /** - * Validates the downloaded model. + * Validates a downloaded model via the specified {@link Path downloadedModel path}. * * @param sha512 the url to get the sha512 hash * @param downloadedModel the model file to check @@ -187,8 +220,8 @@ public class DownloadUtil { // Validate SHA512 checksum final String actualChecksum = calculateSHA512(downloadedModel); if (!actualChecksum.equalsIgnoreCase(expectedChecksum)) { - throw new IOException("SHA512 checksum validation failed. Expected: " - + expectedChecksum + ", but got: " + actualChecksum); + throw new IOException("SHA512 checksum validation failed for " + downloadedModel.getFileName() + + ". Expected: " + expectedChecksum + ", but got: " + actualChecksum); } } @@ -198,6 +231,7 @@ public class DownloadUtil { try (InputStream fis = Files.newInputStream(file); DigestInputStream dis = new DigestInputStream(fis, digest)) { byte[] buffer = new byte[4096]; + //noinspection StatementWithEmptyBody while (dis.read(buffer) != -1) { // Reading the file to update the digest } diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java deleted file mode 100644 index 9fdde8b6..00000000 --- a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.tools.util; - -import java.io.IOException; -import java.net.InetSocketAddress; -import java.net.Socket; -import java.nio.file.DirectoryStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; - -import org.junit.jupiter.api.BeforeAll; - -import opennlp.tools.EnabledWhenCDNAvailable; - -import static org.junit.jupiter.api.Assertions.fail; - -@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org") -public abstract class AbstractDownloadUtilTest { - - private static final String APACHE_CDN = "dlcdn.apache.org"; - - @BeforeAll - public static void cleanupWhenOnline() { - boolean isOnline; - try (Socket socket = new Socket()) { - socket.connect(new InetSocketAddress(APACHE_CDN, 80), EnabledWhenCDNAvailable.TIMEOUT_MS); - isOnline = true; - } catch (IOException e) { - // Unreachable, unresolvable or timeout - isOnline = false; - } - // If CDN is available -> go cleanup in preparation of the actual tests - if (isOnline) { - wipeExistingModelFiles("-tokens-"); - wipeExistingModelFiles("-sentence-"); - wipeExistingModelFiles("-pos-"); - wipeExistingModelFiles("-lemmas-"); - } - } - - - /* - * Helper method that wipes out mode files if they exist on the text execution env. - * Those model files are wiped from a hidden '.opennlp' subdirectory. - * - * Thereby, a clean download can be guaranteed - ín CDN is available and test are executed. - */ - private static void wipeExistingModelFiles(final String fragment) { - final Path dir = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME", - System.getProperty("user.home"))).resolve(".opennlp"); - if (Files.exists(dir)) { - try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*opennlp-*" + fragment + "*")) { - for (Path modelFileToWipe : stream) { - Files.deleteIfExists(modelFileToWipe); - } - } catch (IOException e) { - fail(e.getLocalizedMessage()); - } - } - } - -} diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java index 5f328b00..24bad516 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java @@ -35,7 +35,7 @@ import opennlp.tools.sentdetect.SentenceModel; import static org.junit.jupiter.api.Assertions.assertEquals; @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org") -public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest { +public class DownloadUtilDownloadTwiceTest { /* * Programmatic change to debug log to ensure that we can see log messages to @@ -60,24 +60,24 @@ public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest { @Test public void testDownloadModelTwice() throws IOException { + String lang = "de"; + DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR; + try (LogCaptor logCaptor = LogCaptor.forClass(DownloadUtil.class)) { - - DownloadUtil.downloadModel("de", - DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); - - assertEquals(2, logCaptor.getDebugLogs().size()); - checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download complete."); + boolean alreadyDownloaded = DownloadUtil.existsModel(lang, type); + DownloadUtil.downloadModel(lang, type, SentenceModel.class); + + if (! alreadyDownloaded) { + assertEquals(2, logCaptor.getDebugLogs().size()); + checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download complete."); + } else { + assertEquals(1, logCaptor.getDebugLogs().size()); + checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download."); + } logCaptor.clearLogs(); // try to download again - DownloadUtil.downloadModel("de", - DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); - assertEquals(1, logCaptor.getDebugLogs().size()); - checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download."); - logCaptor.clearLogs(); - - DownloadUtil.downloadModel("de", - DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); + DownloadUtil.downloadModel(lang, type, SentenceModel.class); assertEquals(1, logCaptor.getDebugLogs().size()); checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download."); logCaptor.clearLogs(); diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java index b8a61f91..8c3a67c1 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.URL; import java.util.stream.Stream; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -32,11 +33,12 @@ import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.TokenizerModel; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class DownloadUtilTest extends AbstractDownloadUtilTest { +public class DownloadUtilTest { @ParameterizedTest(name = "Verify \"{0}\" sentence model") @ValueSource(strings = {"en", "fr", "de", "it", "nl", "bg", "ca", "cs", "da", "el", @@ -62,6 +64,27 @@ public class DownloadUtilTest extends AbstractDownloadUtilTest { assertTrue(model.isLoadedFromSerialized()); } + @Test + @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org") + public void testExistsModel() throws IOException { + final String lang = "en"; + final DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR; + // Prepare + SentenceModel model = DownloadUtil.downloadModel(lang, type, SentenceModel.class); + assertNotNull(model); + assertEquals(lang, model.getLanguage()); + // Test + assertTrue(DownloadUtil.existsModel(lang, type)); + } + + @ParameterizedTest + @NullAndEmptySource + @ValueSource(strings = {"xy", "\t", "\n"}) + @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org") + public void testExistsModelInvalid(String input) throws IOException { + assertFalse(DownloadUtil.existsModel(input, DownloadUtil.ModelType.SENTENCE_DETECTOR)); + } + @ParameterizedTest(name = "Detect invalid input: \"{0}\"") @NullAndEmptySource @ValueSource(strings = {" ", "\t", "\n"})
