This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new e91ceb17 OPENNLP-1661: Fix custom models being wiped from OpenNLP
user.home directory (#704)
e91ceb17 is described below
commit e91ceb17e36120b66dafdf15d2e11fc6a59c92c6
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Dec 3 07:35:19 2024 +0100
OPENNLP-1661: Fix custom models being wiped from OpenNLP user.home
directory (#704)
- deletes AbstractDownloadUtilTest.java removing historical code that wiped
models
- adds package-private DownloadUtil#existsModel(..) method to check models
for certain language exist locally
- adds to and adjusts related test classes
---
.../main/java/opennlp/tools/util/DownloadUtil.java | 60 +++++++++++++---
.../tools/util/AbstractDownloadUtilTest.java | 79 ----------------------
.../tools/util/DownloadUtilDownloadTwiceTest.java | 30 ++++----
.../java/opennlp/tools/util/DownloadUtilTest.java | 30 ++++++--
4 files changed, 90 insertions(+), 109 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
index 11b328da..cd80792f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -77,9 +77,45 @@ public class DownloadUtil {
System.getProperty("OPENNLP_DOWNLOAD_BASE_URL",
"https://dlcdn.apache.org/opennlp/");
private static final String MODEL_URI_PATH =
System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH",
"models/ud-models-1.2/");
+ private static final String OPENNLP_DOWNLOAD_HOME = "OPENNLP_DOWNLOAD_HOME";
private static Map<String, Map<ModelType, String>> availableModels;
+ /**
+ * Checks if a model of the specified {@code modelType} has been downloaded
already
+ * for a particular {@code language}.
+ *
+ * @param language The ISO language code of the requested model.
+ * @param modelType The {@link DownloadUtil.ModelType type} of model.
+ * @return {@code true} if a model exists locally, {@code false} otherwise.
+ * @throws IOException Thrown if IO errors occurred or the computed hash sum
+ * of an associated, local model file was incorrect.
+ */
+ static boolean existsModel(String language, ModelType modelType) throws
IOException {
+ Map<ModelType, String> modelsByLanguage =
getAvailableModels().get(language);
+ if (modelsByLanguage == null) {
+ return false;
+ } else {
+ final String url = modelsByLanguage.get(modelType);
+ if (url != null) {
+ final Path homeDirectory = getDownloadHome();
+ final String filename = url.substring(url.lastIndexOf("/") + 1);
+ final Path localFile = homeDirectory.resolve(filename);
+ boolean exists;
+ if (Files.exists(localFile)) {
+ // if this does not throw the requested model is valid!
+ validateModel(new URL(url + ".sha512"), localFile);
+ exists = true;
+ } else {
+ exists = false;
+ }
+ return exists;
+ } else {
+ return false;
+ }
+ }
+ }
+
/**
* Triggers a download for the specified {@link DownloadUtil.ModelType}.
*
@@ -94,7 +130,7 @@ public class DownloadUtil {
Class<T> type) throws
IOException {
if (getAvailableModels().containsKey(language)) {
- final String url = (getAvailableModels().get(language).get(modelType));
+ final String url = getAvailableModels().get(language).get(modelType);
if (url != null) {
return downloadModel(new URL(url), type);
}
@@ -119,8 +155,7 @@ public class DownloadUtil {
*/
public static <T extends BaseModel> T downloadModel(URL url, Class<T> type)
throws IOException {
- final Path homeDirectory =
Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
- System.getProperty("user.home"))).resolve(".opennlp");
+ final Path homeDirectory = getDownloadHome();
if (!Files.isDirectory(homeDirectory)) {
try {
@@ -131,20 +166,17 @@ public class DownloadUtil {
}
final String filename =
url.toString().substring(url.toString().lastIndexOf("/") + 1);
- final Path localFile = Paths.get(homeDirectory.toString(), filename);
+ final Path localFile = homeDirectory.resolve(filename);
if (!Files.exists(localFile)) {
- logger.debug("Downloading model from {} to {}.", url, localFile);
+ logger.debug("Downloading model to {}.", localFile);
try (final InputStream in = url.openStream()) {
Files.copy(in, localFile, StandardCopyOption.REPLACE_EXISTING);
}
-
validateModel(new URL(url + ".sha512"), localFile);
-
logger.debug("Download complete.");
} else {
- System.out.println("Model file already exists. Skipping download.");
logger.debug("Model file '{}' already exists. Skipping download.",
filename);
}
@@ -167,7 +199,7 @@ public class DownloadUtil {
}
/**
- * Validates the downloaded model.
+ * Validates a downloaded model via the specified {@link Path
downloadedModel path}.
*
* @param sha512 the url to get the sha512 hash
* @param downloadedModel the model file to check
@@ -187,8 +219,8 @@ public class DownloadUtil {
// Validate SHA512 checksum
final String actualChecksum = calculateSHA512(downloadedModel);
if (!actualChecksum.equalsIgnoreCase(expectedChecksum)) {
- throw new IOException("SHA512 checksum validation failed. Expected: "
- + expectedChecksum + ", but got: " + actualChecksum);
+ throw new IOException("SHA512 checksum validation failed for " +
downloadedModel.getFileName() +
+ ". Expected: " + expectedChecksum + ", but got: " + actualChecksum);
}
}
@@ -198,6 +230,7 @@ public class DownloadUtil {
try (InputStream fis = Files.newInputStream(file);
DigestInputStream dis = new DigestInputStream(fis, digest)) {
byte[] buffer = new byte[4096];
+ //noinspection StatementWithEmptyBody
while (dis.read(buffer) != -1) {
// Reading the file to update the digest
}
@@ -217,6 +250,11 @@ public class DownloadUtil {
}
}
+ private static Path getDownloadHome() {
+ return Paths.get(System.getProperty(OPENNLP_DOWNLOAD_HOME,
+ System.getProperty("user.home"))).resolve(".opennlp");
+ }
+
@Internal
static class DownloadParser {
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java
b/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java
deleted file mode 100644
index 9fdde8b6..00000000
---
a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.util;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.nio.file.DirectoryStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-import org.junit.jupiter.api.BeforeAll;
-
-import opennlp.tools.EnabledWhenCDNAvailable;
-
-import static org.junit.jupiter.api.Assertions.fail;
-
-@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
-public abstract class AbstractDownloadUtilTest {
-
- private static final String APACHE_CDN = "dlcdn.apache.org";
-
- @BeforeAll
- public static void cleanupWhenOnline() {
- boolean isOnline;
- try (Socket socket = new Socket()) {
- socket.connect(new InetSocketAddress(APACHE_CDN, 80),
EnabledWhenCDNAvailable.TIMEOUT_MS);
- isOnline = true;
- } catch (IOException e) {
- // Unreachable, unresolvable or timeout
- isOnline = false;
- }
- // If CDN is available -> go cleanup in preparation of the actual tests
- if (isOnline) {
- wipeExistingModelFiles("-tokens-");
- wipeExistingModelFiles("-sentence-");
- wipeExistingModelFiles("-pos-");
- wipeExistingModelFiles("-lemmas-");
- }
- }
-
-
- /*
- * Helper method that wipes out mode files if they exist on the text
execution env.
- * Those model files are wiped from a hidden '.opennlp' subdirectory.
- *
- * Thereby, a clean download can be guaranteed - ín CDN is available and
test are executed.
- */
- private static void wipeExistingModelFiles(final String fragment) {
- final Path dir = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
- System.getProperty("user.home"))).resolve(".opennlp");
- if (Files.exists(dir)) {
- try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir,
"*opennlp-*" + fragment + "*")) {
- for (Path modelFileToWipe : stream) {
- Files.deleteIfExists(modelFileToWipe);
- }
- } catch (IOException e) {
- fail(e.getLocalizedMessage());
- }
- }
- }
-
-}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
index 5f328b00..24bad516 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
@@ -35,7 +35,7 @@ import opennlp.tools.sentdetect.SentenceModel;
import static org.junit.jupiter.api.Assertions.assertEquals;
@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
-public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest {
+public class DownloadUtilDownloadTwiceTest {
/*
* Programmatic change to debug log to ensure that we can see log messages to
@@ -60,24 +60,24 @@ public class DownloadUtilDownloadTwiceTest extends
AbstractDownloadUtilTest {
@Test
public void testDownloadModelTwice() throws IOException {
+ String lang = "de";
+ DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR;
+
try (LogCaptor logCaptor = LogCaptor.forClass(DownloadUtil.class)) {
-
- DownloadUtil.downloadModel("de",
- DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
-
- assertEquals(2, logCaptor.getDebugLogs().size());
- checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download
complete.");
+ boolean alreadyDownloaded = DownloadUtil.existsModel(lang, type);
+ DownloadUtil.downloadModel(lang, type, SentenceModel.class);
+
+ if (! alreadyDownloaded) {
+ assertEquals(2, logCaptor.getDebugLogs().size());
+ checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(),
"Download complete.");
+ } else {
+ assertEquals(1, logCaptor.getDebugLogs().size());
+ checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(),
"already exists. Skipping download.");
+ }
logCaptor.clearLogs();
// try to download again
- DownloadUtil.downloadModel("de",
- DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
- assertEquals(1, logCaptor.getDebugLogs().size());
- checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already
exists. Skipping download.");
- logCaptor.clearLogs();
-
- DownloadUtil.downloadModel("de",
- DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+ DownloadUtil.downloadModel(lang, type, SentenceModel.class);
assertEquals(1, logCaptor.getDebugLogs().size());
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already
exists. Skipping download.");
logCaptor.clearLogs();
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
index b8a61f91..c5355601 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.net.URL;
import java.util.stream.Stream;
+import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
@@ -32,11 +33,12 @@ import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
-public class DownloadUtilTest extends AbstractDownloadUtilTest {
+public class DownloadUtilTest {
@ParameterizedTest(name = "Verify \"{0}\" sentence model")
@ValueSource(strings = {"en", "fr", "de", "it", "nl", "bg", "ca", "cs",
"da", "el",
@@ -62,13 +64,33 @@ public class DownloadUtilTest extends
AbstractDownloadUtilTest {
assertTrue(model.isLoadedFromSerialized());
}
+ @Test
+ @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+ public void testExistsModel() throws IOException {
+ final String lang = "en";
+ final DownloadUtil.ModelType type =
DownloadUtil.ModelType.SENTENCE_DETECTOR;
+ // Prepare
+ SentenceModel model = DownloadUtil.downloadModel(lang, type,
SentenceModel.class);
+ assertNotNull(model);
+ assertEquals(lang, model.getLanguage());
+ // Test
+ assertTrue(DownloadUtil.existsModel(lang, type));
+ }
+
+ @ParameterizedTest
+ @NullAndEmptySource
+ @ValueSource(strings = {"xy", "\t", "\n"})
+ @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+ public void testExistsModelInvalid(String input) throws IOException {
+ assertFalse(DownloadUtil.existsModel(input,
DownloadUtil.ModelType.SENTENCE_DETECTOR));
+ }
+
@ParameterizedTest(name = "Detect invalid input: \"{0}\"")
@NullAndEmptySource
@ValueSource(strings = {" ", "\t", "\n"})
public void testDownloadModelInvalid(String input) {
- assertThrows(IOException.class, () -> DownloadUtil.downloadModel(
- input, DownloadUtil.ModelType.SENTENCE_DETECTOR,
SentenceModel.class),
- "Invalid model");
+ assertThrows(IOException.class, () -> DownloadUtil.downloadModel(input,
+ DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class),
"Invalid model");
}
private static final DownloadUtil.ModelType MT_TOKENIZER =
DownloadUtil.ModelType.TOKENIZER;