This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5f2e30215 TIKA-4499 - remove tika-dl module (#2347)
5f2e30215 is described below
commit 5f2e30215e6f60174ca7fcd814a55adc51e2452a
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 3 12:39:55 2025 -0400
TIKA-4499 - remove tika-dl module (#2347)
---
CHANGES.txt | 2 +
tika-bom/pom.xml | 5 -
tika-parsers/tika-parsers-ml/pom.xml | 1 -
tika-parsers/tika-parsers-ml/tika-dl/pom.xml | 195 ------------
.../tika/dl/imagerec/DL4JInceptionV3Net.java | 340 ---------------------
.../org/apache/tika/dl/imagerec/DL4JVGG16Net.java | 158 ----------
.../tika/dl/imagerec/DL4JInceptionV3NetTest.java | 63 ----
.../apache/tika/dl/imagerec/DL4JVGG16NetTest.java | 64 ----
.../resources/org/apache/tika/dl/imagerec/cat.jpg | Bin 7686 -> 0 bytes
.../tika/dl/imagerec/dl4j-inception3-config.xml | 35 ---
.../apache/tika/dl/imagerec/dl4j-vgg16-config.xml | 32 --
.../resources/org/apache/tika/dl/imagerec/lion.jpg | Bin 44441 -> 0 bytes
12 files changed, 2 insertions(+), 893 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 9e19666ad..bf320ab3a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -13,6 +13,8 @@ Release 4.0.0-BETA1 - ???
* Removed the dotnet module (TIKA-4332).
+ * Remove the tika-dl module (TIKA-4499).
+
OTHER CHANGES
* Fix concurrency bug in TikaToXMP (TIKA-4393)
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index ff9380841..d57fad573 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -275,11 +275,6 @@
<artifactId>tika-age-recogniser</artifactId>
<version>4.0.0-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-dl</artifactId>
- <version>4.0.0-SNAPSHOT</version>
- </dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-advancedmedia-module</artifactId>
diff --git a/tika-parsers/tika-parsers-ml/pom.xml b/tika-parsers/tika-parsers-ml/pom.xml
index dbdb41248..0f9a5f048 100644
--- a/tika-parsers/tika-parsers-ml/pom.xml
+++ b/tika-parsers/tika-parsers-ml/pom.xml
@@ -102,7 +102,6 @@
<module>tika-age-recogniser</module>
<module>tika-parser-advancedmedia-module</module>
<module>tika-parser-advancedmedia-package</module>
- <module>tika-dl</module>
</modules>
</profile>
</profiles>
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/pom.xml b/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
deleted file mode 100644
index ca2b189ea..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
+++ /dev/null
@@ -1,195 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers-ml</artifactId>
- <version>4.0.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-dl</artifactId>
- <packaging>jar</packaging>
-
- <name>Apache Tika Deep Learning (powered by DL4J)</name>
- <url>http://maven.apache.org</url>
-
-
- <dependencyManagement>
- <dependencies>
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>openblas-platform</artifactId>
- <version>0.3.30-1.5.12</version>
- </dependency>
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>openblas</artifactId>
- <version>0.3.30-1.5.12</version>
- </dependency>
- <!-- can't update above 4.5.5-1.5.7, UnsatisfiedLinkError: no
jniopencv_core in java.library.path -->
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>opencv</artifactId>
- <version>4.11.0-1.5.12</version>
- </dependency>
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>fastutil</artifactId>
- <version>${fastutil.version}</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-advancedmedia-module</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.datavec</groupId>
- <artifactId>datavec-data-image</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- <exclusions>
- <exclusion>
- <groupId>org.bytedeco</groupId>
- <artifactId>ffmpeg</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <!-- TODO restore after ffmpeg has been updated -->
- <!--
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>ffmpeg</artifactId>
- <version>7.1-1.5.11</version>
- <scope>provided</scope>
- </dependency>
- -->
- <dependency>
- <groupId>org.deeplearning4j</groupId>
- <artifactId>deeplearning4j-zoo</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.deeplearning4j</groupId>
- <artifactId>deeplearning4j-modelimport</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.deeplearning4j</groupId>
- <artifactId>deeplearning4j-nn</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.nd4j</groupId>
- <artifactId>nd4j-api</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.nd4j</groupId>
- <artifactId>nd4j-native-platform</artifactId>
- <version>${dl4j.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j2-impl</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>commons-net</groupId>
- <artifactId>commons-net</artifactId>
- <version>${commons.net.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>javacpp</artifactId>
- <version>${javacpp.version}</version>
- </dependency>
- <dependency>
- <groupId>org.bytedeco</groupId>
- <artifactId>javacpp-platform</artifactId>
- <version>${javacpp.version}</version>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <!--
- To reduce the size of jar by excluding unnecessary native libs,
- set `-Djavacpp.platform=<target>`
- possible targets = {android-arm, linux-x86_64, macosx-x86_64,
windows-x86_64}
- More details here
https://github.com/bytedeco/javacpp-presets/blob/master/README.md#downloads
-
- By default, native libs for all major native platforms are included.
- -->
- <artifactId>maven-assembly-plugin</artifactId>
- <configuration>
- <descriptorRefs>
- <descriptorRef>jar-with-dependencies</descriptorRef>
- </descriptorRefs>
- </configuration>
- <executions>
- <execution>
- <id>make-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestEntries>
- <Automatic-Module-Name>org.apache.tika.dl</Automatic-Module-Name>
- </manifestEntries>
- </archive>
- </configuration>
- </plugin>
-
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <version>${rat.version}</version>
- <configuration>
- </configuration>
- </plugin>
-
- </plugins>
- </build>
-
- <scm>
- <tag>3.0.0-rc1</tag>
- </scm>
-</project>
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java b/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java
deleted file mode 100644
index 69b6b34dd..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.dl.imagerec;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
-import org.datavec.image.loader.NativeImageLoader;
-import org.deeplearning4j.nn.graph.ComputationGraph;
-import org.deeplearning4j.nn.modelimport.keras.KerasModel;
-import
org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException;
-import
org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException;
-import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelBuilder;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
-import org.nd4j.linalg.api.ndarray.INDArray;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.config.Field;
-import org.apache.tika.config.InitializableProblemHandler;
-import org.apache.tika.config.Param;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.recognition.ObjectRecogniser;
-import org.apache.tika.parser.recognition.RecognisedObject;
-
-/**
- * {@link DL4JInceptionV3Net} is an implementation of {@link ObjectRecogniser}.
- * This object recogniser is powered by <a
href="http://deeplearning4j.org">Deeplearning4j</a>.
- * This implementation is pre configured to use <a
href="https://arxiv.org/abs/1512.00567">
- * Google's InceptionV3 model </a> pre trained on
- * ImageNet corpus. The models references in default settings are originally
trained and exported
- * from <a href="http://keras.io">Keras </a> and imported using DL4J's
importer tools.
- * <p>
- * Although this implementation is made to work out of the box without user
attention,
- * for advances users who are interested in tweaking the settings, the
following fields are
- * configurable:
- * <ul>
- * <li>{@link #modelWeightsPath}</li>
- * <li>{@link #labelFile}</li>
- * <li>{@link #labelLang}</li>
- * <li>{@link #cacheDir}</li>
- * <li>{@link #imgWidth}</li>
- * <li>{@link #imgHeight}</li>
- * <li>{@link #imgChannels}</li>
- * <li>{@link #minConfidence}</li>
- * </ul>
- * </p>
- *
- * @see ObjectRecogniser
- * @see org.apache.tika.parser.recognition.ObjectRecognitionParser
- * @see org.apache.tika.parser.recognition.tf.TensorflowImageRecParser
- * @see org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser
- * @since Tika 1.15
- */
-public class DL4JInceptionV3Net implements ObjectRecogniser {
-
- private static final Set<MediaType> MEDIA_TYPES =
- Collections.singleton(MediaType.image("jpeg"));
- private static final Logger LOG =
LoggerFactory.getLogger(DL4JInceptionV3Net.class);
- private static final String DEF_WEIGHTS_URL =
-
"https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/inception_v3_keras_2.h5";
- private static final String DEF_LABEL_MAPPING_URL =
-
"https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/imagenet_class_index.json";
- private static final String BASE_DIR =
- System.getProperty("user.home") + File.separator + ".tika-dl" +
File.separator +
- "models" + File.separator + "keras";
- private static final String MODEL_DIR = BASE_DIR + File.separator +
"inception-v3";
-
- /**
- * Cache dir to be used for downloading the weights file.
- * This is used to download the model.
- */
- @Field
- private File cacheDir = new File(MODEL_DIR);
-
- /**
- * Path to a HDF5 file that contains weights of the Keras network
- * that was obtained by training the network on a labelled dataset.
- * <br/>
- * Note: when the value is set to <download>, the default model will
be
- * downloaded from {@value #DEF_WEIGHTS_URL}
- */
- @Field
- private String modelWeightsPath = DEF_WEIGHTS_URL;
-
- /***
- * Path to file that tells how to map node index to human readable label
names
- * <br/>
- * The default is retrieved from {@value DEF_LABEL_MAPPING_URL}
- */
- @Field
- private String labelFile = DEF_LABEL_MAPPING_URL;
-
- /**
- * Language name of the labels.
- * <br/>
- * Default is 'en'
- */
- @Field
- private String labelLang = "en";
-
- @Field
- private int imgHeight = 299;
-
- @Field
- private int imgWidth = 299;
-
- @Field
- private int imgChannels = 3;
- /***
- * Ignores the labels that are below this confidence score
- */
- @Field
- private double minConfidence = 0.005;
-
- private ComputationGraph graph;
- private NativeImageLoader imageLoader;
- private Map<Integer, String> labelMap;
-
- private static synchronized File cachedDownload(File cacheDir, URI uri)
throws IOException {
-
- if ("file".equals(uri.getScheme()) || uri.getScheme() == null) {
- return new File(uri);
- }
- if (!cacheDir.exists()) {
- cacheDir.mkdirs();
- }
- String[] parts = uri.toASCIIString().split("/");
- File cacheFile = new File(cacheDir, parts[parts.length - 1]);
- File successFlag = new File(cacheFile.getAbsolutePath() + ".success");
-
- if (cacheFile.exists() && successFlag.exists()) {
- LOG.info("Cache exist at {}. Not downloading it",
cacheFile.getAbsolutePath());
- } else {
- if (successFlag.exists()) {
- successFlag.delete();
- }
- LOG.info("Cache doesn't exist. Going to make a copy");
- LOG.info("This might take a while! GET {}", uri);
- FileUtils.copyURLToFile(uri.toURL(), cacheFile, 5000, 60000);
- //restore the success flag again
- FileUtils.write(successFlag, "CopiedAt:" +
System.currentTimeMillis(),
- StandardCharsets.UTF_8);
- }
- return cacheFile;
- }
-
- @Override
- public Set<MediaType> getSupportedMimes() {
- return MEDIA_TYPES;
- }
-
- /***
- *
- * @param path path to resolve the file
- * @return File or null
- */
- private File retrieveFile(String path) {
- File file = new File(path);
- if (!file.exists()) {
- LOG.warn("File {} not found in local file system. Asking the
classloader", path);
- URL url = getClass().getClassLoader().getResource(path);
- if (url == null) {
- LOG.debug("Classloader does not know the file {}", path);
- file = null;
- } else {
- LOG.debug("Classloader knows the file {}", path);
- try {
- file = cachedDownload(cacheDir, url.toURI());
- } catch (URISyntaxException | IOException e) {
- LOG.warn(e.getMessage(), e);
- }
- }
- }
- return file;
- }
-
- private InputStream retrieveResource(String path) throws
FileNotFoundException {
- File file = new File(path);
- if (file.exists()) {
- return new FileInputStream(file);
- }
- LOG.warn("File {} not found in local file system. Asking the
classloader", path);
- return getClass().getClassLoader().getResourceAsStream(path);
- }
-
- private String mayBeDownloadFile(String path) throws TikaConfigException {
- String resolvedFilePath;
- if (path.startsWith("http://") || path.startsWith("https://")) {
- LOG.debug("Config instructed to download the file, doing so.");
- try {
- resolvedFilePath = cachedDownload(cacheDir,
URI.create(path)).getAbsolutePath();
- } catch (IOException e) {
- throw new TikaConfigException(e.getMessage(), e);
- }
- } else {
- File file = retrieveFile(path);
- if (!file.exists()) {
- LOG.error("File does not exist at :: {}", path);
- }
- resolvedFilePath = file.getAbsolutePath();
- }
- return resolvedFilePath;
- }
-
- @Override
- public void initialize(Map<String, Param> params) throws
TikaConfigException {
-
- //STEP 1: resolve weights file, download if necessary
- modelWeightsPath = mayBeDownloadFile(modelWeightsPath);
-
- //STEP 2: Load labels map
- try (InputStream stream =
retrieveResource(mayBeDownloadFile(labelFile))) {
- this.labelMap = loadClassIndex(stream);
- } catch (IOException | ParseException e) {
- LOG.error("Could not load labels map", e);
- return;
- }
-
- //STEP 3: initialize the graph
- try {
- this.imageLoader = new NativeImageLoader(imgHeight, imgWidth,
imgChannels);
- LOG.info("Going to load Inception network...");
- long st = System.currentTimeMillis();
-
- try (KerasModelBuilder builder =
- new
KerasModel().modelBuilder().modelHdf5Filename(modelWeightsPath)
- .enforceTrainingConfig(false)) {
-
- builder.inputShape(new int[]{imgHeight, imgWidth, 3});
- KerasModel model = builder.buildModel();
- this.graph = model.getComputationGraph();
- long time = System.currentTimeMillis() - st;
- LOG.info("Loaded the Inception model. Time taken={}ms", time);
- }
- } catch (IOException | InvalidKerasConfigurationException |
- UnsupportedKerasConfigurationException e) {
- throw new TikaConfigException(e.getMessage(), e);
- }
- }
-
- @Override
- public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
- //TODO: what do we want to check here?
- }
-
- @Override
- public boolean isAvailable() {
- return graph != null;
- }
-
- /**
- * Pre process image to reduce to make it feedable to inception network
- *
- * @param input Input image
- * @return processed image
- */
- public INDArray preProcessImage(INDArray input) {
- // Transform to [-1.0, 1.0] range
- return input.div(255.0).sub(0.5).mul(2.0);
- }
-
- /**
- * Loads the class to
- *
- * @param stream label index stream
- * @return Map of integer -> label name
- * @throws IOException when the stream breaks unexpectedly
- * @throws ParseException when the input doesn't contain a valid JSON map
- */
- public Map<Integer, String> loadClassIndex(InputStream stream)
- throws IOException, ParseException {
- String content = IOUtils.toString(stream, StandardCharsets.UTF_8);
- JSONObject jIndex = (JSONObject) new JSONParser().parse(content);
- Map<Integer, String> classMap = new HashMap<>();
- for (Object key : jIndex.keySet()) {
- JSONArray names = (JSONArray) jIndex.get(key);
- classMap.put(Integer.parseInt(key.toString()),
names.get(names.size() - 1).toString());
- }
- return classMap;
- }
-
- @Override
- public List<RecognisedObject> recognise(InputStream stream, ContentHandler
handler,
- Metadata metadata, ParseContext
context)
- throws IOException, SAXException, TikaException {
-
- INDArray image = preProcessImage(imageLoader.asImageMatrix(stream,
false).getImage());
- INDArray scores = graph.outputSingle(image);
- List<RecognisedObject> result = new ArrayList<>();
- for (int i = 0; i < scores.length(); i++) {
- if (scores.getDouble(i) > minConfidence) {
- String label = labelMap.get(i);
- String id = i + "";
- result.add(new RecognisedObject(label, labelLang, id,
scores.getDouble(i)));
- LOG.debug("Found Object {}", label);
- }
- }
- return result;
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JVGG16Net.java b/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JVGG16Net.java
deleted file mode 100644
index a6f6ca81a..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JVGG16Net.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.dl.imagerec;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.datavec.image.loader.NativeImageLoader;
-import org.deeplearning4j.nn.graph.ComputationGraph;
-import org.deeplearning4j.util.ModelSerializer;
-import org.deeplearning4j.zoo.PretrainedType;
-import org.deeplearning4j.zoo.ZooModel;
-import org.deeplearning4j.zoo.model.VGG16;
-import org.deeplearning4j.zoo.util.imagenet.ImageNetLabels;
-import org.nd4j.linalg.api.ndarray.INDArray;
-import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
-import org.nd4j.linalg.dataset.api.preprocessor.VGG16ImagePreProcessor;
-import org.nd4j.linalg.factory.Nd4j;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.config.Field;
-import org.apache.tika.config.InitializableProblemHandler;
-import org.apache.tika.config.Param;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.recognition.ObjectRecogniser;
-import org.apache.tika.parser.recognition.RecognisedObject;
-
-public class DL4JVGG16Net implements ObjectRecogniser {
-
- public static final Set<MediaType> SUPPORTED_MIMES =
- Collections.singleton(MediaType.image("jpeg"));
- private static final Logger LOG =
LoggerFactory.getLogger(DL4JVGG16Net.class);
- private static final String BASE_DIR =
- System.getProperty("user.home") + File.separator + ".tika-dl" +
File.separator +
- "models" + File.separator + "dl4j";
- private static final String MODEL_DIR = BASE_DIR + File.separator +
"vgg-16";
-
- @Field
- private File cacheDir = new File(MODEL_DIR + File.separator + "vgg16.zip");
-
- @Field
- private boolean serialize = true;
-
- @Field
- private int topN;
-
- private NativeImageLoader imageLoader = new NativeImageLoader(224, 224, 3);
- private DataNormalization preProcessor = new VGG16ImagePreProcessor();
- private boolean available = false;
- private ComputationGraph model;
- private ImageNetLabels imageNetLabels;
-
- public Set<MediaType> getSupportedMimes() {
- return SUPPORTED_MIMES;
- }
-
- @Override
- public boolean isAvailable() {
- return available;
- }
-
- @Override
- public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
- //TODO: what do we want to check here?
- }
-
- @Override
- public void initialize(Map<String, Param> params) throws
TikaConfigException {
- try {
- if (serialize) {
- if (cacheDir.exists()) {
- model = ModelSerializer.restoreComputationGraph(cacheDir);
- LOG.info("Preprocessed Model Loaded from {}", cacheDir);
- } else {
- LOG.warn("Preprocessed Model doesn't exist at {}",
cacheDir);
- cacheDir.getParentFile().mkdirs();
- ZooModel zooModel = VGG16.builder().build();
- model = (ComputationGraph)
zooModel.initPretrained(PretrainedType.IMAGENET);
- LOG.info(
- "Saving the Loaded model for future use. Saved
models" +
- " are more optimised to consume less
resources.");
- ModelSerializer.writeModel(model, cacheDir, true);
- }
- } else {
- LOG.info("Weight graph model loaded via dl4j Helper
functions");
- ZooModel zooModel = VGG16.builder().build();
- model = (ComputationGraph)
zooModel.initPretrained(PretrainedType.IMAGENET);
- }
- imageNetLabels = new ImageNetLabels();
- available = true;
- } catch (Exception e) {
- available = false;
- LOG.warn(e.getMessage(), e);
- throw new TikaConfigException(e.getMessage(), e);
- }
- }
-
- @Override
- public List<RecognisedObject> recognise(InputStream stream, ContentHandler
handler,
- Metadata metadata, ParseContext
context)
- throws IOException, SAXException, TikaException {
- INDArray image = imageLoader.asMatrix(stream);
- preProcessor.transform(image);
- INDArray[] output = model.output(false, image);
- return predict(output[0]);
- }
-
- private List<RecognisedObject> predict(INDArray predictions) {
- List<RecognisedObject> objects = new ArrayList<>();
- int[] topNPredictions = new int[topN];
- float[] topNProb = new float[topN];
- String[] outLabels = new String[topN];
- //brute force collect top N
- int i = 0;
- for (int batch = 0; batch < predictions.size(0); batch++) {
- INDArray currentBatch = predictions.getRow(batch).dup();
- while (i < topN) {
- topNPredictions[i] = Nd4j.argMax(currentBatch, 1).getInt(0);
- topNProb[i] = currentBatch.getFloat(batch, topNPredictions[i]);
- currentBatch.putScalar(0, topNPredictions[i], 0);
- outLabels[i] = imageNetLabels.getLabel(topNPredictions[i]);
- objects.add(new RecognisedObject(outLabels[i], "eng",
- outLabels[i], topNProb[i]));
- i++;
- }
- }
- return objects;
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JInceptionV3NetTest.java b/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JInceptionV3NetTest.java
deleted file mode 100644
index 18508bd4e..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JInceptionV3NetTest.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.dl.imagerec;
-
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assumptions.assumeFalse;
-import static org.junit.jupiter.api.Assumptions.assumeTrue;
-
-import java.io.InputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-
-public class DL4JInceptionV3NetTest {
-
- @Test
- public void recognise() throws Exception {
- assumeFalse(SystemUtils.OS_ARCH.equals("aarch64"), "doesn't yet work
on aarch64");
- TikaConfig config = null;
- try (InputStream is =
getClass().getResourceAsStream("dl4j-inception3-config.xml")) {
- config = new TikaConfig(is);
- } catch (Exception e) {
- if (e.getMessage() != null && (e.getMessage().contains("Connection
refused") ||
- e.getMessage().contains("connect timed out") ||
e.getMessage().contains("403 for URL"))) {
- assumeTrue(false, "skipping test because of connection issue");
- }
- throw e;
- }
- assumeTrue(config != null, "something went wrong loading tika config");
- Tika tika = new Tika(config);
- Metadata md = new Metadata();
- try (InputStream is = getClass().getResourceAsStream("cat.jpg")) {
- tika.parse(is, md);
- }
- String[] objects = md.getValues("OBJECT");
- boolean found = false;
- for (String object : objects) {
- if (object.contains("_cat")) {
- found = true;
- break;
- }
- }
- assertTrue(found);
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JVGG16NetTest.java b/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JVGG16NetTest.java
deleted file mode 100644
index f3666630f..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/src/test/java/org/apache/tika/dl/imagerec/DL4JVGG16NetTest.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.dl.imagerec;
-
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assumptions.assumeFalse;
-import static org.junit.jupiter.api.Assumptions.assumeTrue;
-
-import java.io.InputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-
-public class DL4JVGG16NetTest {
-
- @Test
- public void recognise() throws Exception {
- assumeFalse(SystemUtils.OS_ARCH.equals("aarch64"), "doesn't yet work
on aarch64");
- TikaConfig config = null;
- try (InputStream is =
getClass().getResourceAsStream("dl4j-vgg16-config.xml")) {
- config = new TikaConfig(is);
- } catch (Exception e) {
- if (e.getMessage() != null && (e.getMessage().contains("Connection
refused") ||
- e.getMessage().contains("connect timed out") ||
e.getMessage().contains("403 for URL"))) {
- assumeTrue(false, "skipping test because of connection issue");
- }
- throw e;
- }
-
- assumeTrue(false, "something went wrong loading tika config");
- Tika tika = new Tika(config);
- Metadata md = new Metadata();
- try (InputStream is = getClass().getResourceAsStream("lion.jpg")) {
- tika.parse(is, md);
- }
- String[] objects = md.getValues("OBJECT");
- boolean found = false;
- for (String object : objects) {
- if (object.contains("lion")) {
- found = true;
- break;
- }
- }
- assertTrue(found);
- }
-}
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/cat.jpg b/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/cat.jpg
deleted file mode 100644
index 09d45d460..000000000
Binary files a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/cat.jpg and /dev/null differ
diff --git a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml b/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
deleted file mode 100644
index 27280638b..000000000
--- a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-inception3-config.xml
+++ /dev/null
@@ -1,35 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<properties>
-  <!-- Routes image/jpeg to ObjectRecognitionParser using the DL4JInceptionV3Net recogniser class. -->
-  <parsers>
-    <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
-      <mime>image/jpeg</mime>
-      <params>
-        <param name="modelWeightsPath" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/inception_v3_keras_2.h5</param>
-        <param name="labelFile" type="string">https://github.com/USCDataScience/tika-dockers/releases/download/v0.2/imagenet_class_index.json</param>
-        <param name="topN" type="int">10</param>
-        <param name="minConfidence" type="double">0.015</param>
-        <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JInceptionV3Net</param>
-      </params>
-    </parser>
-  </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
b/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
deleted file mode 100644
index 940a4b63f..000000000
---
a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/dl4j-vgg16-config.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one or more
- ~ contributor license agreements. See the NOTICE file distributed with
- ~ this work for additional information regarding copyright ownership.
- ~ The ASF licenses this file to You under the Apache License, Version 2.0
- ~ (the "License"); you may not use this file except in compliance with
- ~ the License. You may obtain a copy of the License at
- ~
- ~ http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing, software
- ~ distributed under the License is distributed on an "AS IS" BASIS,
- ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ~ See the License for the specific language governing permissions and
- ~ limitations under the License.
- -->
-<properties>
-  <!-- Routes image/jpeg to ObjectRecognitionParser using the DL4JVGG16Net recogniser class. -->
-  <parsers>
-    <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
-      <mime>image/jpeg</mime>
-      <params>
-        <param name="topN" type="int">3</param>
-        <param name="minConfidence" type="double">0.015</param>
-        <param name="class" type="string">org.apache.tika.dl.imagerec.DL4JVGG16Net</param>
-        <param name="modelType" type="string">VGG16</param>
-        <param name="serialize" type="bool">true</param>
-      </params>
-    </parser>
-  </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/lion.jpg
b/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/lion.jpg
deleted file mode 100644
index a25942645..000000000
Binary files
a/tika-parsers/tika-parsers-ml/tika-dl/src/test/resources/org/apache/tika/dl/imagerec/lion.jpg
and /dev/null differ