This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545-add-translators in repository https://gitbox.apache.org/repos/asf/tika.git
commit 361cb1ed0199dafc3b0846f99f42b8a88703b636 Author: tallison <[email protected]> AuthorDate: Wed Dec 3 13:30:03 2025 -0500 TIKA-4545 -- add translators and refactor loaders --- .../tika/language/translate/EmptyTranslator.java | 3 + .../java/org/apache/tika/plugins/TikaConfigs.java | 16 --- .../org/apache/tika/plugins/TikaPluginManager.java | 5 +- .../org/apache/tika/plugins/TikaConfigsTest.java | 4 +- .../tika/config/loader/ComponentInstantiator.java | 116 +++++++++++++++++++++ .../config/loader/CompositeComponentLoader.java | 55 +--------- .../apache/tika/config/loader/DetectorLoader.java | 60 +---------- .../tika/config/loader/EncodingDetectorLoader.java | 60 +---------- .../apache/tika/config/loader/ParserLoader.java | 30 +----- .../loader/PolymorphicObjectMapperFactory.java | 4 + .../org/apache/tika/config/loader/TikaLoader.java | 19 ++++ .../tika/config/loader/TranslatorLoader.java | 108 +++++++++++++++++++ .../apache/tika/config/loader/TikaLoaderTest.java | 43 ++++++++ .../tika/server/core/TranslateResourceTest.java | 1 - tika-translate/pom.xml | 10 ++ .../language/translate/impl/CachedTranslator.java | 2 + .../language/translate/impl/GoogleTranslator.java | 2 + .../translate/impl/JoshuaNetworkTranslator.java | 2 + .../language/translate/impl/Lingo24Translator.java | 2 + .../language/translate/impl/MarianTranslator.java | 2 + .../translate/impl/MicrosoftTranslator.java | 2 + .../language/translate/impl/MosesTranslator.java | 2 + .../language/translate/impl/RTGTranslator.java | 2 + .../language/translate/impl/YandexTranslator.java | 2 + .../org.apache.tika.language.translate.Translator | 22 ---- 25 files changed, 333 insertions(+), 241 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java index 9324af224..35c75b57e 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java @@ -16,11 +16,14 @@ */ package org.apache.tika.language.translate; +import org.apache.tika.config.TikaComponent; + /** * Dummy translator that always declines to give any text. Useful as a * sentinel translator for when none others are available. * for unknown document types. */ +@TikaComponent public class EmptyTranslator implements Translator { public String translate(String text, String sourceLanguage, String targetLanguage) { return null; diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java index acefb3640..d2a780b64 100644 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java +++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java @@ -21,9 +21,7 @@ import java.nio.file.Path; import java.util.Iterator; import java.util.Set; -import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; @@ -66,9 +64,6 @@ public class TikaConfigs { "server" ); - static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true); - private final TikaJsonConfig tikaJsonConfig; /** @@ -113,17 +108,6 @@ public class TikaConfigs { return tikaJsonConfig; } - /** - * Gets the root JSON node. - * Deprecated - use {@link #getTikaJsonConfig()} instead. - * - * @return the root JSON node - */ - @Deprecated - public JsonNode getRoot() { - return tikaJsonConfig.getRootNode(); - } - /** * Deserializes a configuration value for the given key. * diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java index df23e078c..59fda13be 100644 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java +++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java @@ -30,6 +30,7 @@ import org.pf4j.ExtensionFinder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; @@ -83,12 +84,12 @@ public class TikaPluginManager extends DefaultPluginManager { */ public static TikaPluginManager load(TikaConfigs tikaConfigs) throws TikaConfigException, IOException { - JsonNode root = tikaConfigs.getRoot(); + JsonNode root = tikaConfigs.getTikaJsonConfig().getRootNode(); JsonNode pluginRoots = root.get("plugin-roots"); if (pluginRoots == null) { throw new TikaConfigException("plugin-roots must be specified"); } - List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots, + List<Path> roots = PolymorphicObjectMapperFactory.getMapper().convertValue(pluginRoots, new TypeReference<List<Path>>() {}); if (roots.isEmpty()) { throw new TikaConfigException("plugin-roots must not be empty"); diff --git a/tika-plugins-core/src/test/java/org/apache/tika/plugins/TikaConfigsTest.java b/tika-plugins-core/src/test/java/org/apache/tika/plugins/TikaConfigsTest.java index 3ecafc018..207765d26 100644 --- a/tika-plugins-core/src/test/java/org/apache/tika/plugins/TikaConfigsTest.java +++ b/tika-plugins-core/src/test/java/org/apache/tika/plugins/TikaConfigsTest.java @@ -137,8 +137,8 @@ public class TikaConfigsTest { """; TikaConfigs configs = loadFromString(json); - assertNotNull(configs.getRoot()); - assertNotNull(configs.getRoot().get("fetchers")); + assertNotNull(configs.getTikaJsonConfig().getRootNode()); + assertNotNull(configs.getTikaJsonConfig().getRootNode().get("fetchers")); } private TikaConfigs loadFromString(String json) throws Exception { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java new file mode 100644 index 000000000..2f9a66e4c --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.tika.config.JsonConfig; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.utils.ServiceLoaderUtils; + +/** + * Utility class for instantiating Tika components from JSON configuration. + * Provides common logic for all component loaders to avoid code duplication. + */ +public class ComponentInstantiator { + + /** + * Instantiates a component with JsonConfig constructor or falls back to zero-arg constructor. + * <p> + * Instantiation strategy: + * <ol> + * <li>Try constructor with JsonConfig parameter</li> + * <li>If not found and JSON config has actual configuration, throw error</li> + * <li>Otherwise fall back to zero-arg constructor via ServiceLoader</li> + * </ol> + * + * @param componentClass the component class to instantiate + * @param jsonConfig the JSON configuration for the component + * @param classLoader the class loader to use + * @param componentTypeName the component type name (e.g., "Detector", "Parser") for error messages + * @param objectMapper the Jackson ObjectMapper for parsing JSON + * @param <T> the component type + * @return the instantiated component + * @throws TikaConfigException if instantiation fails + */ + @SuppressWarnings("unchecked") + public static <T> T instantiate(Class<?> componentClass, + JsonConfig jsonConfig, + ClassLoader classLoader, + String componentTypeName, + ObjectMapper objectMapper) + throws TikaConfigException { + try { + T component; + + // Try constructor with JsonConfig parameter + try { + Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class); + component = (T) constructor.newInstance(jsonConfig); + } catch (NoSuchMethodException e) { + // Check if JSON config has actual configuration + if (hasConfiguration(jsonConfig, objectMapper)) { + throw new TikaConfigException( + componentTypeName + " '" + componentClass.getName() + "' has configuration in JSON, " + + "but does not have a constructor that accepts JsonConfig. " + + "Please add a constructor: public " + componentClass.getSimpleName() + "(JsonConfig jsonConfig)"); + } + // Fall back to zero-arg constructor if no configuration provided + component = (T) ServiceLoaderUtils.newInstance(componentClass, + new org.apache.tika.config.ServiceLoader(classLoader)); + } + + return component; + } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { + throw new TikaConfigException("Failed to instantiate " + componentTypeName + ": " + + componentClass.getName(), e); + } + } + + /** + * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). + * + * @param jsonConfig the JSON configuration + * @param objectMapper the Jackson ObjectMapper for parsing JSON + * @return true if there's meaningful configuration, false if empty or just "{}" + */ + public static boolean hasConfiguration(JsonConfig jsonConfig, ObjectMapper objectMapper) { + if (jsonConfig == null) { + return false; + } + String json = jsonConfig.json(); + if (json == null || json.trim().isEmpty()) { + return false; + } + // Parse to check if it's an empty object or has actual fields + try { + JsonNode node = objectMapper.readTree(json); + // Check if it's an object and has at least one field + if (node.isObject() && node.size() > 0) { + return true; + } + return false; + } catch (Exception e) { + // If we can't parse it, assume it has configuration to be safe + return true; + } + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java index 7548f67a1..da5d5f59e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java @@ -16,8 +16,6 @@ */ package org.apache.tika.config.loader; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; @@ -32,7 +30,6 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.JsonConfig; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.utils.ServiceLoaderUtils; /** * Generic loader for Tika components (detectors, encoding detectors, filters, etc.). @@ -172,58 +169,10 @@ public class CompositeComponentLoader<T> { } } - @SuppressWarnings("unchecked") private T instantiateComponent(Class<?> componentClass, JsonConfig configJson) throws TikaConfigException { - try { - // Try constructor with JsonConfig parameter - try { - Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class); - return (T) constructor.newInstance(configJson); - } catch (NoSuchMethodException e) { - // Check if JSON config has actual configuration - if (hasConfiguration(configJson)) { - throw new TikaConfigException( - "Component '" + componentClass.getName() + "' has configuration in JSON, " + - "but does not have a constructor that accepts JsonConfig. " + - "Please add a constructor: public " + componentClass.getSimpleName() + "(JsonConfig jsonConfig)"); - } - // Fall back to zero-arg constructor if no configuration provided - return (T) ServiceLoaderUtils.newInstance(componentClass, - new org.apache.tika.config.ServiceLoader(classLoader)); - } - } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("Failed to instantiate component: " + - componentClass.getName(), e); - } - } - - /** - * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). - * - * @param jsonConfig the JSON configuration - * @return true if there's meaningful configuration, false if empty or just "{}" - */ - private boolean hasConfiguration(JsonConfig jsonConfig) { - if (jsonConfig == null) { - return false; - } - String json = jsonConfig.json(); - if (json == null || json.trim().isEmpty()) { - return false; - } - // Parse to check if it's an empty object or has actual fields - try { - JsonNode node = objectMapper.readTree(json); - // Check if it's an object and has at least one field - if (node.isObject() && node.size() > 0) { - return true; - } - return false; - } catch (Exception e) { - // If we can't parse it, assume it has configuration to be safe - return true; - } + return ComponentInstantiator.instantiate(componentClass, configJson, classLoader, + componentTypeName, objectMapper); } private List<T> loadSpiComponents() { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java index 8c63a8867..f2146cfc1 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java @@ -16,8 +16,6 @@ */ package org.apache.tika.config.loader; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -36,7 +34,6 @@ import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.utils.ServiceLoaderUtils; /** * Loader for detectors with support for SPI fallback via "default-detector" marker. @@ -190,63 +187,10 @@ public class DetectorLoader { } } - @SuppressWarnings("unchecked") private Detector instantiateDetector(Class<?> detectorClass, JsonConfig jsonConfig) throws TikaConfigException { - - try { - Detector detector; - - // Try constructor with JsonConfig parameter - try { - Constructor<?> constructor = detectorClass.getConstructor(JsonConfig.class); - detector = (Detector) constructor.newInstance(jsonConfig); - } catch (NoSuchMethodException e) { - // Check if JSON config has actual configuration - if (hasConfiguration(jsonConfig)) { - throw new TikaConfigException( - "Detector '" + detectorClass.getName() + "' has configuration in JSON, " + - "but does not have a constructor that accepts JsonConfig. " + - "Please add a constructor: public " + detectorClass.getSimpleName() + "(JsonConfig jsonConfig)"); - } - // Fall back to zero-arg constructor if no configuration provided - detector = (Detector) ServiceLoaderUtils.newInstance(detectorClass, - new org.apache.tika.config.ServiceLoader(classLoader)); - } - - return detector; - } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("Failed to instantiate detector: " + - detectorClass.getName(), e); - } - } - - /** - * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). - * - * @param jsonConfig the JSON configuration - * @return true if there's meaningful configuration, false if empty or just "{}" - */ - private boolean hasConfiguration(JsonConfig jsonConfig) { - if (jsonConfig == null) { - return false; - } - String json = jsonConfig.json(); - if (json == null || json.trim().isEmpty()) { - return false; - } - // Parse to check if it's an empty object or has actual fields - try { - JsonNode node = objectMapper.readTree(json); - // Check if it's an object and has at least one field - if (node.isObject() && node.size() > 0) { - return true; - } - return false; - } catch (Exception e) { - // If we can't parse it, assume it has configuration to be safe - return true; - } + return ComponentInstantiator.instantiate(detectorClass, jsonConfig, classLoader, + "Detector", objectMapper); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java index 4a46f595d..35079dfeb 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java @@ -16,8 +16,6 @@ */ package org.apache.tika.config.loader; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -36,7 +34,6 @@ import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.utils.ServiceLoaderUtils; /** * Loader for encoding detectors with support for SPI fallback via "default-encoding-detector" marker. @@ -179,63 +176,10 @@ public class EncodingDetectorLoader { } } - @SuppressWarnings("unchecked") private EncodingDetector instantiateEncodingDetector(Class<?> detectorClass, JsonConfig jsonConfig) throws TikaConfigException { - - try { - EncodingDetector detector; - - // Try constructor with JsonConfig parameter - try { - Constructor<?> constructor = detectorClass.getConstructor(JsonConfig.class); - detector = (EncodingDetector) constructor.newInstance(jsonConfig); - } catch (NoSuchMethodException e) { - // Check if JSON config has actual configuration - if (hasConfiguration(jsonConfig)) { - throw new TikaConfigException( - "Encoding detector '" + detectorClass.getName() + "' has configuration in JSON, " + - "but does not have a constructor that accepts JsonConfig. " + - "Please add a constructor: public " + detectorClass.getSimpleName() + "(JsonConfig jsonConfig)"); - } - // Fall back to zero-arg constructor if no configuration provided - detector = (EncodingDetector) ServiceLoaderUtils.newInstance(detectorClass, - new org.apache.tika.config.ServiceLoader(classLoader)); - } - - return detector; - } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("Failed to instantiate encoding detector: " + - detectorClass.getName(), e); - } - } - - /** - * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). - * - * @param jsonConfig the JSON configuration - * @return true if there's meaningful configuration, false if empty or just "{}" - */ - private boolean hasConfiguration(JsonConfig jsonConfig) { - if (jsonConfig == null) { - return false; - } - String json = jsonConfig.json(); - if (json == null || json.trim().isEmpty()) { - return false; - } - // Parse to check if it's an empty object or has actual fields - try { - JsonNode node = objectMapper.readTree(json); - // Check if it's an object and has at least one field - if (node.isObject() && node.size() > 0) { - return true; - } - return false; - } catch (Exception e) { - // If we can't parse it, assume it has configuration to be safe - return true; - } + return ComponentInstantiator.instantiate(detectorClass, jsonConfig, classLoader, + "EncodingDetector", objectMapper); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java index 177cacce7..786d2e9bb 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java @@ -224,7 +224,7 @@ public class ParserLoader { parser = (Parser) constructor.newInstance(jsonConfig); } catch (NoSuchMethodException e) { // Check if JSON config has actual configuration - if (hasConfiguration(jsonConfig)) { + if (ComponentInstantiator.hasConfiguration(jsonConfig, objectMapper)) { throw new TikaConfigException( "Parser '" + parserClass.getName() + "' has configuration in JSON, " + "but does not have a constructor that accepts JsonConfig. " + @@ -260,34 +260,6 @@ public class ParserLoader { } } - /** - * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). - * - * @param jsonConfig the JSON configuration - * @return true if there's meaningful configuration, false if empty or just "{}" - */ - private boolean hasConfiguration(JsonConfig jsonConfig) { - if (jsonConfig == null) { - return false; - } - String json = jsonConfig.json(); - if (json == null || json.trim().isEmpty()) { - return false; - } - // Parse to check if it's an empty object or has actual fields - try { - JsonNode node = objectMapper.readTree(json); - // Check if it's an object and has at least one field - if (node.isObject() && node.size() > 0) { - return true; - } - return false; - } catch (Exception e) { - // If we can't parse it, assume it has configuration to be safe - return true; - } - } - private Parser applyMimeFiltering(Parser parser, FrameworkConfig.ParserDecoration decoration) { List<String> includes = decoration.getMimeInclude(); List<String> excludes = decoration.getMimeExclude(); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java index b920d7cd0..77f15dab6 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java @@ -92,6 +92,10 @@ public class PolymorphicObjectMapperFactory { //Need to allow creation of classes without setters/getters -- we may want to revisit this mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); + mapper.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true); + + + // Build polymorphic type validator BasicPolymorphicTypeValidator.Builder builder = BasicPolymorphicTypeValidator.builder() .allowIfSubType("org.apache.tika.") diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 1f81802e3..67b57cc69 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -27,6 +27,7 @@ import org.apache.tika.config.GlobalSettings; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; @@ -86,6 +87,7 @@ public class TikaLoader { private EncodingDetector encodingDetectors; private MetadataFilter metadataFilter; private Renderer renderers; + private Translator translator; private ConfigLoader configLoader; private GlobalSettings globalSettings; @@ -285,6 +287,23 @@ public class TikaLoader { return renderers; } + /** + * Loads and returns the translator. + * If "translator" section exists in config, uses that translator. + * If section missing, uses SPI to discover translator. + * Results are cached - subsequent calls return the same instance. + * + * @return the translator + * @throws TikaConfigException if loading fails + */ + public synchronized Translator loadTranslator() throws TikaConfigException { + if (translator == null) { + TranslatorLoader loader = new TranslatorLoader(classLoader, objectMapper); + translator = loader.load(config); + } + return translator; + } + /** * Loads and returns an AutoDetectParser configured with this loader's parsers and detectors. * Results are cached - subsequent calls return the same instance. diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java new file mode 100644 index 000000000..b84905e2c --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.JsonConfig; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.language.translate.DefaultTranslator; +import org.apache.tika.language.translate.Translator; + +/** + * Loader for translators. + * Only one translator is supported at a time. + */ +public class TranslatorLoader { + + private static final Logger LOG = LoggerFactory.getLogger(TranslatorLoader.class); + + private final ClassLoader classLoader; + private final ObjectMapper objectMapper; + + public TranslatorLoader(ClassLoader classLoader, ObjectMapper objectMapper) { + this.classLoader = classLoader; + this.objectMapper = objectMapper; + } + + /** + * Loads a translator from JSON config. + * <p> + * If "translator" section exists in config, uses that translator. + * If section missing, uses DefaultTranslator to discover translator via SPI. + * + * @param config the Tika JSON configuration + * @return the translator + * @throws TikaConfigException if loading fails + */ + public Translator load(TikaJsonConfig config) throws TikaConfigException { + // Check if translator section exists in config + if (config.hasComponentSection("translator")) { + JsonNode translatorNode = config.getRootNode().get("translator"); + return loadConfiguredTranslator(translatorNode); + } else { + // No configured translator - use DefaultTranslator to load from SPI + return createDefaultTranslator(); + } + } + + private Translator loadConfiguredTranslator(JsonNode translatorNode) + throws TikaConfigException { + try { + // The translator node should be an object with a "class" field + if (!translatorNode.has("class")) { + throw new TikaConfigException("Translator configuration must have a 'class' field"); + } + + String className = translatorNode.get("class").asText(); + ComponentRegistry registry = new ComponentRegistry("translators", classLoader); + Class<?> translatorClass = registry.getComponentClass(className); + + // Remove "class" field from config before extraction + ObjectNode configCopy = ((ObjectNode) translatorNode).deepCopy(); + configCopy.remove("class"); + + // Extract framework config (e.g., _decorate if present) + FrameworkConfig frameworkConfig = FrameworkConfig.extract(configCopy, objectMapper); + + // Instantiate translator + return instantiateTranslator(translatorClass, frameworkConfig.getComponentConfigJson()); + + } catch (Exception e) { + throw new TikaConfigException("Failed to load translator", e); + } + } + + private Translator instantiateTranslator(Class<?> translatorClass, JsonConfig jsonConfig) + throws TikaConfigException { + return ComponentInstantiator.instantiate(translatorClass, jsonConfig, classLoader, + "Translator", objectMapper); + } + + /** + * Creates a DefaultTranslator that loads a translator from SPI. + * + * @return the DefaultTranslator with SPI-loaded translator + */ + private DefaultTranslator createDefaultTranslator() { + return new DefaultTranslator(new org.apache.tika.config.ServiceLoader(classLoader)); + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java index 168e673dd..44c145418 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java @@ -29,6 +29,8 @@ import java.nio.file.Path; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; +import org.apache.tika.language.translate.EmptyTranslator; +import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; @@ -315,4 +317,45 @@ public class TikaLoaderTest { .contains(MediaType.parse("application/test+optin")), "Should NOT support application/test+optin (opt-in only, not in SPI)"); } + + @Test + public void testTranslatorLoading() throws Exception { + URL configUrl = getClass().getResource("/configs/test-translator-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Translator translator = loader.loadTranslator(); + + assertNotNull(translator, "Translator should not be null"); + assertTrue(translator instanceof EmptyTranslator, "Should be EmptyTranslator"); + assertTrue(translator.isAvailable(), "Translator should be available"); + } + + @Test + public void testTranslatorLazyLoading() throws Exception { + URL configUrl = getClass().getResource("/configs/test-translator-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + + // Load translator + Translator translator1 = loader.loadTranslator(); + assertNotNull(translator1, "First load should return translator"); + + // Load again - should return cached instance + Translator translator2 = loader.loadTranslator(); + assertTrue(translator1 == translator2, "Should return same cached instance"); + } + + @Test + public void testDefaultTranslatorWhenNotConfigured() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Translator translator = loader.loadTranslator(); + + assertNotNull(translator, "Translator should not be null"); + // Should be DefaultTranslator since no translator configured in test-loader-config.json + } } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java index f1bbd711d..c69968369 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java @@ -35,7 +35,6 @@ import org.apache.tika.server.core.resource.TranslateResource; import org.apache.tika.server.core.writer.TarWriter; import org.apache.tika.server.core.writer.ZipWriter; -@Disabled("until we get translators working") public class TranslateResourceTest extends CXFTestBase { private static final String TRANSLATE_PATH = "/translate"; diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml index 3ab74ad8c..51715188f 100644 --- a/tika-translate/pom.xml +++ b/tika-translate/pom.xml @@ -41,6 +41,16 @@ <version>${project.version}</version> <scope>provided</scope> </dependency> + + <!-- Annotation processor - contains @TikaComponent and ensures build order. + "provided" because it is only used at compile time --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-langdetect-optimaize</artifactId> diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/CachedTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/CachedTranslator.java index 1356e5f55..7aeb5c557 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/CachedTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/CachedTranslator.java @@ -22,6 +22,7 @@ import java.util.HashMap; import com.fasterxml.jackson.databind.util.LRUMap; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.translate.Translator; @@ -29,6 +30,7 @@ import org.apache.tika.language.translate.Translator; /** * CachedTranslator. Saves a map of previous translations in order to prevent repetitive translation requests. */ +@TikaComponent public class CachedTranslator extends AbstractTranslator { private static final int INITIAL_ENTRIES = 100; private static final int MAX_ENTRIES = 1000; diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/GoogleTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/GoogleTranslator.java index b9db911fd..540c78039 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/GoogleTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/GoogleTranslator.java @@ -33,6 +33,7 @@ import org.apache.cxf.jaxrs.client.WebClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; /** @@ -43,6 +44,7 @@ import org.apache.tika.exception.TikaException; * from <a href="http://hayageek.com">hayageek.com</a>. Set your API key in * translator.google.properties. */ +@TikaComponent public class GoogleTranslator extends AbstractTranslator { private static final Logger LOG = LoggerFactory.getLogger(GoogleTranslator.class); diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/JoshuaNetworkTranslator.java index 4676954f9..e2af6ecb0 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/JoshuaNetworkTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/JoshuaNetworkTranslator.java @@ -40,6 +40,7 @@ import org.apache.cxf.jaxrs.client.WebClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; /** @@ -62,6 +63,7 @@ import org.apache.tika.exception.TikaException; * Joshua requires input to be pre-formatted into sentences, one per line, * so this translation implementation takes care of that. */ +@TikaComponent public class JoshuaNetworkTranslator extends AbstractTranslator { private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class); diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/Lingo24Translator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/Lingo24Translator.java index 970c17d1f..195c71cec 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/Lingo24Translator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/Lingo24Translator.java @@ -33,6 +33,7 @@ import org.apache.cxf.jaxrs.client.WebClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; /** @@ -41,6 +42,7 @@ import org.apache.tika.exception.TikaException; * You can sign up for an access plan online on the <a href="https://developer.lingo24.com/plans">Lingo24 Developer Portal</a> * and set your Application's User Key in the <code>translator.lingo24.properties</code> file. */ +@TikaComponent public class Lingo24Translator extends AbstractTranslator { private static final Logger LOG = LoggerFactory.getLogger(Lingo24Translator.class); diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MarianTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MarianTranslator.java index c8c84bd7e..92f35e28f 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MarianTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MarianTranslator.java @@ -47,6 +47,7 @@ import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.utils.StringUtils; @@ -57,6 +58,7 @@ import org.apache.tika.utils.StringUtils; * * @link https://marian-nmt.github.io/. */ +@TikaComponent public class MarianTranslator extends AbstractTranslator { private static final Logger LOG = LoggerFactory.getLogger(MarianTranslator.class); diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MicrosoftTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MicrosoftTranslator.java index f6c9add8e..68ed55a90 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MicrosoftTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MicrosoftTranslator.java @@ -26,6 +26,7 @@ import com.memetix.mst.translate.Translate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.language.translate.Translator; @@ -35,6 +36,7 @@ import org.apache.tika.language.translate.Translator; * * @since Tika 1.6 */ +@TikaComponent public class MicrosoftTranslator implements Translator { public static final String PROPERTIES_FILE = "translator.microsoft.properties"; diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MosesTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MosesTranslator.java index 4cc1aae6b..522de7c32 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MosesTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/MosesTranslator.java @@ -27,12 +27,14 @@ import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.Properties; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; /** * Translator that uses the Moses decoder for translation. * Users must install the Moses system before using this Translator. @link http://www.statmt.org/moses/. */ +@TikaComponent public class MosesTranslator extends ExternalTranslator { private static final String DEFAULT_PATH = "dummy-path"; diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/RTGTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/RTGTranslator.java index 618c5e88f..c6d4d7d66 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/RTGTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/RTGTranslator.java @@ -39,6 +39,7 @@ import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; @@ -72,6 +73,7 @@ import org.apache.tika.exception.TikaException; * RTG requires input to be pre-formatted into sentences, one per line, * so this translation implementation takes care of that. */ +@TikaComponent public class RTGTranslator extends AbstractTranslator { public static final String RTG_TRANSLATE_URL_BASE = "http://localhost:6060"; diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/YandexTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/YandexTranslator.java index c83c59102..25d48436c 100644 --- a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/YandexTranslator.java +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/YandexTranslator.java @@ -34,6 +34,7 @@ import org.apache.cxf.jaxrs.client.WebClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.language.translate.Translator; @@ -42,6 +43,7 @@ import org.apache.tika.language.translate.Translator; * You can sign up for free access online on the <a href="https://tech.yandex.com/key/form.xml?service=trnsl">API Key form</a> * and set your Application's User Key in the <code>translator.yandex.properties</code> file. */ +@TikaComponent public class YandexTranslator implements Translator { private static final Logger LOG = LoggerFactory.getLogger(YandexTranslator.class); diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator deleted file mode 100644 index 71cc28df9..000000000 --- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.tika.language.translate.impl.MicrosoftTranslator -org.apache.tika.language.translate.impl.GoogleTranslator -org.apache.tika.language.translate.impl.Lingo24Translator -org.apache.tika.language.translate.impl.CachedTranslator -org.apache.tika.language.translate.impl.JoshuaNetworkTranslator -org.apache.tika.language.translate.impl.RTGTranslator -org.apache.tika.language.translate.impl.MarianTranslator
