This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545-loaders in repository https://gitbox.apache.org/repos/asf/tika.git
commit 051782b1c69d6fe44de78ddcfee33b51b9a8efde Author: tallison <[email protected]> AuthorDate: Wed Nov 26 12:51:26 2025 -0500 TIKA-4545 -- integrate TikaJsonConfig across core and pipes This commit unifies JSON configuration handling between core Tika (parsers, detectors) and tika-pipes (fetchers, emitters) by: 1. Adding the tika-serialization dependency to tika-plugins-core 2. Refactoring TikaConfigs to wrap TikaJsonConfig 3. Restricting TikaConfigs validation to pipes-specific keys 4. Letting TikaPluginManager accept a TikaJsonConfig for unified config 5. Adding deserialize() and hasKey() helper methods to TikaJsonConfig 6. Deleting the obsolete ExtensionConfigs.java Benefits: - Single source of truth for JSON parsing - Core and pipes can share the same config file - Each component validates only its own keys - Reduced code duplication Architecture: TikaJsonConfig (tika-serialization) ├── TikaLoader (validates: parsers, detectors, etc.) └── TikaConfigs → TikaPluginManager (validates: fetchers, emitters, etc.) --- tika-plugins-core/pom.xml | 5 + .../org/apache/tika/plugins/ExtensionConfigs.java | 66 ---------- .../java/org/apache/tika/plugins/TikaConfigs.java | 134 ++++++++++++++++----- .../org/apache/tika/plugins/TikaPluginManager.java | 56 +++++++-- .../apache/tika/config/loader/TikaJsonConfig.java | 66 +++++++++- 5 files changed, 220 insertions(+), 107 deletions(-) diff --git a/tika-plugins-core/pom.xml b/tika-plugins-core/pom.xml index c6fc4368a..9051943e4 100644 --- a/tika-plugins-core/pom.xml +++ b/tika-plugins-core/pom.xml @@ -36,6 +36,11 @@ <version>${project.version}</version> <scope>provided</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>org.pf4j</groupId> <artifactId>pf4j</artifactId> diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java deleted file mode 100644 index b8dfb6405..000000000 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.plugins; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; - -public class ExtensionConfigs { - - Map<String, ExtensionConfig> idToConfig = new HashMap<>(); - Map<String, List<ExtensionConfig>> extensionIdsToConfig = new HashMap<>(); - - public ExtensionConfigs() { - - } - - public ExtensionConfigs(Map<String, ExtensionConfig> map) { - for (ExtensionConfig c : map.values()) { - add(c); - } - } - - public void add(ExtensionConfig extensionConfig) { - if (idToConfig.containsKey(extensionConfig.id())) { - throw new IllegalArgumentException("Can't overwrite existing extension config for extensionName: " + extensionConfig.name()); - } - idToConfig.put(extensionConfig.id(), extensionConfig); - extensionIdsToConfig - .computeIfAbsent(extensionConfig.name(), k -> new ArrayList<>()).add(extensionConfig); - } - - public Optional<ExtensionConfig> getById(String id) { - return Optional.ofNullable(idToConfig.get(id)); - } - - public List<ExtensionConfig> getByExtensionName(String extensionName) { - List<ExtensionConfig> configs = extensionIdsToConfig.get(extensionName); - if (configs == null) { - return List.of(); - } - return configs; - } - - public Set<String> ids() { - return idToConfig.keySet(); - } - -} diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java index cbc1231a9..0b7f80df6 100644 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java +++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java @@ -16,13 +16,7 @@ */ package org.apache.tika.plugins; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; import 
java.nio.file.Path; import java.util.Iterator; import java.util.Set; @@ -31,14 +25,22 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; /** - * Loads and validates Tika plugin configuration from JSON. + * Loads and validates Tika pipes/plugin configuration from JSON. + * <p> + * This class validates pipes-specific configuration keys and delegates to + * {@link TikaJsonConfig} for parsing. Core Tika keys (parsers, detectors, etc.) + * are ignored by this validator - they are handled by TikaLoader. */ public class TikaConfigs { - private static final Set<String> KNOWN_ROOT_KEYS = Set.of( + /** + * Pipes-specific configuration keys. + */ + private static final Set<String> PIPES_KEYS = Set.of( "fetchers", "emitters", "pipes-iterator", @@ -47,50 +49,126 @@ public class TikaConfigs { "plugin-roots" ); + /** + * Core Tika configuration keys (handled by TikaLoader, not validated here). + */ + private static final Set<String> CORE_TIKA_KEYS = Set.of( + "parsers", + "detectors", + "encoding-detectors", + "encodingDetectors", + "metadata-filters", + "metadataFilters", + "renderers", + "translators", + "auto-detect-parser-config", + "autoDetectParserConfig" + ); + static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true); - public static TikaConfigs load(InputStream is) throws IOException, TikaConfigException { - try (Reader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { - TikaConfigs configs = new TikaConfigs(OBJECT_MAPPER.readTree(reader)); - configs.validateNoUnknownKeys(); - return configs; - } + private final TikaJsonConfig tikaJsonConfig; + + /** + * Loads pipes configuration from a pre-parsed TikaJsonConfig. 
+ * This is the preferred method when sharing configuration across + * core Tika and pipes components. + * + * @param tikaJsonConfig the pre-parsed JSON configuration + * @return the pipes configuration + * @throws TikaConfigException if validation fails + */ + public static TikaConfigs load(TikaJsonConfig tikaJsonConfig) throws TikaConfigException { + TikaConfigs configs = new TikaConfigs(tikaJsonConfig); + configs.validatePipesKeys(); + return configs; } + + /** + * Loads pipes configuration from a file. + * For backwards compatibility - prefer {@link #load(TikaJsonConfig)} when possible. + * + * @param path the path to the JSON configuration file + * @return the pipes configuration + * @throws IOException if reading fails + * @throws TikaConfigException if validation fails + */ public static TikaConfigs load(Path path) throws IOException, TikaConfigException { - try (InputStream is = Files.newInputStream(path)) { - return load(is); - } + TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(path); + return load(tikaJsonConfig); } - private final JsonNode root; - private TikaConfigs(JsonNode root) { - this.root = root; + private TikaConfigs(TikaJsonConfig tikaJsonConfig) { + this.tikaJsonConfig = tikaJsonConfig; } + /** + * Gets the underlying TikaJsonConfig. + * + * @return the TikaJsonConfig + */ + public TikaJsonConfig getTikaJsonConfig() { + return tikaJsonConfig; + } + + /** + * Gets the root JSON node. + * Deprecated - use {@link #getTikaJsonConfig()} instead. + * + * @return the root JSON node + */ + @Deprecated public JsonNode getRoot() { - return root; + return tikaJsonConfig.getRootNode(); } + /** + * Deserializes a configuration value for the given key. 
+ * + * @param clazz the target class + * @param key the configuration key + * @param <T> the type to deserialize to + * @return the deserialized value + * @throws IOException if deserialization fails + */ public <T> T deserialize(Class<T> clazz, String key) throws IOException { - return OBJECT_MAPPER.treeToValue(root.get(key), clazz); + return tikaJsonConfig.deserialize(key, clazz); } /** - * Validates that the config contains no unknown root-level keys. + * Validates that pipes-specific keys are correct. * This catches typos like "pipes-reporter" instead of "pipes-reporters". * <p> + * Core Tika keys (parsers, detectors, etc.) are ignored - they are + * validated by TikaLoader. + * <p> * Keys prefixed with "x-" are allowed for custom extensions. * - * @throws TikaConfigException if unknown keys are found + * @throws TikaConfigException if unknown pipes keys are found */ - private void validateNoUnknownKeys() throws TikaConfigException { + private void validatePipesKeys() throws TikaConfigException { + JsonNode root = tikaJsonConfig.getRootNode(); Iterator<String> fieldNames = root.fieldNames(); while (fieldNames.hasNext()) { String key = fieldNames.next(); - if (!KNOWN_ROOT_KEYS.contains(key) && !key.startsWith("x-")) { - throw new TikaConfigException("Unknown config key: '" + key + - "'. Valid keys: " + KNOWN_ROOT_KEYS + " (or use 'x-' prefix for custom keys)"); + + // Ignore core Tika keys - TikaLoader validates those + if (CORE_TIKA_KEYS.contains(key)) { + continue; + } + + // Ignore custom extension keys + if (key.startsWith("x-")) { + continue; + } + + // Must be a known pipes key + if (!PIPES_KEYS.contains(key)) { + throw new TikaConfigException("Unknown pipes config key: '" + key + + "'. Valid pipes keys: " + PIPES_KEYS + + " (or use 'x-' prefix for custom keys). 
" + + "Core Tika keys like 'parsers', 'detectors' should be configured separately."); } } } diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java index ac52d0da6..df23e078c 100644 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java +++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java @@ -18,7 +18,6 @@ package org.apache.tika.plugins; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; @@ -31,31 +30,66 @@ import org.pf4j.ExtensionFinder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; +/** + * PF4J-based plugin manager for Tika pipes components. + * <p> + * This manager loads plugins from configured plugin root directories and + * makes their extensions available for discovery. + */ public class TikaPluginManager extends DefaultPluginManager { - private static final Logger LOG = LoggerFactory.getLogger(TikaPluginManager.class); - public static TikaPluginManager load(Path p) throws TikaConfigException, IOException { - try (InputStream is = Files.newInputStream(p)) { - return load(is); - } + /** + * Loads plugin manager from a pre-parsed TikaJsonConfig. + * This is the preferred method when sharing configuration across + * core Tika and pipes components. 
+ * + * @param tikaJsonConfig the pre-parsed JSON configuration + * @return the plugin manager + * @throws TikaConfigException if configuration is invalid + * @throws IOException if plugin initialization fails + */ + public static TikaPluginManager load(TikaJsonConfig tikaJsonConfig) + throws TikaConfigException, IOException { + TikaConfigs tikaConfigs = TikaConfigs.load(tikaJsonConfig); + return load(tikaConfigs); } - public static TikaPluginManager load(InputStream is) throws TikaConfigException, IOException { - return load(TikaConfigs.load(is)); + /** + * Loads plugin manager from a configuration file. + * For backwards compatibility - prefer {@link #load(TikaJsonConfig)} when possible. + * + * @param configPath the path to the JSON configuration file + * @return the plugin manager + * @throws TikaConfigException if configuration is invalid + * @throws IOException if reading or plugin initialization fails + */ + public static TikaPluginManager load(Path configPath) throws TikaConfigException, IOException { + TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath); + return load(tikaJsonConfig); } - public static TikaPluginManager load(TikaConfigs tikaConfigs) throws TikaConfigException, IOException { + /** + * Loads plugin manager from a TikaConfigs instance. 
+ * + * @param tikaConfigs the pipes configuration + * @return the plugin manager + * @throws TikaConfigException if configuration is invalid + * @throws IOException if plugin initialization fails + */ + public static TikaPluginManager load(TikaConfigs tikaConfigs) + throws TikaConfigException, IOException { JsonNode root = tikaConfigs.getRoot(); JsonNode pluginRoots = root.get("plugin-roots"); if (pluginRoots == null) { throw new TikaConfigException("plugin-roots must be specified"); } - List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots, new TypeReference<List<Path>>() { - }); + List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots, + new TypeReference<List<Path>>() {}); if (roots.isEmpty()) { throw new TikaConfigException("plugin-roots must not be empty"); } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index fe8464dd3..4ab35a0ae 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -36,9 +36,37 @@ import org.apache.tika.exception.TikaConfigException; * Parsed representation of a Tika JSON configuration file. * Provides access to component configurations by type (parsers, detectors, etc.). * - * <p>JSON structure: + * <p>This class serves as the single source of truth for JSON parsing across + * core Tika (parsers, detectors) and tika-pipes (fetchers, emitters) components. + * It performs no validation - consumers validate only their own keys. 
+ * + * <p><b>Unified Configuration Usage:</b> + * <pre> + * // Parse config once + * TikaJsonConfig jsonConfig = TikaJsonConfig.load(Paths.get("config.json")); + * + * // Load core Tika components (same classloader) + * TikaLoader tikaLoader = TikaLoader.load(jsonConfig); + * Parser parser = tikaLoader.loadParsers(); + * Detector detector = tikaLoader.loadDetectors(); + * + * // Load pipes/plugin components (different classloader) + * TikaPluginManager pluginManager = TikaPluginManager.load(jsonConfig); + * pluginManager.loadPlugins(); + * pluginManager.startPlugins(); + * + * // Extract config for plugins (crosses classloader boundary as string) + * JsonNode fetchersNode = jsonConfig.getRootNode().get("fetchers"); + * if (fetchersNode != null) { + * String fetcherConfigJson = fetchersNode.toString(); + * // Pass string to plugin - safe across classloader boundary + * } + * </pre> + * + * <p><b>JSON structure:</b> * <pre> * { + * // Core Tika components (validated by TikaLoader) * "parsers": [ * { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } }, * { "html-parser": { ... } }, @@ -48,7 +76,14 @@ import org.apache.tika.exception.TikaConfigException; * { "mime-magic-detector": {} }, * { "zip-container-detector": { "maxDepth": 10 } } * ], - * ... + * + * // Pipes components (validated by TikaConfigs) + * "plugin-roots": ["/path/to/plugins"], + * "fetchers": [...], + * "emitters": [...], + * + * // Custom extensions (prefix with x-) + * "x-my-custom-config": { ... } * } * </pre> * @@ -247,6 +282,33 @@ public class TikaJsonConfig { return result; } + /** + * Deserializes a configuration value for the given key. 
+ * + * @param key the configuration key + * @param clazz the target class + * @param <T> the type to deserialize to + * @return the deserialized value, or null if key doesn't exist + * @throws IOException if deserialization fails + */ + public <T> T deserialize(String key, Class<T> clazz) throws IOException { + JsonNode node = rootNode.get(key); + if (node == null || node.isNull()) { + return null; + } + return OBJECT_MAPPER.treeToValue(node, clazz); + } + + /** + * Checks if a configuration key exists. + * + * @param key the configuration key + * @return true if the key exists and is not null + */ + public boolean hasKey(String key) { + return rootNode.has(key) && !rootNode.get(key).isNull(); + } + /** * Gets the ObjectMapper used for JSON processing. *
