This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4545-loaders
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 051782b1c69d6fe44de78ddcfee33b51b9a8efde
Author: tallison <[email protected]>
AuthorDate: Wed Nov 26 12:51:26 2025 -0500

    TIKA-4545 -- integrate TikaJsonConfig across core and pipes
    
    This commit unifies JSON configuration handling between core Tika
    (parsers, detectors) and tika-pipes (fetchers, emitters). Changes:
    
    1. Added tika-serialization dependency to tika-plugins-core
    2. Refactored TikaConfigs to wrap TikaJsonConfig
    3. TikaConfigs now validates only pipes-specific keys
    4. TikaPluginManager accepts TikaJsonConfig for unified config
    5. Added deserialize() and hasKey() helper methods to TikaJsonConfig
    6. Deleted obsolete ExtensionConfigs.java
    
    Benefits:
    - Single source of truth for JSON parsing
    - Core and pipes can share the same config file
    - Each component validates only its own keys
    - Reduced code duplication
    
    Architecture:
    TikaJsonConfig (tika-serialization)
        ├── TikaLoader (validates: parsers, detectors, etc.)
        └── TikaConfigs → TikaPluginManager (validates: fetchers, emitters, etc.)
---
 tika-plugins-core/pom.xml                          |   5 +
 .../org/apache/tika/plugins/ExtensionConfigs.java  |  66 ----------
 .../java/org/apache/tika/plugins/TikaConfigs.java  | 134 ++++++++++++++++-----
 .../org/apache/tika/plugins/TikaPluginManager.java |  56 +++++++--
 .../apache/tika/config/loader/TikaJsonConfig.java  |  66 +++++++++-
 5 files changed, 220 insertions(+), 107 deletions(-)

diff --git a/tika-plugins-core/pom.xml b/tika-plugins-core/pom.xml
index c6fc4368a..9051943e4 100644
--- a/tika-plugins-core/pom.xml
+++ b/tika-plugins-core/pom.xml
@@ -36,6 +36,11 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.pf4j</groupId>
       <artifactId>pf4j</artifactId>
diff --git 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java
deleted file mode 100644
index b8dfb6405..000000000
--- 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ExtensionConfigs.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.plugins;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.Set;
-
-public class ExtensionConfigs {
-
-    Map<String, ExtensionConfig> idToConfig = new HashMap<>();
-    Map<String, List<ExtensionConfig>> extensionIdsToConfig = new HashMap<>();
-
-    public ExtensionConfigs() {
-
-    }
-
-    public ExtensionConfigs(Map<String, ExtensionConfig> map) {
-        for (ExtensionConfig c : map.values()) {
-            add(c);
-        }
-    }
-
-    public void add(ExtensionConfig extensionConfig) {
-        if (idToConfig.containsKey(extensionConfig.id())) {
-            throw new IllegalArgumentException("Can't overwrite existing 
extension config for extensionName: " + extensionConfig.name());
-        }
-        idToConfig.put(extensionConfig.id(), extensionConfig);
-        extensionIdsToConfig
-                .computeIfAbsent(extensionConfig.name(), k -> new 
ArrayList<>()).add(extensionConfig);
-    }
-
-    public Optional<ExtensionConfig> getById(String id) {
-        return Optional.ofNullable(idToConfig.get(id));
-    }
-
-    public List<ExtensionConfig> getByExtensionName(String extensionName) {
-        List<ExtensionConfig> configs = 
extensionIdsToConfig.get(extensionName);
-        if (configs == null) {
-            return List.of();
-        }
-        return configs;
-    }
-
-    public Set<String> ids() {
-        return idToConfig.keySet();
-    }
-
-}
diff --git 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java
index cbc1231a9..0b7f80df6 100644
--- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java
+++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java
@@ -16,13 +16,7 @@
  */
 package org.apache.tika.plugins;
 
-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Iterator;
 import java.util.Set;
@@ -31,14 +25,22 @@ import 
com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import org.apache.tika.config.loader.TikaJsonConfig;
 import org.apache.tika.exception.TikaConfigException;
 
 /**
- * Loads and validates Tika plugin configuration from JSON.
+ * Loads and validates Tika pipes/plugin configuration from JSON.
+ * <p>
+ * This class validates pipes-specific configuration keys and delegates to
+ * {@link TikaJsonConfig} for parsing. Core Tika keys (parsers, detectors, 
etc.)
+ * are ignored by this validator - they are handled by TikaLoader.
  */
 public class TikaConfigs {
 
-    private static final Set<String> KNOWN_ROOT_KEYS = Set.of(
+    /**
+     * Pipes-specific configuration keys.
+     */
+    private static final Set<String> PIPES_KEYS = Set.of(
             "fetchers",
             "emitters",
             "pipes-iterator",
@@ -47,50 +49,126 @@ public class TikaConfigs {
             "plugin-roots"
     );
 
+    /**
+     * Core Tika configuration keys (handled by TikaLoader, not validated 
here).
+     */
+    private static final Set<String> CORE_TIKA_KEYS = Set.of(
+            "parsers",
+            "detectors",
+            "encoding-detectors",
+            "encodingDetectors",
+            "metadata-filters",
+            "metadataFilters",
+            "renderers",
+            "translators",
+            "auto-detect-parser-config",
+            "autoDetectParserConfig"
+    );
+
     static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
             .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, 
true);
 
-    public static TikaConfigs load(InputStream is) throws IOException, 
TikaConfigException {
-        try (Reader reader = new BufferedReader(new InputStreamReader(is, 
StandardCharsets.UTF_8))) {
-            TikaConfigs configs = new 
TikaConfigs(OBJECT_MAPPER.readTree(reader));
-            configs.validateNoUnknownKeys();
-            return configs;
-        }
+    private final TikaJsonConfig tikaJsonConfig;
+
+    /**
+     * Loads pipes configuration from a pre-parsed TikaJsonConfig.
+     * This is the preferred method when sharing configuration across
+     * core Tika and pipes components.
+     *
+     * @param tikaJsonConfig the pre-parsed JSON configuration
+     * @return the pipes configuration
+     * @throws TikaConfigException if validation fails
+     */
+    public static TikaConfigs load(TikaJsonConfig tikaJsonConfig) throws 
TikaConfigException {
+        TikaConfigs configs = new TikaConfigs(tikaJsonConfig);
+        configs.validatePipesKeys();
+        return configs;
     }
+
+    /**
+     * Loads pipes configuration from a file.
+     * For backwards compatibility - prefer {@link #load(TikaJsonConfig)} when 
possible.
+     *
+     * @param path the path to the JSON configuration file
+     * @return the pipes configuration
+     * @throws IOException if reading fails
+     * @throws TikaConfigException if validation fails
+     */
     public static TikaConfigs load(Path path) throws IOException, 
TikaConfigException {
-        try (InputStream is = Files.newInputStream(path)) {
-            return load(is);
-        }
+        TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(path);
+        return load(tikaJsonConfig);
     }
-    private final JsonNode root;
 
-    private TikaConfigs(JsonNode root) {
-        this.root = root;
+    private TikaConfigs(TikaJsonConfig tikaJsonConfig) {
+        this.tikaJsonConfig = tikaJsonConfig;
     }
 
+    /**
+     * Gets the underlying TikaJsonConfig.
+     *
+     * @return the TikaJsonConfig
+     */
+    public TikaJsonConfig getTikaJsonConfig() {
+        return tikaJsonConfig;
+    }
+
+    /**
+     * Gets the root JSON node.
+     * Deprecated - use {@link #getTikaJsonConfig()} instead.
+     *
+     * @return the root JSON node
+     */
+    @Deprecated
     public JsonNode getRoot() {
-        return root;
+        return tikaJsonConfig.getRootNode();
     }
 
+    /**
+     * Deserializes a configuration value for the given key.
+     *
+     * @param clazz the target class
+     * @param key the configuration key
+     * @param <T> the type to deserialize to
+     * @return the deserialized value
+     * @throws IOException if deserialization fails
+     */
     public <T> T deserialize(Class<T> clazz, String key) throws IOException {
-        return OBJECT_MAPPER.treeToValue(root.get(key), clazz);
+        return tikaJsonConfig.deserialize(key, clazz);
     }
 
     /**
-     * Validates that the config contains no unknown root-level keys.
+     * Validates that pipes-specific keys are correct.
      * This catches typos like "pipes-reporter" instead of "pipes-reporters".
      * <p>
+     * Core Tika keys (parsers, detectors, etc.) are ignored - they are
+     * validated by TikaLoader.
+     * <p>
      * Keys prefixed with "x-" are allowed for custom extensions.
      *
-     * @throws TikaConfigException if unknown keys are found
+     * @throws TikaConfigException if unknown pipes keys are found
      */
-    private void validateNoUnknownKeys() throws TikaConfigException {
+    private void validatePipesKeys() throws TikaConfigException {
+        JsonNode root = tikaJsonConfig.getRootNode();
         Iterator<String> fieldNames = root.fieldNames();
         while (fieldNames.hasNext()) {
             String key = fieldNames.next();
-            if (!KNOWN_ROOT_KEYS.contains(key) && !key.startsWith("x-")) {
-                throw new TikaConfigException("Unknown config key: '" + key +
-                        "'. Valid keys: " + KNOWN_ROOT_KEYS + " (or use 'x-' 
prefix for custom keys)");
+
+            // Ignore core Tika keys - TikaLoader validates those
+            if (CORE_TIKA_KEYS.contains(key)) {
+                continue;
+            }
+
+            // Ignore custom extension keys
+            if (key.startsWith("x-")) {
+                continue;
+            }
+
+            // Must be a known pipes key
+            if (!PIPES_KEYS.contains(key)) {
+                throw new TikaConfigException("Unknown pipes config key: '" + 
key +
+                        "'. Valid pipes keys: " + PIPES_KEYS +
+                        " (or use 'x-' prefix for custom keys). " +
+                        "Core Tika keys like 'parsers', 'detectors' should be 
configured separately.");
             }
         }
     }
diff --git 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java
 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java
index ac52d0da6..df23e078c 100644
--- 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java
+++ 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java
@@ -18,7 +18,6 @@ package org.apache.tika.plugins;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
@@ -31,31 +30,66 @@ import org.pf4j.ExtensionFinder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.tika.config.loader.TikaJsonConfig;
 import org.apache.tika.exception.TikaConfigException;
 
+/**
+ * PF4J-based plugin manager for Tika pipes components.
+ * <p>
+ * This manager loads plugins from configured plugin root directories and
+ * makes their extensions available for discovery.
+ */
 public class TikaPluginManager extends DefaultPluginManager {
 
-
     private static final Logger LOG = 
LoggerFactory.getLogger(TikaPluginManager.class);
 
-    public static TikaPluginManager load(Path p) throws TikaConfigException, 
IOException {
-        try (InputStream is = Files.newInputStream(p)) {
-            return load(is);
-        }
+    /**
+     * Loads plugin manager from a pre-parsed TikaJsonConfig.
+     * This is the preferred method when sharing configuration across
+     * core Tika and pipes components.
+     *
+     * @param tikaJsonConfig the pre-parsed JSON configuration
+     * @return the plugin manager
+     * @throws TikaConfigException if configuration is invalid
+     * @throws IOException if plugin initialization fails
+     */
+    public static TikaPluginManager load(TikaJsonConfig tikaJsonConfig)
+            throws TikaConfigException, IOException {
+        TikaConfigs tikaConfigs = TikaConfigs.load(tikaJsonConfig);
+        return load(tikaConfigs);
     }
 
-    public static TikaPluginManager load(InputStream is) throws 
TikaConfigException, IOException {
-        return load(TikaConfigs.load(is));
+    /**
+     * Loads plugin manager from a configuration file.
+     * For backwards compatibility - prefer {@link #load(TikaJsonConfig)} when 
possible.
+     *
+     * @param configPath the path to the JSON configuration file
+     * @return the plugin manager
+     * @throws TikaConfigException if configuration is invalid
+     * @throws IOException if reading or plugin initialization fails
+     */
+    public static TikaPluginManager load(Path configPath) throws 
TikaConfigException, IOException {
+        TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath);
+        return load(tikaJsonConfig);
     }
 
-    public static TikaPluginManager load(TikaConfigs tikaConfigs) throws 
TikaConfigException, IOException {
+    /**
+     * Loads plugin manager from a TikaConfigs instance.
+     *
+     * @param tikaConfigs the pipes configuration
+     * @return the plugin manager
+     * @throws TikaConfigException if configuration is invalid
+     * @throws IOException if plugin initialization fails
+     */
+    public static TikaPluginManager load(TikaConfigs tikaConfigs)
+            throws TikaConfigException, IOException {
         JsonNode root = tikaConfigs.getRoot();
         JsonNode pluginRoots = root.get("plugin-roots");
         if (pluginRoots == null) {
             throw new TikaConfigException("plugin-roots must be specified");
         }
-        List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots, 
new TypeReference<List<Path>>() {
-        });
+        List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots,
+                new TypeReference<List<Path>>() {});
         if (roots.isEmpty()) {
             throw new TikaConfigException("plugin-roots must not be empty");
         }
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index fe8464dd3..4ab35a0ae 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -36,9 +36,37 @@ import org.apache.tika.exception.TikaConfigException;
  * Parsed representation of a Tika JSON configuration file.
  * Provides access to component configurations by type (parsers, detectors, 
etc.).
  *
- * <p>JSON structure:
+ * <p>This class serves as the single source of truth for JSON parsing across
+ * core Tika (parsers, detectors) and tika-pipes (fetchers, emitters) 
components.
+ * It performs no validation - consumers validate only their own keys.
+ *
+ * <p><b>Unified Configuration Usage:</b>
+ * <pre>
+ * // Parse config once
+ * TikaJsonConfig jsonConfig = TikaJsonConfig.load(Paths.get("config.json"));
+ *
+ * // Load core Tika components (same classloader)
+ * TikaLoader tikaLoader = TikaLoader.load(jsonConfig);
+ * Parser parser = tikaLoader.loadParsers();
+ * Detector detector = tikaLoader.loadDetectors();
+ *
+ * // Load pipes/plugin components (different classloader)
+ * TikaPluginManager pluginManager = TikaPluginManager.load(jsonConfig);
+ * pluginManager.loadPlugins();
+ * pluginManager.startPlugins();
+ *
+ * // Extract config for plugins (crosses classloader boundary as string)
+ * JsonNode fetchersNode = jsonConfig.getRootNode().get("fetchers");
+ * if (fetchersNode != null) {
+ *     String fetcherConfigJson = fetchersNode.toString();
+ *     // Pass string to plugin - safe across classloader boundary
+ * }
+ * </pre>
+ *
+ * <p><b>JSON structure:</b>
  * <pre>
  * {
+ *   // Core Tika components (validated by TikaLoader)
  *   "parsers": [
  *     { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } },
  *     { "html-parser": { ... } },
@@ -48,7 +76,14 @@ import org.apache.tika.exception.TikaConfigException;
  *     { "mime-magic-detector": {} },
  *     { "zip-container-detector": { "maxDepth": 10 } }
  *   ],
- *   ...
+ *
+ *   // Pipes components (validated by TikaConfigs)
+ *   "plugin-roots": ["/path/to/plugins"],
+ *   "fetchers": [...],
+ *   "emitters": [...],
+ *
+ *   // Custom extensions (prefix with x-)
+ *   "x-my-custom-config": { ... }
  * }
  * </pre>
  *
@@ -247,6 +282,33 @@ public class TikaJsonConfig {
         return result;
     }
 
+    /**
+     * Deserializes a configuration value for the given key.
+     *
+     * @param key the configuration key
+     * @param clazz the target class
+     * @param <T> the type to deserialize to
+     * @return the deserialized value, or null if key doesn't exist
+     * @throws IOException if deserialization fails
+     */
+    public <T> T deserialize(String key, Class<T> clazz) throws IOException {
+        JsonNode node = rootNode.get(key);
+        if (node == null || node.isNull()) {
+            return null;
+        }
+        return OBJECT_MAPPER.treeToValue(node, clazz);
+    }
+
+    /**
+     * Checks if a configuration key exists.
+     *
+     * @param key the configuration key
+     * @return true if the key exists and is not null
+     */
+    public boolean hasKey(String key) {
+        return rootNode.has(key) && !rootNode.get(key).isNull();
+    }
+
     /**
      * Gets the ObjectMapper used for JSON processing.
      *

Reply via email to