This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch serialization-take2
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6e95d5b43d25b4f05574218da50cddb15adca9d7
Author: tallison <[email protected]>
AuthorDate: Wed Feb 25 20:14:56 2026 -0500

    simplify serialization, take 2
---
 tika-pipes/tika-pipes-api/pom.xml                  |  19 ++
 .../java/org/apache/tika/pipes/api/ParseMode.java  |   3 +
 .../tika/config/loader/ComponentInstantiator.java  | 157 +++++++++++++-
 .../config/loader/TikaObjectMapperFactory.java     |  29 +++
 .../tika/serialization/ComponentNameResolver.java  | 104 ++++++++-
 .../tika/serialization/ParseContextUtils.java      |  32 +--
 .../org/apache/tika/serialization/TikaModule.java  | 238 ++-------------------
 .../serdes/ParseContextDeserializer.java           |  47 +---
 .../serdes/ParseContextSerializer.java             |  37 +---
 9 files changed, 348 insertions(+), 318 deletions(-)

diff --git a/tika-pipes/tika-pipes-api/pom.xml 
b/tika-pipes/tika-pipes-api/pom.xml
index bf895ff91e..c37c5f9388 100644
--- a/tika-pipes/tika-pipes-api/pom.xml
+++ b/tika-pipes/tika-pipes-api/pom.xml
@@ -49,9 +49,28 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-annotation-processor</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
   </dependencies>
   <build>
     <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <annotationProcessorPaths>
+            <path>
+              <groupId>${project.groupId}</groupId>
+              <artifactId>tika-annotation-processor</artifactId>
+              <version>${project.version}</version>
+            </path>
+          </annotationProcessorPaths>
+        </configuration>
+      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
diff --git 
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
 
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
index e6127d5005..f90ae7a29a 100644
--- 
a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
+++ 
b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
@@ -18,12 +18,15 @@ package org.apache.tika.pipes.api;
 
 import java.util.Locale;
 
+import org.apache.tika.config.TikaComponent;
+
 /**
  * Controls how embedded documents are handled during parsing.
  * <p>
  * This can be set as a default in PipesConfig (loaded from tika-config.json)
  * or overridden per-file via ParseContext.
  */
+@TikaComponent(name = "parse-mode")
 public enum ParseMode {
 
     /**
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
index 39860928bd..f82daa581c 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
@@ -18,13 +18,22 @@ package org.apache.tika.config.loader;
 
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
+import java.util.HashSet;
+import java.util.Set;
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.JsonConfig;
+import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.serialization.ComponentNameResolver;
 import org.apache.tika.utils.ServiceLoaderUtils;
 
@@ -123,12 +132,13 @@ public class ComponentInstantiator {
                 // No JsonConfig constructor, fall back to other methods
             }
 
-            // Fall back to Jackson bean deserialization or zero-arg 
constructor
+            // Fall back to no-arg constructor + Jackson bean deserialization 
(readerForUpdating)
+            // Using readerForUpdating preserves defaults from the no-arg 
constructor,
+            // unlike treeToValue which would null out unspecified fields.
             T component;
-            if (configNode == null || configNode.isEmpty()) {
-                component = (T) 
componentClass.getDeclaredConstructor().newInstance();
-            } else {
-                component = (T) objectMapper.treeToValue(configNode, 
componentClass);
+            component = (T) 
componentClass.getDeclaredConstructor().newInstance();
+            if (configNode != null && !configNode.isEmpty()) {
+                
objectMapper.readerForUpdating(component).readValue(configNode);
             }
 
             // Call initialize() on Initializable components
@@ -170,6 +180,143 @@ public class ComponentInstantiator {
         }
     }
 
+    /**
+     * Instantiates a Tika component with full special-case handling.
+     * <p>
+     * This is the primary entry point for component instantiation from JSON configuration.
+     * Performs, in this order:
+     * <ul>
+     *   <li>Type resolution via {@link ComponentNameResolver#resolveClass}</li>
+     *   <li>Type compatibility validation against expectedType</li>
+     *   <li>Special cases: DefaultParser/DefaultDetector rejection, MimeTypes singleton</li>
+     *   <li>{@code _mime-include}/{@code _mime-exclude} extraction and stripping</li>
+     *   <li>Instantiation: the no-arg constructor when the stripped config is null or empty;
+     *       otherwise the JsonConfig constructor if the class has one, else the no-arg
+     *       constructor followed by readerForUpdating</li>
+     *   <li>{@link Initializable#initialize()} callback</li>
+     *   <li>Parser MIME filter wrapping</li>
+     * </ul>
+     *
+     * @param <T> the type the caller expects back; the result is returned through an
+     *            unchecked cast, so only expectedType is verified at runtime
+     * @param typeName the component type name (friendly name or FQCN) as accepted by
+     *                 {@link ComponentNameResolver#resolveClass}
+     * @param configNode the JSON configuration node (may be null)
+     * @param mapper the ObjectMapper for deserialization
+     * @param classLoader the class loader for name resolution
+     * @param expectedType the expected interface/base type (for validation), or null to skip
+     * @return the instantiated component; a Parser is returned wrapped with MIME filters when
+     *         include or exclude types were configured
+     * @throws TikaConfigException if the type cannot be resolved, is not assignable to
+     *         expectedType, is DefaultParser or DefaultDetector, or instantiation fails
+     */
+    @SuppressWarnings("unchecked")
+    public static <T> T instantiateComponent(String typeName, JsonNode 
configNode,
+                                              ObjectMapper mapper, ClassLoader 
classLoader,
+                                              Class<?> expectedType)
+            throws TikaConfigException {
+        // Resolve the class using ComponentNameResolver
+        Class<?> clazz;
+        try {
+            clazz = ComponentNameResolver.resolveClass(typeName, classLoader);
+        } catch (ClassNotFoundException e) {
+            throw new TikaConfigException("Unknown type: " + typeName, e);
+        }
+
+        // Verify type compatibility (skipped when the caller passes no expectedType)
+        if (expectedType != null && !expectedType.isAssignableFrom(clazz)) {
+            throw new TikaConfigException("Type " + typeName + " (" + 
clazz.getName() +
+                    ") is not assignable to " + expectedType.getName());
+        }
+
+        // DefaultParser and DefaultDetector must be loaded via TikaLoader
+        if (clazz == DefaultParser.class) {
+            throw new TikaConfigException("DefaultParser must be loaded via 
TikaLoader, not " +
+                    "directly via Jackson deserialization. Use 
TikaLoader.load() to load configuration.");
+        } else if (clazz == DefaultDetector.class) {
+            throw new TikaConfigException("DefaultDetector must be loaded via 
TikaLoader, not " +
+                    "directly via Jackson deserialization. Use 
TikaLoader.load() to load configuration.");
+        }
+
+        // Extract mime filter fields before stripping them
+        Set<MediaType> includeTypes = extractMimeTypes(configNode, 
"_mime-include");
+        Set<MediaType> excludeTypes = extractMimeTypes(configNode, 
"_mime-exclude");
+
+        // Strip decorator fields before passing to component
+        JsonNode cleanedConfig = stripDecoratorFields(configNode);
+
+        try {
+            Object instance;
+
+            if (clazz == MimeTypes.class) {
+                // MimeTypes must use the singleton to have all type definitions loaded
+                instance = MimeTypes.getDefaultMimeTypes();
+            } else if (cleanedConfig == null || cleanedConfig.isEmpty()) {
+                // If no config, use default constructor
+                instance = clazz.getDeclaredConstructor().newInstance();
+            } else {
+                // Try JsonConfig constructor first
+                Constructor<?> jsonConfigCtor = 
findJsonConfigConstructor(clazz);
+                if (jsonConfigCtor != null) {
+                    // Use plain JSON mapper since the main mapper may be binary (Smile)
+                    String json = TikaObjectMapperFactory.getPlainMapper()
+                            .writeValueAsString(cleanedConfig);
+                    instance = jsonConfigCtor.newInstance((JsonConfig) () -> 
json);
+                } else {
+                    // Fall back to no-arg constructor + Jackson bean deserialization;
+                    // updating the fresh instance in place keeps constructor defaults
+                    // for any field the config does not mention.
+                    instance = clazz.getDeclaredConstructor().newInstance();
+                    
mapper.readerForUpdating(instance).readValue(cleanedConfig);
+                }
+            }
+
+            // Call initialize() on Initializable components
+            initializeIfNeeded(instance);
+
+            // Wrap parser with mime filtering if include/exclude types specified.
+            // This happens after initialization, so the decorator wraps a ready instance.
+            if (instance instanceof Parser && (!includeTypes.isEmpty() || 
!excludeTypes.isEmpty())) {
+                instance = ParserDecorator.withMimeFilters(
+                        (Parser) instance, includeTypes, excludeTypes);
+            }
+
+            return (T) instance;
+
+        } catch (TikaConfigException e) {
+            // Already carries a configuration-level message; do not re-wrap.
+            throw e;
+        } catch (Exception e) {
+            throw new TikaConfigException("Failed to instantiate: " + 
typeName, e);
+        }
+    }
+
+    /**
+     * Reads a list of media types (for example {@code _mime-include}) from a config node.
+     * <p>
+     * Returns an empty set when the config node is null, the field is absent, or the field
+     * is not a JSON array.
+     *
+     * @param configNode the raw component configuration (may be null)
+     * @param fieldName the name of the array field holding media type strings
+     * @return a mutable set of the parsed media types, never null
+     * @throws TikaConfigException if an array entry cannot be parsed as a media type
+     */
+    private static Set<MediaType> extractMimeTypes(JsonNode configNode, String fieldName)
+            throws TikaConfigException {
+        Set<MediaType> types = new HashSet<>();
+        if (configNode == null || !configNode.has(fieldName)) {
+            return types;
+        }
+        JsonNode arrayNode = configNode.get(fieldName);
+        if (arrayNode.isArray()) {
+            for (JsonNode typeNode : arrayNode) {
+                // MediaType.parse reports an unparseable string by returning null. Adding
+                // that null would leave a non-empty filter set that matches nothing, so
+                // fail fast with a message that names the offending entry instead.
+                MediaType type = MediaType.parse(typeNode.asText());
+                if (type == null) {
+                    throw new TikaConfigException("Invalid media type in '" + fieldName +
+                            "': " + typeNode.asText());
+                }
+                types.add(type);
+            }
+        }
+        return types;
+    }
+
+    /**
+     * Looks up the public constructor that takes a single {@link JsonConfig} argument.
+     *
+     * @param clazz the component class to inspect
+     * @return the matching constructor, or null when the class has no such public constructor
+     */
+    private static Constructor<?> findJsonConfigConstructor(Class<?> clazz) {
+        Constructor<?> jsonConfigCtor = null;
+        try {
+            jsonConfigCtor = clazz.getConstructor(JsonConfig.class);
+        } catch (NoSuchMethodException ignored) {
+            // A missing constructor is an expected outcome and is reported as null.
+        }
+        return jsonConfigCtor;
+    }
+
+    /**
+     * Produces the configuration that is handed to the component itself, i.e. the config
+     * without the decorator fields {@code _mime-include} and {@code _mime-exclude}.
+     * <p>
+     * Those two fields drive MIME filter wrapping and are not component properties. A null
+     * or non-object node is returned as-is; an object node is deep-copied first, so the
+     * node passed in is never modified.
+     * Note: _exclude is NOT stripped as it's used by DefaultParser for SPI exclusions.
+     *
+     * @param configNode the raw component configuration (may be null)
+     * @return the same node when it is null or not an object, otherwise a stripped copy
+     */
+    private static JsonNode stripDecoratorFields(JsonNode configNode) {
+        boolean isObjectConfig = configNode != null && configNode.isObject();
+        if (isObjectConfig) {
+            ObjectNode withoutDecorators = configNode.deepCopy();
+            withoutDecorators.remove("_mime-exclude");
+            withoutDecorators.remove("_mime-include");
+            return withoutDecorators;
+        }
+        return configNode;
+    }
+
     /**
      * Checks if the JsonConfig contains actual configuration (non-empty JSON 
object with fields).
      *
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
index e832dc8d4b..0a24e71705 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
@@ -55,6 +55,35 @@ public class TikaObjectMapperFactory {
 
     private static ObjectMapper MAPPER = null;
 
+    // Shared plain ObjectMapper (no TikaModule) for converting JsonNodes to JSON strings.
+    // Needed because the main mapper may use a binary format (e.g., Smile)
+    // which doesn't support writeValueAsString().
+    // Created once at class-initialization time and configured only in the static block below.
+    private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
+
+    static {
+        // Components with no bean properties (e.g., parsers with no configuration)
+        // need to serialize as empty objects rather than throwing.
+        PLAIN_MAPPER.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
+    }
+
+    /**
+     * Returns a shared plain ObjectMapper without TikaModule registration.
+     * <p>
+     * This mapper is suitable for:
+     * <ul>
+     *   <li>Converting JsonNodes to JSON strings</li>
+     *   <li>Serializing component properties without compact format wrapping</li>
+     *   <li>Avoiding infinite recursion when serializing inside TikaModule</li>
+     * </ul>
+     * <p>
+     * Has {@code FAIL_ON_EMPTY_BEANS} disabled to allow serialization of classes with no
+     * properties.
+     * <p>
+     * Every call returns the same instance. NOTE(review): callers should treat it as
+     * read-only; reconfiguring it (enabling features, registering modules) would change
+     * behavior for every other user of this mapper.
+     *
+     * @return the shared plain ObjectMapper
+     */
+    public static ObjectMapper getPlainMapper() {
+        return PLAIN_MAPPER;
+    }
+
     public static synchronized ObjectMapper getMapper() {
         if (MAPPER == null) {
             MAPPER = createMapper();
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
index 195cfd6df0..b1e1d6673a 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
@@ -17,6 +17,7 @@
 package org.apache.tika.serialization;
 
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -24,7 +25,19 @@ import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.tika.config.loader.ComponentInfo;
 import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.digest.DigesterFactory;
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.UnpackSelector;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.sax.ContentHandlerDecoratorFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
 
 /**
  * Utility class that resolves friendly component names to classes using 
ComponentRegistry.
@@ -37,6 +50,29 @@ import org.apache.tika.exception.TikaConfigException;
  */
 public final class ComponentNameResolver {
 
+    /**
+     * Interfaces that use compact format serialization and serve as ParseContext keys.
+     * Types implementing these interfaces will be serialized as:
+     * - "type-name" for defaults
+     * - {"type-name": {...}} for configured instances
+     * <p>
+     * NOTE(review): this is a HashSet, so its iteration order is unspecified. Lookups that
+     * return the first matching interface (findContextKeyInterface) therefore give an
+     * unspecified result for a type implementing more than one of these interfaces --
+     * confirm that no registered component does, or switch to an ordered set.
+     */
+    private static final Set<Class<?>> CONTEXT_KEY_INTERFACES = new 
HashSet<>();
+
+    static {
+        // Populated once at class initialization; no code in this class adds to it afterwards.
+        CONTEXT_KEY_INTERFACES.add(Parser.class);
+        CONTEXT_KEY_INTERFACES.add(Detector.class);
+        CONTEXT_KEY_INTERFACES.add(EncodingDetector.class);
+        CONTEXT_KEY_INTERFACES.add(MetadataFilter.class);
+        CONTEXT_KEY_INTERFACES.add(Translator.class);
+        CONTEXT_KEY_INTERFACES.add(Renderer.class);
+        CONTEXT_KEY_INTERFACES.add(DigesterFactory.class);
+        CONTEXT_KEY_INTERFACES.add(EmbeddedDocumentExtractorFactory.class);
+        CONTEXT_KEY_INTERFACES.add(MetadataWriteLimiterFactory.class);
+        CONTEXT_KEY_INTERFACES.add(ContentHandlerDecoratorFactory.class);
+        CONTEXT_KEY_INTERFACES.add(ContentHandlerFactory.class);
+        CONTEXT_KEY_INTERFACES.add(UnpackSelector.class);
+    }
+
     private static final Map<String, ComponentRegistry> REGISTRIES = new 
ConcurrentHashMap<>();
 
     // Component configuration storage (keyed by JSON field name and by 
component class)
@@ -77,7 +113,10 @@ public final class ComponentNameResolver {
                 }
             }
         }
-        return Class.forName(name, false, classLoader);
+        throw new ClassNotFoundException(
+                "Component '" + name + "' is not registered. " +
+                "Components must be registered via @TikaComponent annotation 
or .idx file. " +
+                "Arbitrary class names are not allowed for security reasons.");
     }
 
     /**
@@ -204,6 +243,69 @@ public final class ComponentNameResolver {
         return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet());
     }
 
+    // ==================== Context Key Resolution Methods ====================
+
+    /**
+     * Exposes the interfaces whose implementations use compact format serialization.
+     * The returned set is a read-only view; attempts to modify it fail.
+     *
+     * @return unmodifiable set of context key interfaces
+     */
+    public static Set<Class<?>> getContextKeyInterfaces() {
+        Set<Class<?>> readOnlyView = Collections.unmodifiableSet(CONTEXT_KEY_INTERFACES);
+        return readOnlyView;
+    }
+
+    /**
+     * Looks up the context key interface that the given type is assignable to.
+     * The result is the interface under which instances of the type are stored in a
+     * ParseContext. The first registered interface that matches, in the set's iteration
+     * order, wins.
+     *
+     * @param type the type to find the context key for
+     * @return the interface to use as context key, or null if none found
+     */
+    public static Class<?> findContextKeyInterface(Class<?> type) {
+        Class<?> match = null;
+        for (Class<?> candidate : CONTEXT_KEY_INTERFACES) {
+            if (candidate.isAssignableFrom(type)) {
+                match = candidate;
+                break;
+            }
+        }
+        return match;
+    }
+
+    /**
+     * Tells whether a type is serialized in compact format, which is the case exactly
+     * when {@link #findContextKeyInterface(Class)} finds a context key interface for it.
+     *
+     * @param type the type to check
+     * @return true if the type uses compact format
+     */
+    public static boolean usesCompactFormat(Class<?> type) {
+        Class<?> contextKeyInterface = findContextKeyInterface(type);
+        return contextKeyInterface != null;
+    }
+
+    /**
+     * Picks the class under which a component is stored in a ParseContext.
+     * <p>
+     * The first of the following that yields a value is used:
+     * <ol>
+     *   <li>The explicit contextKey carried by the component info</li>
+     *   <li>A context key interface implemented by the component class
+     *       (see {@link #findContextKeyInterface(Class)})</li>
+     *   <li>The component class itself</li>
+     * </ol>
+     *
+     * @param info the component info
+     * @return the class to use as ParseContext key
+     */
+    public static Class<?> determineContextKey(ComponentInfo info) {
+        if (info.contextKey() == null) {
+            // No explicit key: prefer a known interface, else fall back to the class.
+            Class<?> detected = findContextKeyInterface(info.componentClass());
+            return detected == null ? info.componentClass() : detected;
+        }
+        return info.contextKey();
+    }
+
     /**
      * Gets the contextKey for a class from the component registry.
      * The contextKey is recorded in the .idx file by the annotation processor.
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index c95b9fe275..c563b47646 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -140,7 +140,7 @@ public class ParseContextUtils {
             }
 
             // Determine the context key
-            Class<?> contextKey = determineContextKey(info);
+            Class<?> contextKey = 
ComponentNameResolver.determineContextKey(info);
 
             try {
                 // Deserialize and cache in resolvedConfigs, also add to 
context
@@ -157,36 +157,6 @@ public class ParseContextUtils {
         }
     }
 
-    /**
-     * Determines the ParseContext key for a component.
-     * <p>
-     * Resolution order:
-     * <ol>
-     *   <li>Explicit contextKey from .idx file (via @TikaComponent 
annotation)</li>
-     *   <li>Auto-detect from implemented interfaces (using 
TikaModule.COMPACT_FORMAT_INTERFACES)</li>
-     *   <li>Fall back to the component class itself</li>
-     * </ol>
-     * <p>
-     * Security note: This only determines the context key - it does NOT 
affect which
-     * classes can be instantiated. Classes must still be registered via 
@TikaComponent.
-     *
-     * @param info the component info
-     * @return the class to use as ParseContext key
-     */
-    private static Class<?> determineContextKey(ComponentInfo info) {
-        // Use explicit contextKey from .idx file if specified
-        if (info.contextKey() != null) {
-            return info.contextKey();
-        }
-        // Auto-detect from implemented interfaces at runtime
-        Class<?> contextKeyInterface = 
TikaModule.findContextKeyInterface(info.componentClass());
-        if (contextKeyInterface != null) {
-            return contextKeyInterface;
-        }
-        // Fall back to the component class itself
-        return info.componentClass();
-    }
-
     /**
      * Resolves an array config entry (e.g., "metadata-filters") to a 
composite component.
      * <p>
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index 8277632830..63ea711796 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -17,10 +17,8 @@
 package org.apache.tika.serialization;
 
 import java.io.IOException;
-import java.lang.reflect.Constructor;
 import java.lang.reflect.Method;
 import java.lang.reflect.Modifier;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
@@ -36,36 +34,21 @@ import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.JsonSerializer;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializationConfig;
-import com.fasterxml.jackson.databind.SerializationFeature;
 import com.fasterxml.jackson.databind.SerializerProvider;
 import com.fasterxml.jackson.databind.deser.Deserializers;
 import com.fasterxml.jackson.databind.module.SimpleModule;
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import com.fasterxml.jackson.databind.ser.Serializers;
 
-import org.apache.tika.config.Initializable;
-import org.apache.tika.config.JsonConfig;
-import org.apache.tika.config.SelfConfiguring;
+import org.apache.tika.config.loader.ComponentInstantiator;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
 import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.digest.DigesterFactory;
 import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
-import org.apache.tika.extractor.UnpackSelector;
-import org.apache.tika.language.translate.Translator;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.filter.MetadataFilter;
-import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.renderer.Renderer;
-import org.apache.tika.sax.ContentHandlerDecoratorFactory;
-import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.serialization.serdes.DefaultDetectorSerializer;
 import org.apache.tika.serialization.serdes.DefaultParserSerializer;
 import org.apache.tika.serialization.serdes.MetadataDeserializer;
@@ -91,64 +74,6 @@ public class TikaModule extends SimpleModule {
 
     private static ObjectMapper sharedMapper;
 
-    // Plain JSON mapper for converting JsonNodes to JSON strings.
-    // This is needed because the main mapper may use a binary format (e.g., 
Smile)
-    // which doesn't support writeValueAsString().
-    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();
-
-    /**
-     * Interfaces that use compact format serialization.
-     * Types implementing these interfaces will be serialized as:
-     * - "type-name" for defaults
-     * - {"type-name": {...}} for configured instances
-     */
-    private static final Set<Class<?>> COMPACT_FORMAT_INTERFACES = new 
HashSet<>();
-
-    static {
-        // Core component interfaces that use compact format
-        COMPACT_FORMAT_INTERFACES.add(Parser.class);
-        COMPACT_FORMAT_INTERFACES.add(Detector.class);
-        COMPACT_FORMAT_INTERFACES.add(EncodingDetector.class);
-        COMPACT_FORMAT_INTERFACES.add(MetadataFilter.class);
-        COMPACT_FORMAT_INTERFACES.add(Translator.class);
-        COMPACT_FORMAT_INTERFACES.add(Renderer.class);
-        COMPACT_FORMAT_INTERFACES.add(DigesterFactory.class);
-        COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class);
-        COMPACT_FORMAT_INTERFACES.add(MetadataWriteLimiterFactory.class);
-        COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class);
-        COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class);
-        COMPACT_FORMAT_INTERFACES.add(UnpackSelector.class);
-    }
-
-    /**
-     * Checks if a type should use compact format serialization.
-     * Returns true if the type implements any of the registered compact 
format interfaces.
-     */
-    private static boolean usesCompactFormat(Class<?> type) {
-        return findContextKeyInterface(type) != null;
-    }
-
-    /**
-     * Finds the appropriate context key interface for a given type.
-     * This is used to determine which interface should be used as the 
ParseContext key
-     * when storing instances of this type.
-     * <p>
-     * Security note: This method only helps determine the context key - it 
does NOT
-     * affect which classes can be instantiated. Classes must still be 
registered
-     * via @TikaComponent to be deserializable.
-     *
-     * @param type the type to find the context key for
-     * @return the interface to use as context key, or null if none found
-     */
-    public static Class<?> findContextKeyInterface(Class<?> type) {
-        for (Class<?> iface : COMPACT_FORMAT_INTERFACES) {
-            if (iface.isAssignableFrom(type)) {
-                return iface;
-            }
-        }
-        return null;
-    }
-
     public TikaModule() {
         super("TikaModule");
 
@@ -220,7 +145,8 @@ public class TikaModule extends SimpleModule {
             // Concrete implementations (like ExternalParser, HtmlParser) 
should use normal
             // Jackson bean deserialization for their properties.
             if (rawClass.isInterface() || 
Modifier.isAbstract(rawClass.getModifiers())) {
-                if (COMPACT_FORMAT_INTERFACES.contains(rawClass) || 
usesCompactFormat(rawClass)) {
+                if 
(ComponentNameResolver.getContextKeyInterfaces().contains(rawClass) ||
+                        ComponentNameResolver.usesCompactFormat(rawClass)) {
                     return new TikaComponentDeserializer(rawClass);
                 }
             }
@@ -253,7 +179,8 @@ public class TikaModule extends SimpleModule {
 
             // Only serialize with compact format if type implements a compact 
format interface
             // AND has a registered friendly name
-            if (usesCompactFormat(rawClass) && 
ComponentNameResolver.getFriendlyName(rawClass) != null) {
+            if (ComponentNameResolver.usesCompactFormat(rawClass) &&
+                    ComponentNameResolver.getFriendlyName(rawClass) != null) {
                 return new TikaComponentSerializer();
             }
 
@@ -263,6 +190,7 @@ public class TikaModule extends SimpleModule {
 
     /**
      * Deserializer that handles both string and object formats for Tika 
components.
+     * Delegates to {@link ComponentInstantiator#instantiateComponent} for 
instantiation.
      */
     private static class TikaComponentDeserializer extends 
JsonDeserializer<Object> {
         private final Class<?> expectedType;
@@ -281,14 +209,15 @@ public class TikaModule extends SimpleModule {
                         "Call TikaModule.setSharedMapper() before 
deserializing.");
             }
 
+            String typeName;
+            JsonNode configNode;
+
             if (node.isTextual()) {
-                // Simple string format: "pdf-parser"
-                String typeName = node.asText();
-                return instantiate(typeName, null, mapper);
+                typeName = node.asText();
+                configNode = null;
             } else if (node.isObject()) {
                 Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
                 if (!fields.hasNext()) {
-                    // Empty object {} - try to create default instance if 
expectedType is concrete
                     try {
                         return 
expectedType.getDeclaredConstructor().newInstance();
                     } catch (ReflectiveOperationException e) {
@@ -297,136 +226,19 @@ public class TikaModule extends SimpleModule {
                     }
                 }
                 Map.Entry<String, JsonNode> entry = fields.next();
-                return instantiate(entry.getKey(), entry.getValue(), mapper);
+                typeName = entry.getKey();
+                configNode = entry.getValue();
             } else {
                 throw new IOException("Expected string or object for " +
                         expectedType.getSimpleName() + ", got: " + 
node.getNodeType());
             }
-        }
-
-        private Object instantiate(String typeName, JsonNode configNode, 
ObjectMapper mapper) throws IOException {
-            // Resolve the class using ComponentNameResolver
-            Class<?> clazz;
-            try {
-                clazz = ComponentNameResolver.resolveClass(typeName,
-                        Thread.currentThread().getContextClassLoader());
-            } catch (ClassNotFoundException e) {
-                throw new IOException("Unknown type: " + typeName, e);
-            }
-
-            // Verify type compatibility
-            if (!expectedType.isAssignableFrom(clazz)) {
-                throw new IOException("Type " + typeName + " (" + 
clazz.getName() +
-                        ") is not assignable to " + expectedType.getName());
-            }
-
-            // Extract mime filter fields before stripping them
-            Set<MediaType> includeTypes = extractMimeTypes(configNode, 
"_mime-include");
-            Set<MediaType> excludeTypes = extractMimeTypes(configNode, 
"_mime-exclude");
-
-            // Strip decorator fields before passing to component
-            JsonNode cleanedConfig = stripDecoratorFields(configNode);
 
             try {
-                Object instance;
-
-                // DefaultParser and DefaultDetector must be loaded via 
TikaLoader for proper dependency injection
-                if (clazz == DefaultParser.class) {
-                    throw new IOException("DefaultParser must be loaded via 
TikaLoader, not directly " +
-                            "via Jackson deserialization. Use 
TikaLoader.load() to load configuration.");
-                } else if (clazz == DefaultDetector.class) {
-                    throw new IOException("DefaultDetector must be loaded via 
TikaLoader, not directly " +
-                            "via Jackson deserialization. Use 
TikaLoader.load() to load configuration.");
-                } else if (clazz == MimeTypes.class) {
-                    // MimeTypes must use the singleton to have all type 
definitions loaded
-                    instance = MimeTypes.getDefaultMimeTypes();
-                } else if (cleanedConfig == null || cleanedConfig.isEmpty()) {
-                    // If no config, use default constructor
-                    instance = clazz.getDeclaredConstructor().newInstance();
-                } else {
-                    // Try JsonConfig constructor first (works for any 
component)
-                    Constructor<?> jsonConfigCtor = 
findJsonConfigConstructor(clazz);
-                    if (jsonConfigCtor != null) {
-                        // Use plain JSON mapper since the main mapper may be 
binary (Smile)
-                        String json = 
JSON_MAPPER.writeValueAsString(cleanedConfig);
-                        instance = jsonConfigCtor.newInstance((JsonConfig) () 
-> json);
-                    } else {
-                        // Fall back to no-arg constructor + Jackson bean 
deserialization
-                        instance = 
clazz.getDeclaredConstructor().newInstance();
-                        
mapper.readerForUpdating(instance).readValue(cleanedConfig);
-                    }
-                }
-
-                // Call initialize() on Initializable components
-                if (instance instanceof Initializable) {
-                    try {
-                        ((Initializable) instance).initialize();
-                    } catch (TikaConfigException e) {
-                        throw new IOException("Failed to initialize " + 
typeName, e);
-                    }
-                }
-
-                // Wrap parser with mime filtering if include/exclude types 
specified
-                if (instance instanceof Parser && (!includeTypes.isEmpty() || 
!excludeTypes.isEmpty())) {
-                    instance = ParserDecorator.withMimeFilters((Parser) 
instance, includeTypes, excludeTypes);
-                }
-
-                return instance;
-
-            } catch (ReflectiveOperationException e) {
-                throw new IOException("Failed to instantiate: " + typeName, e);
-            }
-        }
-
-        private Set<MediaType> extractMimeTypes(JsonNode configNode, String 
fieldName) {
-            Set<MediaType> types = new HashSet<>();
-            if (configNode == null || !configNode.has(fieldName)) {
-                return types;
-            }
-            JsonNode arrayNode = configNode.get(fieldName);
-            if (arrayNode.isArray()) {
-                for (JsonNode typeNode : arrayNode) {
-                    types.add(MediaType.parse(typeNode.asText()));
-                }
-            }
-            return types;
-        }
-
-        private Constructor<?> findJsonConfigConstructor(Class<?> clazz) {
-            try {
-                return clazz.getConstructor(JsonConfig.class);
-            } catch (NoSuchMethodException e) {
-                return null;
-            }
-        }
-
-        /**
-         * Deserializes a JsonNode using a dedicated deserializer.
-         */
-        private <T> T deserializeWithNode(JsonDeserializer<T> deserializer, 
JsonNode node,
-                                          ObjectMapper mapper) throws 
IOException {
-            if (node == null) {
-                node = mapper.createObjectNode();
-            }
-            try (JsonParser p = mapper.treeAsTokens(node)) {
-                p.nextToken();
-                return deserializer.deserialize(p, 
mapper.getDeserializationContext());
-            }
-        }
-
-        /**
-         * Strips decorator fields (_mime-include, _mime-exclude) from config 
node.
-         * These fields are handled by TikaLoader for wrapping, not by the 
component itself.
-         * Note: _exclude is NOT stripped as it's used by DefaultParser for 
SPI exclusions.
-         */
-        private JsonNode stripDecoratorFields(JsonNode configNode) {
-            if (configNode == null || !configNode.isObject()) {
-                return configNode;
+                return ComponentInstantiator.instantiateComponent(typeName, 
configNode,
+                        mapper, 
Thread.currentThread().getContextClassLoader(), expectedType);
+            } catch (TikaConfigException e) {
+                throw new IOException(e.getMessage(), e);
             }
-            ObjectNode cleaned = configNode.deepCopy();
-            cleaned.remove("_mime-include");
-            cleaned.remove("_mime-exclude");
-            return cleaned;
         }
     }
 
@@ -435,12 +247,8 @@ public class TikaModule extends SimpleModule {
      * Outputs simple string if using defaults, object with type key if configured.
      */
     private static class TikaComponentSerializer extends 
JsonSerializer<Object> {
-        // Plain mapper for serializing without TikaModule (avoids infinite 
recursion)
-        private final ObjectMapper plainMapper;
 
         TikaComponentSerializer() {
-            this.plainMapper = new ObjectMapper();
-            this.plainMapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
         }
 
         @Override
@@ -507,8 +315,8 @@ public class TikaModule extends SimpleModule {
                     // Create default config to compare against
                     Object defaultConfig = 
config.getClass().getDeclaredConstructor().newInstance();
 
-                    ObjectNode configNode = plainMapper.valueToTree(config);
-                    ObjectNode defaultNode = 
plainMapper.valueToTree(defaultConfig);
+                    ObjectNode configNode = 
TikaObjectMapperFactory.getPlainMapper().valueToTree(config);
+                    ObjectNode defaultNode = 
TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultConfig);
 
                     // Only keep properties that differ from defaults
                     ObjectNode result = mapper.createObjectNode();
@@ -525,10 +333,10 @@ public class TikaModule extends SimpleModule {
                     // No config object - serialize the component directly
                     Object defaultInstance = 
value.getClass().getDeclaredConstructor().newInstance();
 
-                    ObjectNode valueNode = plainMapper.valueToTree(value);
-                    ObjectNode defaultNode = 
plainMapper.valueToTree(defaultInstance);
+                    ObjectNode valueNode = 
TikaObjectMapperFactory.getPlainMapper().valueToTree(value);
+                    ObjectNode defaultNode = 
TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultInstance);
 
-                    ObjectNode result = plainMapper.createObjectNode();
+                    ObjectNode result = 
TikaObjectMapperFactory.getPlainMapper().createObjectNode();
                     Iterator<Map.Entry<String, JsonNode>> fields = 
valueNode.fields();
                     while (fields.hasNext()) {
                         Map.Entry<String, JsonNode> field = fields.next();
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index c8141c47d9..3e526f7b88 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -34,9 +34,9 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.loader.ComponentInfo;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.serialization.ComponentNameResolver;
-import org.apache.tika.serialization.TikaModule;
 
 /**
  * Deserializes ParseContext from JSON.
@@ -61,10 +61,9 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
 
     private static final Logger LOG = 
LoggerFactory.getLogger(ParseContextDeserializer.class);
 
-    // Plain JSON mapper for converting JsonNodes to JSON strings.
-    // This is needed because the main mapper may use a binary format (e.g., 
Smile)
-    // which doesn't support writeValueAsString().
-    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();
+    private static ObjectMapper plainMapper() {
+        return TikaObjectMapperFactory.getPlainMapper();
+    }
 
     @Override
     public ParseContext deserialize(JsonParser jsonParser, 
DeserializationContext ctxt)
@@ -120,7 +119,7 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
 
                 // Store as JSON config for lazy resolution
                 // Use plain JSON mapper since the main mapper may be binary (Smile)
-                String json = JSON_MAPPER.writeValueAsString(value);
+                String json = plainMapper().writeValueAsString(value);
                 parseContext.setJsonConfig(name, json);
             }
         }
@@ -128,21 +127,6 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
         return parseContext;
     }
 
-    /**
-     * Determines the context key for a component.
-     * Uses explicit contextKey if available, otherwise auto-detects from 
interfaces.
-     */
-    private static Class<?> determineContextKey(ComponentInfo info) {
-        if (info.contextKey() != null) {
-            return info.contextKey();
-        }
-        Class<?> interfaceKey = 
TikaModule.findContextKeyInterface(info.componentClass());
-        if (interfaceKey != null) {
-            return interfaceKey;
-        }
-        return info.componentClass();
-    }
-
     /**
      * Checks if a JSON config entry would create a duplicate context key.
      * <p>
@@ -172,7 +156,7 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
             return;
         }
 
-        Class<?> contextKey = determineContextKey(info);
+        Class<?> contextKey = ComponentNameResolver.determineContextKey(info);
 
         String existingName = seenContextKeys.get(contextKey);
         if (existingName != null) {
@@ -215,25 +199,16 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
                 contextKeyClass = info.contextKey();
             }
 
-            // If not found in registry, try as fully qualified class name
+            // If not found in registry, reject — components must be registered
             if (configClass == null) {
-                try {
-                    configClass = Class.forName(componentName);
-                    // Check if the class has a contextKey via its annotation
-                    contextKeyClass = 
ComponentNameResolver.getContextKey(configClass);
-                } catch (ClassNotFoundException e) {
-                    LOG.warn("Could not find class for typed component '{}', 
storing as JSON config",
-                            componentName);
-                    // Fall back to storing as JSON config (use plain JSON 
mapper)
-                    parseContext.setJsonConfig(componentName, 
JSON_MAPPER.writeValueAsString(configNode));
-                    continue;
-                }
+                throw new IOException("Unknown typed component '" + 
componentName + "'. " +
+                        "Components must be registered via @TikaComponent 
annotation or .idx file.");
             }
 
             // Determine context key: explicit > interface detection > class itself
             Class<?> parseContextKey = contextKeyClass;
             if (parseContextKey == null) {
-                parseContextKey = 
TikaModule.findContextKeyInterface(configClass);
+                parseContextKey = 
ComponentNameResolver.findContextKeyInterface(configClass);
             }
             if (parseContextKey == null) {
                 parseContextKey = configClass;
@@ -257,7 +232,7 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
                 LOG.warn("Failed to deserialize typed component '{}' as {}, 
storing as JSON config",
                         componentName, configClass.getName(), e);
                 // Use plain JSON mapper since main mapper may be binary (Smile)
-                parseContext.setJsonConfig(componentName, 
JSON_MAPPER.writeValueAsString(configNode));
+                parseContext.setJsonConfig(componentName, 
plainMapper().writeValueAsString(configNode));
             }
         }
     }
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
index d884f93553..1453b3483c 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
@@ -27,6 +27,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializerProvider;
 
 import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.serialization.ComponentNameResolver;
 
@@ -51,12 +52,8 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
     public static final String PARSE_CONTEXT = "parse-context";
     public static final String TYPED = "typed";
 
-    // Plain mapper for serializing values without TikaModule's component 
wrapping
-    private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper();
-
-    static {
-        // Allow serialization of classes with no properties
-        
PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS);
+    private static ObjectMapper plainMapper() {
+        return TikaObjectMapperFactory.getPlainMapper();
     }
 
     @Override
@@ -81,14 +78,10 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
                 continue;
             }
 
-            // Use the actual value's class for serialization, not the key 
class (which may be an interface)
-            // This ensures we can deserialize back to the concrete class
-            String valueClassName = value.getClass().getName();
-
             // Try to find a friendly component name for the value's class, 
otherwise use FQCN
-            String keyName = findComponentName(valueClassName);
+            String keyName = 
ComponentNameResolver.getFriendlyName(value.getClass());
             if (keyName == null) {
-                keyName = valueClassName;
+                keyName = value.getClass().getName();
             }
 
             if (!hasTypedObjects) {
@@ -99,7 +92,7 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
             gen.writeFieldName(keyName);
             // Use writeTree instead of writeRawValue for binary format support (e.g., Smile)
             // and stricter validation (fails early if value can't be serialized)
-            gen.writeTree(PLAIN_MAPPER.valueToTree(value));
+            gen.writeTree(plainMapper().valueToTree(value));
 
             // Track this name so we skip it in jsonConfigs
             serializedNames.add(keyName);
@@ -119,26 +112,10 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
             }
             gen.writeFieldName(entry.getKey());
             // Parse the JSON string into a tree for binary format support
-            gen.writeTree(PLAIN_MAPPER.readTree(entry.getValue().json()));
+            gen.writeTree(plainMapper().readTree(entry.getValue().json()));
         }
 
         gen.writeEndObject();
     }
 
-    /**
-     * Finds the component name for a class.
-     * Uses ComponentNameResolver for registry lookup. Only classes registered
-     * in a component registry will be serialized.
-     *
-     * @param className the fully qualified class name
-     * @return the component name, or null if not registered
-     */
-    private String findComponentName(String className) {
-        try {
-            Class<?> clazz = Class.forName(className);
-            return ComponentNameResolver.getFriendlyName(clazz);
-        } catch (ClassNotFoundException e) {
-            return null;
-        }
-    }
 }

Reply via email to