This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch serialization-take2 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6e95d5b43d25b4f05574218da50cddb15adca9d7 Author: tallison <[email protected]> AuthorDate: Wed Feb 25 20:14:56 2026 -0500 simplify serialization, take 2 --- tika-pipes/tika-pipes-api/pom.xml | 19 ++ .../java/org/apache/tika/pipes/api/ParseMode.java | 3 + .../tika/config/loader/ComponentInstantiator.java | 157 +++++++++++++- .../config/loader/TikaObjectMapperFactory.java | 29 +++ .../tika/serialization/ComponentNameResolver.java | 104 ++++++++- .../tika/serialization/ParseContextUtils.java | 32 +-- .../org/apache/tika/serialization/TikaModule.java | 238 ++------------------- .../serdes/ParseContextDeserializer.java | 47 +--- .../serdes/ParseContextSerializer.java | 37 +--- 9 files changed, 348 insertions(+), 318 deletions(-) diff --git a/tika-pipes/tika-pipes-api/pom.xml b/tika-pipes/tika-pipes-api/pom.xml index bf895ff91e..c37c5f9388 100644 --- a/tika-pipes/tika-pipes-api/pom.xml +++ b/tika-pipes/tika-pipes-api/pom.xml @@ -49,9 +49,28 @@ <version>${project.version}</version> <scope>provided</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> </dependencies> <build> <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <annotationProcessorPaths> + <path> + <groupId>${project.groupId}</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + </path> + </annotationProcessorPaths> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-jar-plugin</artifactId> diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java index e6127d5005..f90ae7a29a 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java @@ -18,12 +18,15 @@ package org.apache.tika.pipes.api; import java.util.Locale; +import org.apache.tika.config.TikaComponent; + /** * Controls how embedded documents are handled during parsing. * <p> * This can be set as a default in PipesConfig (loaded from tika-config.json) * or overridden per-file via ParseContext. */ +@TikaComponent(name = "parse-mode") public enum ParseMode { /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java index 39860928bd..f82daa581c 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java @@ -18,13 +18,22 @@ package org.apache.tika.config.loader; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.util.HashSet; +import java.util.Set; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.tika.config.Initializable; import org.apache.tika.config.JsonConfig; +import org.apache.tika.detect.DefaultDetector; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.apache.tika.serialization.ComponentNameResolver; import org.apache.tika.utils.ServiceLoaderUtils; @@ -123,12 +132,13 @@ public class ComponentInstantiator { // No JsonConfig constructor, fall back to other methods } - // Fall back to Jackson bean deserialization or zero-arg constructor + // Fall back to no-arg constructor + Jackson bean deserialization (readerForUpdating) + // Using readerForUpdating preserves defaults from the no-arg constructor, + // unlike treeToValue which would null out unspecified fields. T component; - if (configNode == null || configNode.isEmpty()) { - component = (T) componentClass.getDeclaredConstructor().newInstance(); - } else { - component = (T) objectMapper.treeToValue(configNode, componentClass); + component = (T) componentClass.getDeclaredConstructor().newInstance(); + if (configNode != null && !configNode.isEmpty()) { + objectMapper.readerForUpdating(component).readValue(configNode); } // Call initialize() on Initializable components @@ -170,6 +180,143 @@ public class ComponentInstantiator { } } + /** + * Instantiates a Tika component with full special-case handling. + * <p> + * This is the primary entry point for component instantiation from JSON configuration. + * Handles: + * <ul> + * <li>Type resolution via {@link ComponentNameResolver#resolveClass}</li> + * <li>Type compatibility validation against expectedType</li> + * <li>Special cases: DefaultParser/DefaultDetector rejection, MimeTypes singleton</li> + * <li>{@code _mime-include}/{@code _mime-exclude} extraction and stripping</li> + * <li>Three-step instantiation: JsonConfig ctor → readerForUpdating → no-arg</li> + * <li>{@link Initializable#initialize()} callback</li> + * <li>Parser MIME filter wrapping</li> + * </ul> + * + * @param typeName the component type name (friendly name or FQCN) + * @param configNode the JSON configuration node (may be null) + * @param mapper the ObjectMapper for deserialization + * @param classLoader the class loader for name resolution + * @param expectedType the expected interface/base type (for validation), or null to skip + * @return the instantiated component + * @throws TikaConfigException if instantiation fails + */ + @SuppressWarnings("unchecked") + public static <T> T instantiateComponent(String typeName, JsonNode configNode, + ObjectMapper mapper, ClassLoader classLoader, + Class<?> expectedType) + throws TikaConfigException { + // Resolve the class using ComponentNameResolver + Class<?> clazz; + try { + clazz = ComponentNameResolver.resolveClass(typeName, classLoader); + } catch (ClassNotFoundException e) { + throw new TikaConfigException("Unknown type: " + typeName, e); + } + + // Verify type compatibility + if (expectedType != null && !expectedType.isAssignableFrom(clazz)) { + throw new TikaConfigException("Type " + typeName + " (" + clazz.getName() + + ") is not assignable to " + expectedType.getName()); + } + + // DefaultParser and DefaultDetector must be loaded via TikaLoader + if (clazz == DefaultParser.class) { + throw new TikaConfigException("DefaultParser must be loaded via TikaLoader, not " + + "directly via Jackson deserialization. Use TikaLoader.load() to load configuration."); + } else if (clazz == DefaultDetector.class) { + throw new TikaConfigException("DefaultDetector must be loaded via TikaLoader, not " + + "directly via Jackson deserialization. Use TikaLoader.load() to load configuration."); + } + + // Extract mime filter fields before stripping them + Set<MediaType> includeTypes = extractMimeTypes(configNode, "_mime-include"); + Set<MediaType> excludeTypes = extractMimeTypes(configNode, "_mime-exclude"); + + // Strip decorator fields before passing to component + JsonNode cleanedConfig = stripDecoratorFields(configNode); + + try { + Object instance; + + if (clazz == MimeTypes.class) { + // MimeTypes must use the singleton to have all type definitions loaded + instance = MimeTypes.getDefaultMimeTypes(); + } else if (cleanedConfig == null || cleanedConfig.isEmpty()) { + // If no config, use default constructor + instance = clazz.getDeclaredConstructor().newInstance(); + } else { + // Try JsonConfig constructor first + Constructor<?> jsonConfigCtor = findJsonConfigConstructor(clazz); + if (jsonConfigCtor != null) { + // Use plain JSON mapper since the main mapper may be binary (Smile) + String json = TikaObjectMapperFactory.getPlainMapper() + .writeValueAsString(cleanedConfig); + instance = jsonConfigCtor.newInstance((JsonConfig) () -> json); + } else { + // Fall back to no-arg constructor + Jackson bean deserialization + instance = clazz.getDeclaredConstructor().newInstance(); + mapper.readerForUpdating(instance).readValue(cleanedConfig); + } + } + + // Call initialize() on Initializable components + initializeIfNeeded(instance); + + // Wrap parser with mime filtering if include/exclude types specified + if (instance instanceof Parser && (!includeTypes.isEmpty() || !excludeTypes.isEmpty())) { + instance = ParserDecorator.withMimeFilters( + (Parser) instance, includeTypes, excludeTypes); + } + + return (T) instance; + + } catch (TikaConfigException e) { + throw e; + } catch (Exception e) { + throw new TikaConfigException("Failed to instantiate: " + typeName, e); + } + } + + private static Set<MediaType> extractMimeTypes(JsonNode configNode, String fieldName) { + Set<MediaType> types = new HashSet<>(); + if (configNode == null || !configNode.has(fieldName)) { + return types; + } + JsonNode arrayNode = configNode.get(fieldName); + if (arrayNode.isArray()) { + for (JsonNode typeNode : arrayNode) { + types.add(MediaType.parse(typeNode.asText())); + } + } + return types; + } + + private static Constructor<?> findJsonConfigConstructor(Class<?> clazz) { + try { + return clazz.getConstructor(JsonConfig.class); + } catch (NoSuchMethodException e) { + return null; + } + } + + /** + * Strips decorator fields (_mime-include, _mime-exclude) from config node. + * These fields are handled by TikaLoader for wrapping, not by the component itself. + * Note: _exclude is NOT stripped as it's used by DefaultParser for SPI exclusions. + */ + private static JsonNode stripDecoratorFields(JsonNode configNode) { + if (configNode == null || !configNode.isObject()) { + return configNode; + } + ObjectNode cleaned = configNode.deepCopy(); + cleaned.remove("_mime-include"); + cleaned.remove("_mime-exclude"); + return cleaned; + } + /** * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). * diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java index e832dc8d4b..0a24e71705 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -55,6 +55,35 @@ public class TikaObjectMapperFactory { private static ObjectMapper MAPPER = null; + // Shared plain ObjectMapper (no TikaModule) for converting JsonNodes to JSON strings. + // Needed because the main mapper may use a binary format (e.g., Smile) + // which doesn't support writeValueAsString(). + private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper(); + + static { + // Components with no bean properties (e.g., parsers with no configuration) + // need to serialize as empty objects rather than throwing. + PLAIN_MAPPER.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS); + } + + /** + * Returns a shared plain ObjectMapper without TikaModule registration. + * <p> + * This mapper is suitable for: + * <ul> + * <li>Converting JsonNodes to JSON strings</li> + * <li>Serializing component properties without compact format wrapping</li> + * <li>Avoiding infinite recursion when serializing inside TikaModule</li> + * </ul> + * <p> + * Has {@code FAIL_ON_EMPTY_BEANS} disabled to allow serialization of classes with no properties. + * + * @return the shared plain ObjectMapper + */ + public static ObjectMapper getPlainMapper() { + return PLAIN_MAPPER; + } + public static synchronized ObjectMapper getMapper() { if (MAPPER == null) { MAPPER = createMapper(); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java index 195cfd6df0..b1e1d6673a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java @@ -17,6 +17,7 @@ package org.apache.tika.serialization; import java.util.Collections; +import java.util.HashSet; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -24,7 +25,19 @@ import java.util.concurrent.ConcurrentHashMap; import org.apache.tika.config.loader.ComponentInfo; import org.apache.tika.config.loader.ComponentRegistry; +import org.apache.tika.detect.Detector; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.digest.DigesterFactory; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; +import org.apache.tika.extractor.UnpackSelector; +import org.apache.tika.language.translate.Translator; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; +import org.apache.tika.parser.Parser; +import org.apache.tika.renderer.Renderer; +import org.apache.tika.sax.ContentHandlerDecoratorFactory; +import org.apache.tika.sax.ContentHandlerFactory; /** * Utility class that resolves friendly component names to classes using ComponentRegistry. @@ -37,6 +50,29 @@ import org.apache.tika.exception.TikaConfigException; */ public final class ComponentNameResolver { + /** + * Interfaces that use compact format serialization and serve as ParseContext keys. + * Types implementing these interfaces will be serialized as: + * - "type-name" for defaults + * - {"type-name": {...}} for configured instances + */ + private static final Set<Class<?>> CONTEXT_KEY_INTERFACES = new HashSet<>(); + + static { + CONTEXT_KEY_INTERFACES.add(Parser.class); + CONTEXT_KEY_INTERFACES.add(Detector.class); + CONTEXT_KEY_INTERFACES.add(EncodingDetector.class); + CONTEXT_KEY_INTERFACES.add(MetadataFilter.class); + CONTEXT_KEY_INTERFACES.add(Translator.class); + CONTEXT_KEY_INTERFACES.add(Renderer.class); + CONTEXT_KEY_INTERFACES.add(DigesterFactory.class); + CONTEXT_KEY_INTERFACES.add(EmbeddedDocumentExtractorFactory.class); + CONTEXT_KEY_INTERFACES.add(MetadataWriteLimiterFactory.class); + CONTEXT_KEY_INTERFACES.add(ContentHandlerDecoratorFactory.class); + CONTEXT_KEY_INTERFACES.add(ContentHandlerFactory.class); + CONTEXT_KEY_INTERFACES.add(UnpackSelector.class); + } + private static final Map<String, ComponentRegistry> REGISTRIES = new ConcurrentHashMap<>(); // Component configuration storage (keyed by JSON field name and by component class) @@ -77,7 +113,10 @@ public final class ComponentNameResolver { } } } - return Class.forName(name, false, classLoader); + throw new ClassNotFoundException( + "Component '" + name + "' is not registered. " + + "Components must be registered via @TikaComponent annotation or .idx file. " + + "Arbitrary class names are not allowed for security reasons."); } /** @@ -204,6 +243,69 @@ public final class ComponentNameResolver { return Collections.unmodifiableSet(FIELD_TO_CONFIG.keySet()); } + // ==================== Context Key Resolution Methods ==================== + + /** + * Returns the set of interfaces that use compact format serialization. + * + * @return unmodifiable set of context key interfaces + */ + public static Set<Class<?>> getContextKeyInterfaces() { + return Collections.unmodifiableSet(CONTEXT_KEY_INTERFACES); + } + + /** + * Finds the appropriate context key interface for a given type. + * This is used to determine which interface should be used as the ParseContext key + * when storing instances of this type. + * + * @param type the type to find the context key for + * @return the interface to use as context key, or null if none found + */ + public static Class<?> findContextKeyInterface(Class<?> type) { + for (Class<?> iface : CONTEXT_KEY_INTERFACES) { + if (iface.isAssignableFrom(type)) { + return iface; + } + } + return null; + } + + /** + * Checks if a type should use compact format serialization. + * Returns true if the type implements any of the registered context key interfaces. + * + * @param type the type to check + * @return true if the type uses compact format + */ + public static boolean usesCompactFormat(Class<?> type) { + return findContextKeyInterface(type) != null; + } + + /** + * Determines the ParseContext key for a component. + * <p> + * Resolution order: + * <ol> + * <li>Explicit contextKey from .idx file (via @TikaComponent annotation)</li> + * <li>Auto-detect from implemented interfaces (using CONTEXT_KEY_INTERFACES)</li> + * <li>Fall back to the component class itself</li> + * </ol> + * + * @param info the component info + * @return the class to use as ParseContext key + */ + public static Class<?> determineContextKey(ComponentInfo info) { + if (info.contextKey() != null) { + return info.contextKey(); + } + Class<?> interfaceKey = findContextKeyInterface(info.componentClass()); + if (interfaceKey != null) { + return interfaceKey; + } + return info.componentClass(); + } + /** * Gets the contextKey for a class from the component registry. * The contextKey is recorded in the .idx file by the annotation processor. diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java index c95b9fe275..c563b47646 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -140,7 +140,7 @@ public class ParseContextUtils { } // Determine the context key - Class<?> contextKey = determineContextKey(info); + Class<?> contextKey = ComponentNameResolver.determineContextKey(info); try { // Deserialize and cache in resolvedConfigs, also add to context @@ -157,36 +157,6 @@ public class ParseContextUtils { } } - /** - * Determines the ParseContext key for a component. - * <p> - * Resolution order: - * <ol> - * <li>Explicit contextKey from .idx file (via @TikaComponent annotation)</li> - * <li>Auto-detect from implemented interfaces (using TikaModule.COMPACT_FORMAT_INTERFACES)</li> - * <li>Fall back to the component class itself</li> - * </ol> - * <p> - * Security note: This only determines the context key - it does NOT affect which - * classes can be instantiated. Classes must still be registered via @TikaComponent. - * - * @param info the component info - * @return the class to use as ParseContext key - */ - private static Class<?> determineContextKey(ComponentInfo info) { - // Use explicit contextKey from .idx file if specified - if (info.contextKey() != null) { - return info.contextKey(); - } - // Auto-detect from implemented interfaces at runtime - Class<?> contextKeyInterface = TikaModule.findContextKeyInterface(info.componentClass()); - if (contextKeyInterface != null) { - return contextKeyInterface; - } - // Fall back to the component class itself - return info.componentClass(); - } - /** * Resolves an array config entry (e.g., "metadata-filters") to a composite component. * <p> diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java index 8277632830..63ea711796 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java @@ -17,10 +17,8 @@ package org.apache.tika.serialization; import java.io.IOException; -import java.lang.reflect.Constructor; import java.lang.reflect.Method; import java.lang.reflect.Modifier; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; @@ -36,36 +34,21 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationConfig; -import com.fasterxml.jackson.databind.SerializationFeature; import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.deser.Deserializers; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.databind.ser.Serializers; -import org.apache.tika.config.Initializable; -import org.apache.tika.config.JsonConfig; -import org.apache.tika.config.SelfConfiguring; +import org.apache.tika.config.loader.ComponentInstantiator; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.detect.DefaultDetector; -import org.apache.tika.detect.Detector; -import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.digest.DigesterFactory; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; -import org.apache.tika.extractor.UnpackSelector; -import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; -import org.apache.tika.renderer.Renderer; -import org.apache.tika.sax.ContentHandlerDecoratorFactory; -import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.serdes.DefaultDetectorSerializer; import org.apache.tika.serialization.serdes.DefaultParserSerializer; import org.apache.tika.serialization.serdes.MetadataDeserializer; @@ -91,64 +74,6 @@ public class TikaModule extends SimpleModule { private static ObjectMapper sharedMapper; - // Plain JSON mapper for converting JsonNodes to JSON strings. - // This is needed because the main mapper may use a binary format (e.g., Smile) - // which doesn't support writeValueAsString(). - private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); - - /** - * Interfaces that use compact format serialization. - * Types implementing these interfaces will be serialized as: - * - "type-name" for defaults - * - {"type-name": {...}} for configured instances - */ - private static final Set<Class<?>> COMPACT_FORMAT_INTERFACES = new HashSet<>(); - - static { - // Core component interfaces that use compact format - COMPACT_FORMAT_INTERFACES.add(Parser.class); - COMPACT_FORMAT_INTERFACES.add(Detector.class); - COMPACT_FORMAT_INTERFACES.add(EncodingDetector.class); - COMPACT_FORMAT_INTERFACES.add(MetadataFilter.class); - COMPACT_FORMAT_INTERFACES.add(Translator.class); - COMPACT_FORMAT_INTERFACES.add(Renderer.class); - COMPACT_FORMAT_INTERFACES.add(DigesterFactory.class); - COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class); - COMPACT_FORMAT_INTERFACES.add(MetadataWriteLimiterFactory.class); - COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class); - COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class); - COMPACT_FORMAT_INTERFACES.add(UnpackSelector.class); - } - - /** - * Checks if a type should use compact format serialization. - * Returns true if the type implements any of the registered compact format interfaces. - */ - private static boolean usesCompactFormat(Class<?> type) { - return findContextKeyInterface(type) != null; - } - - /** - * Finds the appropriate context key interface for a given type. - * This is used to determine which interface should be used as the ParseContext key - * when storing instances of this type. - * <p> - * Security note: This method only helps determine the context key - it does NOT - * affect which classes can be instantiated. Classes must still be registered - * via @TikaComponent to be deserializable. - * - * @param type the type to find the context key for - * @return the interface to use as context key, or null if none found - */ - public static Class<?> findContextKeyInterface(Class<?> type) { - for (Class<?> iface : COMPACT_FORMAT_INTERFACES) { - if (iface.isAssignableFrom(type)) { - return iface; - } - } - return null; - } - public TikaModule() { super("TikaModule"); @@ -220,7 +145,8 @@ public class TikaModule extends SimpleModule { // Concrete implementations (like ExternalParser, HtmlParser) should use normal // Jackson bean deserialization for their properties. if (rawClass.isInterface() || Modifier.isAbstract(rawClass.getModifiers())) { - if (COMPACT_FORMAT_INTERFACES.contains(rawClass) || usesCompactFormat(rawClass)) { + if (ComponentNameResolver.getContextKeyInterfaces().contains(rawClass) || + ComponentNameResolver.usesCompactFormat(rawClass)) { return new TikaComponentDeserializer(rawClass); } } @@ -253,7 +179,8 @@ public class TikaModule extends SimpleModule { // Only serialize with compact format if type implements a compact format interface // AND has a registered friendly name - if (usesCompactFormat(rawClass) && ComponentNameResolver.getFriendlyName(rawClass) != null) { + if (ComponentNameResolver.usesCompactFormat(rawClass) && + ComponentNameResolver.getFriendlyName(rawClass) != null) { return new TikaComponentSerializer(); } @@ -263,6 +190,7 @@ public class TikaModule extends SimpleModule { /** * Deserializer that handles both string and object formats for Tika components. + * Delegates to {@link ComponentInstantiator#instantiateComponent} for instantiation. */ private static class TikaComponentDeserializer extends JsonDeserializer<Object> { private final Class<?> expectedType; @@ -281,14 +209,15 @@ public class TikaModule extends SimpleModule { "Call TikaModule.setSharedMapper() before deserializing."); } + String typeName; + JsonNode configNode; + if (node.isTextual()) { - // Simple string format: "pdf-parser" - String typeName = node.asText(); - return instantiate(typeName, null, mapper); + typeName = node.asText(); + configNode = null; } else if (node.isObject()) { Iterator<Map.Entry<String, JsonNode>> fields = node.fields(); if (!fields.hasNext()) { - // Empty object {} - try to create default instance if expectedType is concrete try { return expectedType.getDeclaredConstructor().newInstance(); } catch (ReflectiveOperationException e) { @@ -297,136 +226,19 @@ public class TikaModule extends SimpleModule { } } Map.Entry<String, JsonNode> entry = fields.next(); - return instantiate(entry.getKey(), entry.getValue(), mapper); + typeName = entry.getKey(); + configNode = entry.getValue(); } else { throw new IOException("Expected string or object for " + expectedType.getSimpleName() + ", got: " + node.getNodeType()); } - } - - private Object instantiate(String typeName, JsonNode configNode, ObjectMapper mapper) throws IOException { - // Resolve the class using ComponentNameResolver - Class<?> clazz; - try { - clazz = ComponentNameResolver.resolveClass(typeName, - Thread.currentThread().getContextClassLoader()); - } catch (ClassNotFoundException e) { - throw new IOException("Unknown type: " + typeName, e); - } - - // Verify type compatibility - if (!expectedType.isAssignableFrom(clazz)) { - throw new IOException("Type " + typeName + " (" + clazz.getName() + - ") is not assignable to " + expectedType.getName()); - } - - // Extract mime filter fields before stripping them - Set<MediaType> includeTypes = extractMimeTypes(configNode, "_mime-include"); - Set<MediaType> excludeTypes = extractMimeTypes(configNode, "_mime-exclude"); - - // Strip decorator fields before passing to component - JsonNode cleanedConfig = stripDecoratorFields(configNode); try { - Object instance; - - // DefaultParser and DefaultDetector must be loaded via TikaLoader for proper dependency injection - if (clazz == DefaultParser.class) { - throw new IOException("DefaultParser must be loaded via TikaLoader, not directly " + - "via Jackson deserialization. Use TikaLoader.load() to load configuration."); - } else if (clazz == DefaultDetector.class) { - throw new IOException("DefaultDetector must be loaded via TikaLoader, not directly " + - "via Jackson deserialization. Use TikaLoader.load() to load configuration."); - } else if (clazz == MimeTypes.class) { - // MimeTypes must use the singleton to have all type definitions loaded - instance = MimeTypes.getDefaultMimeTypes(); - } else if (cleanedConfig == null || cleanedConfig.isEmpty()) { - // If no config, use default constructor - instance = clazz.getDeclaredConstructor().newInstance(); - } else { - // Try JsonConfig constructor first (works for any component) - Constructor<?> jsonConfigCtor = findJsonConfigConstructor(clazz); - if (jsonConfigCtor != null) { - // Use plain JSON mapper since the main mapper may be binary (Smile) - String json = JSON_MAPPER.writeValueAsString(cleanedConfig); - instance = jsonConfigCtor.newInstance((JsonConfig) () -> json); - } else { - // Fall back to no-arg constructor + Jackson bean deserialization - instance = clazz.getDeclaredConstructor().newInstance(); - mapper.readerForUpdating(instance).readValue(cleanedConfig); - } - } - - // Call initialize() on Initializable components - if (instance instanceof Initializable) { - try { - ((Initializable) instance).initialize(); - } catch (TikaConfigException e) { - throw new IOException("Failed to initialize " + typeName, e); - } - } - - // Wrap parser with mime filtering if include/exclude types specified - if (instance instanceof Parser && (!includeTypes.isEmpty() || !excludeTypes.isEmpty())) { - instance = ParserDecorator.withMimeFilters((Parser) instance, includeTypes, excludeTypes); - } - - return instance; - - } catch (ReflectiveOperationException e) { - throw new IOException("Failed to instantiate: " + typeName, e); - } - } - - private Set<MediaType> extractMimeTypes(JsonNode configNode, String fieldName) { - Set<MediaType> types = new HashSet<>(); - if (configNode == null || !configNode.has(fieldName)) { - return types; - } - JsonNode arrayNode = configNode.get(fieldName); - if (arrayNode.isArray()) { - for (JsonNode typeNode : arrayNode) { - types.add(MediaType.parse(typeNode.asText())); - } - } - return types; - } - - private Constructor<?> findJsonConfigConstructor(Class<?> clazz) { - try { - return clazz.getConstructor(JsonConfig.class); - } catch (NoSuchMethodException e) { - return null; - } - } - - /** - * Deserializes a JsonNode using a dedicated deserializer. - */ - private <T> T deserializeWithNode(JsonDeserializer<T> deserializer, JsonNode node, - ObjectMapper mapper) throws IOException { - if (node == null) { - node = mapper.createObjectNode(); - } - try (JsonParser p = mapper.treeAsTokens(node)) { - p.nextToken(); - return deserializer.deserialize(p, mapper.getDeserializationContext()); - } - } - - /** - * Strips decorator fields (_mime-include, _mime-exclude) from config node. - * These fields are handled by TikaLoader for wrapping, not by the component itself. - * Note: _exclude is NOT stripped as it's used by DefaultParser for SPI exclusions. - */ - private JsonNode stripDecoratorFields(JsonNode configNode) { - if (configNode == null || !configNode.isObject()) { - return configNode; + return ComponentInstantiator.instantiateComponent(typeName, configNode, + mapper, Thread.currentThread().getContextClassLoader(), expectedType); + } catch (TikaConfigException e) { + throw new IOException(e.getMessage(), e); } - ObjectNode cleaned = configNode.deepCopy(); - cleaned.remove("_mime-include"); - cleaned.remove("_mime-exclude"); - return cleaned; } } @@ -435,12 +247,8 @@ public class TikaModule extends SimpleModule { * Outputs simple string if using defaults, object with type key if configured. */ private static class TikaComponentSerializer extends JsonSerializer<Object> { - // Plain mapper for serializing without TikaModule (avoids infinite recursion) - private final ObjectMapper plainMapper; TikaComponentSerializer() { - this.plainMapper = new ObjectMapper(); - this.plainMapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS); } @Override @@ -507,8 +315,8 @@ public class TikaModule extends SimpleModule { // Create default config to compare against Object defaultConfig = config.getClass().getDeclaredConstructor().newInstance(); - ObjectNode configNode = plainMapper.valueToTree(config); - ObjectNode defaultNode = plainMapper.valueToTree(defaultConfig); + ObjectNode configNode = TikaObjectMapperFactory.getPlainMapper().valueToTree(config); + ObjectNode defaultNode = TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultConfig); // Only keep properties that differ from defaults ObjectNode result = mapper.createObjectNode(); @@ -525,10 +333,10 @@ public class TikaModule extends SimpleModule { // No config object - serialize the component directly Object defaultInstance = value.getClass().getDeclaredConstructor().newInstance(); - ObjectNode valueNode = plainMapper.valueToTree(value); - ObjectNode defaultNode = plainMapper.valueToTree(defaultInstance); + ObjectNode valueNode = TikaObjectMapperFactory.getPlainMapper().valueToTree(value); + ObjectNode defaultNode = TikaObjectMapperFactory.getPlainMapper().valueToTree(defaultInstance); - ObjectNode result = plainMapper.createObjectNode(); + ObjectNode result = TikaObjectMapperFactory.getPlainMapper().createObjectNode(); Iterator<Map.Entry<String, JsonNode>> fields = valueNode.fields(); while (fields.hasNext()) { Map.Entry<String, JsonNode> field = fields.next(); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index c8141c47d9..3e526f7b88 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -34,9 +34,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.loader.ComponentInfo; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.parser.ParseContext; import org.apache.tika.serialization.ComponentNameResolver; -import org.apache.tika.serialization.TikaModule; /** * Deserializes ParseContext from JSON. @@ -61,10 +61,9 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { private static final Logger LOG = LoggerFactory.getLogger(ParseContextDeserializer.class); - // Plain JSON mapper for converting JsonNodes to JSON strings. - // This is needed because the main mapper may use a binary format (e.g., Smile) - // which doesn't support writeValueAsString(). - private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); + private static ObjectMapper plainMapper() { + return TikaObjectMapperFactory.getPlainMapper(); + } @Override public ParseContext deserialize(JsonParser jsonParser, DeserializationContext ctxt) @@ -120,7 +119,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { // Store as JSON config for lazy resolution // Use plain JSON mapper since the main mapper may be binary (Smile) - String json = JSON_MAPPER.writeValueAsString(value); + String json = plainMapper().writeValueAsString(value); parseContext.setJsonConfig(name, json); } } @@ -128,21 +127,6 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } - /** - * Determines the context key for a component. - * Uses explicit contextKey if available, otherwise auto-detects from interfaces. - */ - private static Class<?> determineContextKey(ComponentInfo info) { - if (info.contextKey() != null) { - return info.contextKey(); - } - Class<?> interfaceKey = TikaModule.findContextKeyInterface(info.componentClass()); - if (interfaceKey != null) { - return interfaceKey; - } - return info.componentClass(); - } - /** * Checks if a JSON config entry would create a duplicate context key. * <p> @@ -172,7 +156,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return; } - Class<?> contextKey = determineContextKey(info); + Class<?> contextKey = ComponentNameResolver.determineContextKey(info); String existingName = seenContextKeys.get(contextKey); if (existingName != null) { @@ -215,25 +199,16 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { contextKeyClass = info.contextKey(); } - // If not found in registry, try as fully qualified class name + // If not found in registry, reject — components must be registered if (configClass == null) { - try { - configClass = Class.forName(componentName); - // Check if the class has a contextKey via its annotation - contextKeyClass = ComponentNameResolver.getContextKey(configClass); - } catch (ClassNotFoundException e) { - LOG.warn("Could not find class for typed component '{}', storing as JSON config", - componentName); - // Fall back to storing as JSON config (use plain JSON mapper) - parseContext.setJsonConfig(componentName, JSON_MAPPER.writeValueAsString(configNode)); - continue; - } + throw new IOException("Unknown typed component '" + componentName + "'. " + + "Components must be registered via @TikaComponent annotation or .idx file."); } // Determine context key: explicit > interface detection > class itself Class<?> parseContextKey = contextKeyClass; if (parseContextKey == null) { - parseContextKey = TikaModule.findContextKeyInterface(configClass); + parseContextKey = ComponentNameResolver.findContextKeyInterface(configClass); } if (parseContextKey == null) { parseContextKey = configClass; @@ -257,7 +232,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { LOG.warn("Failed to deserialize typed component '{}' as {}, storing as JSON config", componentName, configClass.getName(), e); // Use plain JSON mapper since main mapper may be binary (Smile) - parseContext.setJsonConfig(componentName, JSON_MAPPER.writeValueAsString(configNode)); + parseContext.setJsonConfig(componentName, plainMapper().writeValueAsString(configNode)); } } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java index d884f93553..1453b3483c 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java @@ -27,6 +27,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializerProvider; import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.parser.ParseContext; import org.apache.tika.serialization.ComponentNameResolver; @@ -51,12 +52,8 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { public static final String PARSE_CONTEXT = "parse-context"; public static final String TYPED = "typed"; - // Plain mapper for serializing values without TikaModule's component wrapping - private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper(); - - static { - // Allow serialization of classes with no properties - PLAIN_MAPPER.disable(com.fasterxml.jackson.databind.SerializationFeature.FAIL_ON_EMPTY_BEANS); + private static ObjectMapper plainMapper() { + return TikaObjectMapperFactory.getPlainMapper(); } @Override @@ -81,14 +78,10 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { continue; } - // Use the actual value's class for serialization, not the key class (which may be an interface) - // This ensures we can deserialize back to the concrete class - String valueClassName = value.getClass().getName(); - // Try to find a friendly component name for the value's class, otherwise use FQCN - String keyName = findComponentName(valueClassName); + String keyName = ComponentNameResolver.getFriendlyName(value.getClass()); if (keyName == null) { - keyName = valueClassName; + keyName = value.getClass().getName(); } if (!hasTypedObjects) { @@ -99,7 +92,7 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { gen.writeFieldName(keyName); // Use writeTree instead of writeRawValue for binary format support (e.g., Smile) // and stricter validation (fails early if value can't be serialized) - gen.writeTree(PLAIN_MAPPER.valueToTree(value)); + gen.writeTree(plainMapper().valueToTree(value)); // Track this name so we skip it in jsonConfigs serializedNames.add(keyName); @@ -119,26 +112,10 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { } gen.writeFieldName(entry.getKey()); // Parse the JSON string into a tree for binary format support - gen.writeTree(PLAIN_MAPPER.readTree(entry.getValue().json())); + gen.writeTree(plainMapper().readTree(entry.getValue().json())); } gen.writeEndObject(); } - /** - * Finds the component name for a class. - * Uses ComponentNameResolver for registry lookup. Only classes registered - * in a component registry will be serialized. - * - * @param className the fully qualified class name - * @return the component name, or null if not registered - */ - private String findComponentName(String className) { - try { - Class<?> clazz = Class.forName(className); - return ComponentNameResolver.getFriendlyName(clazz); - } catch (ClassNotFoundException e) { - return null; - } - } }
