This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545-mixins in repository https://gitbox.apache.org/repos/asf/tika.git
commit 199b4e8654bcdddfbf72583a52729d5cd8efd5bb Author: tallison <[email protected]> AuthorDate: Thu Dec 11 14:48:36 2025 -0500 TIKA-4545 -- refactoring of serialization --- .../org/apache/tika/config/ConfigDeserializer.java | 23 ++- .../main/java/org/apache/tika/detect/Detector.java | 3 +- .../org/apache/tika/detect/EncodingDetector.java | 3 +- .../ParsingEmbeddedDocumentExtractorFactory.java | 2 + .../tika/extractor/RUnpackExtractorFactory.java | 27 ++- .../apache/tika/language/translate/Translator.java | 3 +- .../writefilter/StandardWriteFilterFactory.java | 17 +- .../main/java/org/apache/tika/parser/Parser.java | 3 +- .../java/org/apache/tika/renderer/Renderer.java | 3 +- .../parser/digestutils/CommonsDigesterFactory.java | 2 + .../tika/parser/ocr/TesseractOCRParserTest.java | 4 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +- .../src/test/resources/configs/tika-4533.json | 7 +- .../configs/tika-config-digests-pdf-only.json | 7 +- .../tika-config-digests-skip-container.json | 9 +- .../resources/configs/tika-config-digests.json | 7 +- .../test/resources/configs/tika-unrar-config.json | 2 +- .../tika/config/TIKA-1702-translator-default.json | 2 +- .../config/TIKA-1702-translator-empty-default.json | 2 +- .../tika/config/TIKA-1702-translator-empty.json | 2 +- .../org/apache/tika/async/cli/PluginsWriter.java | 4 +- .../tika/pipes/core/AbstractComponentManager.java | 4 +- .../pipes/core/emitter/EmitterManagerTest.java | 2 +- .../pipes/core/fetcher/FetcherManagerTest.java | 2 +- .../tika/config/loader/ComponentInstantiator.java | 73 ++++++++ .../config/loader/CompositeComponentLoader.java | 82 +-------- .../apache/tika/config/loader/ConfigLoader.java | 18 +- .../apache/tika/config/loader/DetectorLoader.java | 131 +++++--------- .../tika/config/loader/EncodingDetectorLoader.java | 114 +++++------- .../apache/tika/config/loader/FrameworkConfig.java | 13 +- .../apache/tika/config/loader/ParserLoader.java | 115 +++--------- .../loader/PolymorphicObjectMapperFactory.java | 155 ---------------- .../apache/tika/config/loader/TikaJsonConfig.java | 2 +- .../org/apache/tika/config/loader/TikaLoader.java | 7 +- .../config/loader/TikaObjectMapperFactory.java | 112 ++++++++++++ .../tika/config/loader/TranslatorLoader.java | 57 +++--- .../tika/serialization/ComponentNameResolver.java | 88 +++++++++ .../tika/serialization/ConfigDeserializer.java | 4 +- .../serialization/ParseContextDeserializer.java | 105 +++++++++-- .../tika/serialization/ParseContextSerializer.java | 32 ++-- .../tika/serialization/ParseContextUtils.java | 20 +- .../tika/serialization/TikaAbstractTypeMixins.java | 201 +++++++++++++++++++++ .../tika/config/loader/ConfigLoaderTest.java | 22 +-- .../CustomClassSerializationTest.java | 4 +- .../TestParseContextSerialization.java | 57 +++++- .../test/resources/configs/test-config-loader.json | 3 +- .../resources/configs/test-translator-config.json | 2 +- 47 files changed, 925 insertions(+), 636 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java index 18ab2ff12a..dd5d12d778 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java @@ -22,6 +22,10 @@ import java.lang.reflect.Method; * Utility for deserializing JSON configuration without compile-time dependency on Jackson. * <p> * This class uses reflection to call Jackson's ObjectMapper when available on the classpath. + * If tika-serialization is available, it uses the configured ObjectMapper from + * {@code TikaObjectMapperFactory} to ensure consistent behavior with ParseContext + * serialization. Otherwise, it falls back to a plain ObjectMapper. + * <p> * If Jackson is not available and JSON deserialization is attempted, it throws a clear error message. * <p> * Usage pattern in parsers, detectors, and other Tika components: @@ -62,12 +66,19 @@ public class ConfigDeserializer { Method method = null; try { clazz = Class.forName("com.fasterxml.jackson.databind.ObjectMapper"); - // Use a plain ObjectMapper for simple config deserialization. - // The polymorphic mapper from tika-serialization is meant for ParseContext - // serialization with actual polymorphic types, not for simple config classes. - //TODO -- we need to revisit this. We should be using the same object mapper for - //config files and for runtime configs - instance = clazz.getDeclaredConstructor().newInstance(); + + // Try to use TikaObjectMapperFactory from tika-serialization if available. + // This ensures we use the same configured ObjectMapper as ParseContext serialization. + try { + Class<?> factoryClass = Class.forName( + "org.apache.tika.config.loader.TikaObjectMapperFactory"); + Method getMapperMethod = factoryClass.getMethod("getMapper"); + instance = getMapperMethod.invoke(null); + } catch (Exception e) { + // tika-serialization not on classpath, fall back to plain ObjectMapper + instance = clazz.getDeclaredConstructor().newInstance(); + } + method = clazz.getMethod("readValue", String.class, Class.class); } catch (Exception e) { // Jackson not on classpath - will fail at runtime if JSON deserialization is attempted diff --git a/tika-core/src/main/java/org/apache/tika/detect/Detector.java b/tika-core/src/main/java/org/apache/tika/detect/Detector.java index fc237aa5aa..3d513f042a 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/Detector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/Detector.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.Serializable; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -30,7 +31,7 @@ import org.apache.tika.mime.MediaType; * * @since Apache Tika 0.3 */ -public interface Detector extends Serializable { +public interface Detector extends Serializable, SelfConfiguring { /** * Detects the content type of the given input document. Returns diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java index 9dbad4c277..25f7bfa9d5 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.io.Serializable; import java.nio.charset.Charset; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.metadata.Metadata; /** @@ -30,7 +31,7 @@ import org.apache.tika.metadata.Metadata; * * @since Apache Tika 0.4 */ -public interface EncodingDetector extends Serializable { +public interface EncodingDetector extends Serializable, SelfConfiguring { /** * Detects the character encoding of the given text document, or diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index 9136228c4a..f1dfa071fe 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -17,9 +17,11 @@ package org.apache.tika.extractor; import org.apache.tika.config.Field; +import org.apache.tika.config.TikaComponent; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +@TikaComponent public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java index 5813ed3abb..ef46771ec8 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java @@ -17,6 +17,7 @@ package org.apache.tika.extractor; import java.util.HashSet; +import java.util.Set; import org.apache.tika.config.Field; import org.apache.tika.config.TikaComponent; @@ -30,11 +31,10 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l * 1024l * 1024l; private boolean writeFileNameToContent = true; - //concrete HashSet class for the sake of Jackson - private HashSet<String> embeddedBytesIncludeMimeTypes = new HashSet<>(); - private HashSet<String> embeddedBytesExcludeMimeTypes = new HashSet<>(); - private HashSet<String> embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); - private HashSet<String> embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); + private Set<String> embeddedBytesIncludeMimeTypes = new HashSet<>(); + private Set<String> embeddedBytesExcludeMimeTypes = new HashSet<>(); + private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); + private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION; @Field @@ -43,23 +43,22 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract } @Field - public void setEmbeddedBytesIncludeMimeTypes(HashSet<String> includeMimeTypes) { + public void setEmbeddedBytesIncludeMimeTypes(Set<String> includeMimeTypes) { embeddedBytesIncludeMimeTypes = new HashSet<>(includeMimeTypes); } @Field - public void setEmbeddedBytesExcludeMimeTypes(HashSet<String> excludeMimeTypes) { + public void setEmbeddedBytesExcludeMimeTypes(Set<String> excludeMimeTypes) { embeddedBytesExcludeMimeTypes = new HashSet<>(excludeMimeTypes); - } @Field - public void setEmbeddedBytesIncludeEmbeddedResourceTypes(HashSet<String> includeAttachmentTypes) { + public void setEmbeddedBytesIncludeEmbeddedResourceTypes(Set<String> includeAttachmentTypes) { embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(includeAttachmentTypes); } @Field - public void setEmbeddedBytesExcludeEmbeddedResourceTypes(HashSet<String> excludeAttachmentTypes) { + public void setEmbeddedBytesExcludeEmbeddedResourceTypes(Set<String> excludeAttachmentTypes) { embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(excludeAttachmentTypes); } @@ -84,19 +83,19 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract return writeFileNameToContent; } - public HashSet<String> getEmbeddedBytesIncludeMimeTypes() { + public Set<String> getEmbeddedBytesIncludeMimeTypes() { return embeddedBytesIncludeMimeTypes; } - public HashSet<String> getEmbeddedBytesExcludeMimeTypes() { + public Set<String> getEmbeddedBytesExcludeMimeTypes() { return embeddedBytesExcludeMimeTypes; } - public HashSet<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() { + public Set<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() { return embeddedBytesIncludeEmbeddedResourceTypes; } - public HashSet<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() { + public Set<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() { return embeddedBytesExcludeEmbeddedResourceTypes; } diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java index 563e6c4fc5..a349d9a16d 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java @@ -18,6 +18,7 @@ package org.apache.tika.language.translate; import java.io.IOException; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.exception.TikaException; /** @@ -25,7 +26,7 @@ import org.apache.tika.exception.TikaException; * * @since Tika 1.6 */ -public interface Translator { +public interface Translator extends SelfConfiguring { /** * Translate text between given languages. * diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java index b5d8a0288b..877dc47a0e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java @@ -17,11 +17,15 @@ package org.apache.tika.metadata.writefilter; import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.config.TikaComponent; /** * Factory class for {@link StandardWriteFilter}. See that class * for how the estimated sizes are calculated on Strings. */ +@TikaComponent public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { @@ -30,9 +34,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024; public static int DEFAULT_MAX_VALUES_PER_FIELD = 10; - //concrete classes here and in the setters/getters for the sake of Jackson - private HashSet<String> includeFields = new HashSet<>(); - private HashSet<String> excludeFields = new HashSet<>(); + private Set<String> includeFields = new HashSet<>(); + private Set<String> excludeFields = new HashSet<>(); private int maxKeySize = DEFAULT_MAX_KEY_SIZE; private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE; private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES; @@ -58,11 +61,11 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { excludeFields, includeEmpty); } - public void setIncludeFields(HashSet<String> includeFields) { + public void setIncludeFields(Set<String> includeFields) { this.includeFields = new HashSet<>(includeFields); } - public void setExcludeFields(HashSet<String> excludeFields) { + public void setExcludeFields(Set<String> excludeFields) { this.excludeFields = new HashSet<>(excludeFields); } @@ -86,11 +89,11 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { this.maxValuesPerField = maxValuesPerField; } - public HashSet<String> getIncludeFields() { + public Set<String> getIncludeFields() { return includeFields; } - public HashSet<String> getExcludeFields() { + public Set<String> getExcludeFields() { return excludeFields; } diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java b/tika-core/src/main/java/org/apache/tika/parser/Parser.java index 44882883a4..ef5299ba56 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java @@ -24,6 +24,7 @@ import java.util.Set; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -31,7 +32,7 @@ import org.apache.tika.mime.MediaType; /** * Tika parser interface. */ -public interface Parser extends Serializable { +public interface Parser extends Serializable, SelfConfiguring { /** * Returns the set of media types supported by this parser when used diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java index bc4261f521..ff0ad40b63 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Set; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -31,7 +32,7 @@ import org.apache.tika.parser.ParseContext; * but also on portions of PDF pages as well as on other document types. * */ -public interface Renderer extends Serializable { +public interface Renderer extends Serializable, SelfConfiguring { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java index d37f7acb10..56bad7352e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java @@ -17,12 +17,14 @@ package org.apache.tika.parser.digestutils; import org.apache.tika.config.Field; +import org.apache.tika.config.TikaComponent; import org.apache.tika.parser.DigestingParser; /** * Simple factory for {@link CommonsDigester} with * default markLimit = 1000000 and md5 digester. */ +@TikaComponent public class CommonsDigesterFactory implements DigestingParser.DigesterFactory { private int markLimit = 1000000; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index d660a61728..4f14f0cdef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -35,8 +35,8 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.ParseContextConfig; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PDF; @@ -280,7 +280,7 @@ public class TesseractOCRParserTest extends TikaTest { @Test public void testUpdatingConfigs() throws Exception { - ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper(); + ObjectMapper mapper = TikaObjectMapperFactory.getMapper(); // Create default config (simulating parser initialization) TesseractOCRConfig defaultConfig = new TesseractOCRConfig(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index d89beadcfd..7bfd3a44a2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -46,7 +46,7 @@ import org.xml.sax.SAXException; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -589,7 +589,7 @@ public class PDFParserTest extends TikaTest { parseContext.set(PDFParserConfig.class, config); // Serialize using ParseContextSerializer - com.fasterxml.jackson.databind.ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper(); + com.fasterxml.jackson.databind.ObjectMapper mapper = TikaObjectMapperFactory.getMapper(); com.fasterxml.jackson.databind.module.SimpleModule module = new com.fasterxml.jackson.databind.module.SimpleModule(); module.addSerializer(ParseContext.class, new org.apache.tika.serialization.ParseContextSerializer()); module.addDeserializer(ParseContext.class, new org.apache.tika.serialization.ParseContextDeserializer()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json index bab3af07e7..96729b9b30 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json @@ -5,9 +5,10 @@ "maximumPackageEntryDepth": 100, "throwOnZeroBytes": false, "digesterFactory": { - "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory", - "markLimit": 100000, - "algorithmString": "sha256" + "commons-digester-factory": { + "markLimit": 100000, + "algorithmString": "sha256" + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index 34e5248c7c..8472ae4843 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -12,9 +12,10 @@ "spoolToDisk": 1000000, "outputThreshold": 1000000, "digesterFactory": { - "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory", - "markLimit": 100000, - "algorithmString": "sha256:32,md5" + "commons-digester-factory": { + "markLimit": 100000, + "algorithmString": "sha256:32,md5" + } }, "throwOnZeroBytes": false } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json index 5fa5e78975..6332107097 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json @@ -3,10 +3,11 @@ "spoolToDisk": 1000000, "outputThreshold": 1000000, "digesterFactory": { - "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory", - "markLimit": 100000, - "algorithmString": "sha256:32,md5", - "skipContainerDocument": true + "commons-digester-factory": { + "markLimit": 100000, + "algorithmString": "sha256:32,md5", + "skipContainerDocument": true + } }, "throwOnZeroBytes": false } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json index bf12e17d7a..559542f8e1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json @@ -3,9 +3,10 @@ "spoolToDisk": 1000000, "outputThreshold": 1000000, "digesterFactory": { - "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory", - "markLimit": 100000, - "algorithmString": "sha256:32,md5" + "commons-digester-factory": { + "markLimit": 100000, + "algorithmString": "sha256:32,md5" + } }, "throwOnZeroBytes": false } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json index 5511b90b7a..de3fd5b32b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json @@ -2,7 +2,7 @@ "parsers": [ { "default-parser": { - "exclude": ["rar-parser"] + "_exclude": ["rar-parser"] } }, { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json index 69f20d6784..aa268b6c4c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json @@ -1,5 +1,5 @@ { "translator": { - "class": "default-translator" + "default-translator": {} } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json index 4e4b88fcc8..73ad08c224 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json @@ -1,5 +1,5 @@ { "translator": { - "class": "empty-translator" + "empty-translator": {} } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json index 4e4b88fcc8..73ad08c224 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json @@ -1,5 +1,5 @@ { "translator": { - "class": "empty-translator" + "empty-translator": {} } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java index 3093b59b69..c6e7a30af8 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java @@ -27,7 +27,7 @@ import java.util.List; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.utils.StringUtils; @@ -71,7 +71,7 @@ public class PluginsWriter { if (simpleAsyncConfig.getTimeoutMs() != null) { pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs()); } - ObjectMapper objectMapper = PolymorphicObjectMapperFactory.getMapper(); + ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper(); ObjectNode root = (ObjectNode) objectMapper.readTree(json.getBytes(StandardCharsets.UTF_8)); root.set("pipes", objectMapper.valueToTree(pipesConfig)); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java index 603248ea01..02c77d4d28 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java @@ -30,7 +30,7 @@ import org.pf4j.PluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.plugins.ExtensionConfig; @@ -177,7 +177,7 @@ public abstract class AbstractComponentManager<T extends TikaExtension, private static String toJsonString(final JsonNode node) throws TikaConfigException { try { - return PolymorphicObjectMapperFactory.getMapper().writeValueAsString(node); + return TikaObjectMapperFactory.getMapper().writeValueAsString(node); } catch (JsonProcessingException e) { throw new TikaConfigException("Failed to serialize config to JSON string", e); } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java index b2f7a9c62f..e2990abd96 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java @@ -279,7 +279,7 @@ public class EmitterManagerTest { Path configPath = tmpDir.resolve("config.json"); Files.writeString(configPath, configJson, StandardCharsets.UTF_8); - // PolymorphicObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled + // TikaObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled // so duplicate keys are caught during JSON parsing TikaConfigException exception = assertThrows(TikaConfigException.class, () -> { TikaJsonConfig.load(configPath); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java index 192646f0d8..f41fb4a199 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java @@ -275,7 +275,7 @@ public class FetcherManagerTest { Path configPath = tmpDir.resolve("config.json"); Files.writeString(configPath, configJson, StandardCharsets.UTF_8); - // PolymorphicObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled + // TikaObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled // so duplicate keys are caught during JSON parsing TikaConfigException exception = assertThrows(TikaConfigException.class, () -> { TikaJsonConfig.load(configPath); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java index 2f9a66e4cc..93d2a334fd 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java @@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.JsonConfig; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.serialization.ComponentNameResolver; import org.apache.tika.utils.ServiceLoaderUtils; /** @@ -85,6 +86,78 @@ public class ComponentInstantiator { } } + /** + * Instantiates a component from a JsonNode configuration. + * <p> + * Instantiation strategy: + * <ol> + * <li>Try constructor with JsonConfig parameter</li> + * <li>Fall back to Jackson bean deserialization if config is provided</li> + * <li>Fall back to zero-arg constructor if no config</li> + * </ol> + * + * @param componentClass the component class to instantiate + * @param configNode the JSON configuration node (may be null or empty) + * @param objectMapper the Jackson ObjectMapper for deserialization + * @param <T> the component type + * @return the instantiated component + * @throws TikaConfigException if instantiation fails + */ + @SuppressWarnings("unchecked") + public static <T> T instantiate(Class<?> componentClass, + JsonNode configNode, + ObjectMapper objectMapper) + throws TikaConfigException { + try { + // Try JsonConfig constructor first + try { + Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class); + String jsonString = configNode != null ? configNode.toString() : "{}"; + JsonConfig jsonConfig = () -> jsonString; + return (T) constructor.newInstance(jsonConfig); + } catch (NoSuchMethodException e) { + // No JsonConfig constructor, fall back to other methods + } + + // Fall back to Jackson bean deserialization or zero-arg constructor + if (configNode == null || configNode.isEmpty()) { + return (T) componentClass.getDeclaredConstructor().newInstance(); + } + + return (T) objectMapper.treeToValue(configNode, componentClass); + + } catch (Exception e) { + throw new TikaConfigException( + "Failed to instantiate component '" + componentClass.getName() + "': " + e.getMessage(), e); + } + } + + /** + * Instantiates a component by resolving a friendly name or FQCN to a class. + * <p> + * This is a convenience method that combines name resolution with instantiation. + * + * @param typeName the component type name (friendly name like "pdf-parser" or FQCN) + * @param configNode the JSON configuration node (may be null or empty) + * @param objectMapper the Jackson ObjectMapper for deserialization + * @param classLoader the class loader for name resolution + * @param <T> the component type + * @return the instantiated component + * @throws TikaConfigException if instantiation fails or type name is unknown + */ + public static <T> T instantiate(String typeName, + JsonNode configNode, + ObjectMapper objectMapper, + ClassLoader classLoader) + throws TikaConfigException { + try { + Class<?> componentClass = ComponentNameResolver.resolveClass(typeName, classLoader); + return instantiate(componentClass, configNode, objectMapper); + } catch (ClassNotFoundException e) { + throw new TikaConfigException("Unknown component type: '" + typeName + "'", e); + } + } + /** * Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields). * diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java index da5d5f59e2..6b65658abb 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java @@ -28,7 +28,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.JsonConfig; import org.apache.tika.exception.TikaConfigException; /** @@ -47,7 +46,6 @@ public class CompositeComponentLoader<T> { private final Class<T> componentInterface; private final String componentTypeName; - private final String indexFileName; private final ClassLoader classLoader; private final ObjectMapper objectMapper; @@ -56,16 +54,13 @@ public class CompositeComponentLoader<T> { * * @param componentInterface the component interface (e.g., Detector.class) * @param componentTypeName the JSON config key (e.g., "detectors") - * @param indexFileName the index file name (e.g., "detectors") * @param classLoader the class loader * @param objectMapper the Jackson ObjectMapper */ public CompositeComponentLoader(Class<T> componentInterface, String componentTypeName, - String indexFileName, ClassLoader classLoader, - ObjectMapper objectMapper) { + ClassLoader classLoader, ObjectMapper objectMapper) { this.componentInterface = componentInterface; this.componentTypeName = componentTypeName; - this.indexFileName = indexFileName; this.classLoader = classLoader; this.objectMapper = objectMapper; } @@ -103,14 +98,13 @@ public class CompositeComponentLoader<T> { return Collections.emptyList(); } - ComponentRegistry registry = new ComponentRegistry(indexFileName, classLoader); List<T> instances = new ArrayList<>(); for (Map.Entry<String, JsonNode> entry : arrayComponents) { String name = entry.getKey(); JsonNode configNode = entry.getValue(); - T instance = loadComponent(name, configNode, registry); + T instance = deserializeComponent(name, configNode); instances.add(instance); } @@ -129,87 +123,29 @@ public class CompositeComponentLoader<T> { // Load configured components if (config.hasComponents(componentTypeName)) { - ComponentRegistry registry = new ComponentRegistry(indexFileName, classLoader); Map<String, JsonNode> components = config.getComponents(componentTypeName); for (Map.Entry<String, JsonNode> entry : components.entrySet()) { String name = entry.getKey(); JsonNode configNode = entry.getValue(); - T instance = loadConfiguredComponent(name, configNode, registry); + T instance = deserializeComponent(name, configNode); instances.add(instance); } } // Add SPI-discovered components - List<T> spiComponents = loadSpiComponents(); + List<T> spiComponents = loadAllFromSpi(); instances.addAll(spiComponents); return instances; } - private T loadConfiguredComponent(String name, JsonNode configNode, - ComponentRegistry registry) - throws TikaConfigException { - try { - // Get component class - Class<?> componentClass = registry.getComponentClass(name); - - // Extract framework config - FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper); - - // Instantiate component - T instance = instantiateComponent(componentClass, frameworkConfig.getComponentConfigJson()); - - return instance; - - } catch (Exception e) { - throw new TikaConfigException("Failed to load component '" + name + "' of type " + - componentTypeName, e); - } - } - - private T instantiateComponent(Class<?> componentClass, JsonConfig configJson) - throws TikaConfigException { - return ComponentInstantiator.instantiate(componentClass, configJson, classLoader, - componentTypeName, objectMapper); - } - - private List<T> loadSpiComponents() { - List<T> result = new ArrayList<>(); - ServiceLoader<T> serviceLoader = ServiceLoader.load(componentInterface, classLoader); - - Iterator<T> iterator = serviceLoader.iterator(); - while (iterator.hasNext()) { - try { - T instance = iterator.next(); - result.add(instance); - } catch (Exception e) { - // Log and skip problematic SPI providers - LOG.warn("Failed to load SPI component of type {}: {}", componentTypeName, e.getMessage(), e); - } - } - - return result; - } - - private T loadComponent(String name, JsonNode configNode, ComponentRegistry registry) - throws TikaConfigException { - try { - // Get component class - Class<?> componentClass = registry.getComponentClass(name); - - // Wrap JSON string in JsonConfig - String jsonString = objectMapper.writeValueAsString(configNode); - JsonConfig jsonConfig = () -> jsonString; - - // Instantiate component - return instantiateComponent(componentClass, jsonConfig); - - } catch (Exception e) { - throw new TikaConfigException("Failed to load component '" + name + "' of type " + - componentTypeName, e); - } + /** + * Deserializes a component, trying JsonConfig constructor first, then Jackson bean deserialization. + */ + private T deserializeComponent(String name, JsonNode configNode) throws TikaConfigException { + return ComponentInstantiator.instantiate(name, configNode, objectMapper, classLoader); } private List<T> loadAllFromSpi() { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java index bb9e2b6d30..b57aae89ee 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -125,12 +125,14 @@ public class ConfigLoader { /** * Loads a configuration object from the specified JSON key. * <p> - * Supports three formats for interfaces: + * Supports two formats: * <ul> - * <li>String value: treated as class name or component name to look up</li> - * <li>Object with "@class": explicit type specification</li> - * <li>Object without "@class": attempts direct deserialization (works for concrete classes)</li> + * <li>String value: treated as fully qualified class name to instantiate</li> + * <li>Object: deserialized directly into the target class</li> * </ul> + * <p> + * For tier-1 polymorphic types (Parser, Detector, MetadataFilter), use the wrapper + * object format with friendly names: {@code {"pdf-parser": {...}}} * * @param key The JSON key to load from * @param clazz The class to deserialize into (can be interface, abstract, or concrete) @@ -148,14 +150,14 @@ public class ConfigLoader { } try { - // Strategy 1: String value - treat as class name + // Strategy 1: String value - treat as class name (for interfaces) if (node.isTextual()) { return loadFromClassName(node.asText(), clazz); } - // Strategy 2: Let Jackson handle everything else - // Jackson's activateDefaultTyping will automatically handle @class fields - // for interfaces/abstract classes via the PolymorphicObjectMapperFactory configuration + // Strategy 2: Direct deserialization + // For tier-1 types (Parser, Detector, MetadataFilter), mixins handle polymorphism + // For concrete classes, Jackson deserializes directly return objectMapper.treeToValue(node, clazz); } catch (JsonProcessingException e) { throw new TikaConfigException( diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java index 321cf878e5..a277a9b79f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java @@ -29,7 +29,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.JsonConfig; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -72,7 +71,6 @@ public class DetectorLoader { // Load configured detectors if (config.hasComponentSection("detectors")) { List<Detector> detectorList = new ArrayList<>(); - ComponentRegistry registry = new ComponentRegistry("detectors", classLoader); List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors"); // Check if "default-detector" is in the list and extract exclusions @@ -82,41 +80,7 @@ public class DetectorLoader { for (Map.Entry<String, JsonNode> entry : detectors) { if ("default-detector".equals(entry.getKey())) { hasDefaultDetector = true; - - // Parse exclusions from default-detector config - JsonNode configNode = entry.getValue(); - if (configNode != null && configNode.has("_exclude")) { - JsonNode excludeNode = configNode.get("_exclude"); - if (excludeNode.isArray()) { - for (JsonNode excludeName : excludeNode) { - if (excludeName.isTextual()) { - String detectorName = excludeName.asText(); - try { - Class<?> detectorClass; - // Try as component name first - try { - detectorClass = registry.getComponentClass(detectorName); - } catch (TikaConfigException e) { - // If not found as component name, try as FQCN - try { - detectorClass = Class.forName(detectorName, false, classLoader); - } catch (ClassNotFoundException ex) { - LOG.warn("Unknown detector in default-detector exclude list: {}", detectorName); - continue; - } - } - @SuppressWarnings("unchecked") - Class<? extends Detector> detectorTyped = - (Class<? extends Detector>) detectorClass; - excludedDetectorClasses.add(detectorTyped); - LOG.debug("Excluding detector from SPI: {}", detectorName); - } catch (Exception e) { - LOG.warn("Failed to exclude detector '{}': {}", detectorName, e.getMessage()); - } - } - } - } - } + excludedDetectorClasses.addAll(parseExclusions(entry.getValue())); break; } } @@ -133,8 +97,16 @@ public class DetectorLoader { continue; } - JsonNode configNode = entry.getValue(); - Detector detector = loadConfiguredDetector(name, configNode, registry); + // Special case: mime-types requires the initialized registry from TikaLoader + if ("mime-types".equals(name)) { + LOG.debug("Using TikaLoader.getMimeTypes() for mime-types detector"); + detectorList.add(TikaLoader.getMimeTypes()); + configuredDetectorClasses.add(TikaLoader.getMimeTypes().getClass()); + continue; + } + + // Use Jackson with mixins to deserialize - the TypeIdResolver handles name resolution + Detector detector = deserializeDetector(name, entry.getValue()); detectorList.add(detector); @SuppressWarnings("unchecked") Class<? extends Detector> detectorClass = @@ -146,13 +118,10 @@ public class DetectorLoader { configuredDetectorClasses.addAll(excludedDetectorClasses); // Add SPI-discovered detectors only if "default-detector" is in config - // If "default-detector" is present, use SPI fallback for unlisted detectors - // If "default-detector" is NOT present, only load explicitly configured detectors if (hasDefaultDetector) { DefaultDetector defaultDetector = createDefaultDetector(configuredDetectorClasses); LOG.debug("Loading SPI detectors because 'default-detector' is in config"); if (detectorList.isEmpty()) { - //short-circuit return as is if no other detectors are specified return defaultDetector; } detectorList.add(0, defaultDetector); @@ -167,51 +136,51 @@ public class DetectorLoader { } } - private Detector loadConfiguredDetector(String name, JsonNode configNode, - ComponentRegistry registry) - throws TikaConfigException { - try { - // Special case: mime-types requires the initialized registry from TikaLoader - // The no-arg constructor creates an empty MimeTypes without the XML-loaded types - if ("mime-types".equals(name)) { - LOG.debug("Using TikaLoader.getMimeTypes() for mime-types detector"); - return TikaLoader.getMimeTypes(); - } - - // Get detector class - try component name first, then FQCN fallback - Class<?> detectorClass; - try { - detectorClass = registry.getComponentClass(name); - } catch (TikaConfigException e) { - // If not found as component name, try as fully qualified class name - try { - detectorClass = Class.forName(name, false, classLoader); - LOG.debug("Loaded detector by FQCN: {}", name); - } catch (ClassNotFoundException ex) { - throw new TikaConfigException("Unknown detector: '" + name + - "'. Not found as component name or FQCN.", e); - } - } - - // Extract framework config - FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper); + /** + * Deserializes a detector, trying JsonConfig constructor first, then Jackson bean deserialization. + */ + private Detector deserializeDetector(String name, JsonNode configNode) throws TikaConfigException { + return ComponentInstantiator.instantiate(name, configNode, objectMapper, classLoader); + } - // Instantiate detector - Detector detector = instantiateDetector(detectorClass, frameworkConfig.getComponentConfigJson()); + /** + * Parses exclusion list from default-detector config. + */ + @SuppressWarnings("unchecked") + private Set<Class<? extends Detector>> parseExclusions(JsonNode configNode) { + Set<Class<? extends Detector>> excluded = new HashSet<>(); + if (configNode == null || !configNode.has("_exclude")) { + return excluded; + } - return detector; + JsonNode excludeNode = configNode.get("_exclude"); + if (!excludeNode.isArray()) { + return excluded; + } - } catch (TikaConfigException e) { - throw e; - } catch (Exception e) { - throw new TikaConfigException("Failed to load detector '" + name + "'", e); + for (JsonNode excludeName : excludeNode) { + if (!excludeName.isTextual()) { + continue; + } + String detectorName = excludeName.asText(); + try { + // Try to resolve via TypeIdResolver's logic (registry lookup then Class.forName) + Class<?> detectorClass = resolveClass(detectorName); + excluded.add((Class<? extends Detector>) detectorClass); + LOG.debug("Excluding detector from SPI: {}", detectorName); + } catch (Exception e) { + LOG.warn("Unknown detector in exclude list: {}", detectorName); + } } + return excluded; } - private Detector instantiateDetector(Class<?> detectorClass, JsonConfig jsonConfig) - throws TikaConfigException { - return ComponentInstantiator.instantiate(detectorClass, jsonConfig, classLoader, - "Detector", objectMapper); + /** + * Resolves a name to a class, trying friendly name lookup first then FQCN. + */ + private Class<?> resolveClass(String name) throws ClassNotFoundException { + return org.apache.tika.serialization.ComponentNameResolver + .resolveClass(name, classLoader); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java index 66fa71adc8..25668d38eb 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java @@ -29,7 +29,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.JsonConfig; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetector; @@ -72,7 +71,6 @@ public class EncodingDetectorLoader { // Load configured encoding detectors if (config.hasComponentSection("encoding-detectors")) { List<EncodingDetector> detectorList = new ArrayList<>(); - ComponentRegistry registry = new ComponentRegistry("encoding-detectors", classLoader); List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("encoding-detectors"); // Check if "default-encoding-detector" is in the list and extract exclusions @@ -82,41 +80,7 @@ public class EncodingDetectorLoader { for (Map.Entry<String, JsonNode> entry : detectors) { if ("default-encoding-detector".equals(entry.getKey())) { hasDefaultEncodingDetector = true; - - // Parse exclusions from default-encoding-detector config - JsonNode configNode = entry.getValue(); - if (configNode != null && configNode.has("_exclude")) { - JsonNode excludeNode = configNode.get("_exclude"); - if (excludeNode.isArray()) { - for (JsonNode excludeName : excludeNode) { - if (excludeName.isTextual()) { - String detectorName = excludeName.asText(); - try { - Class<?> detectorClass; - // Try as component name first - try { - detectorClass = registry.getComponentClass(detectorName); - } catch (TikaConfigException e) { - // If not found as component name, try as FQCN - try { - detectorClass = Class.forName(detectorName, false, classLoader); - } catch (ClassNotFoundException ex) { - LOG.warn("Unknown encoding detector in default-encoding-detector exclude list: {}", detectorName); - continue; - } - } - @SuppressWarnings("unchecked") - Class<? extends EncodingDetector> detectorTyped = - (Class<? extends EncodingDetector>) detectorClass; - excludedDetectorClasses.add(detectorTyped); - LOG.debug("Excluding encoding detector from SPI: {}", detectorName); - } catch (Exception e) { - LOG.warn("Failed to exclude encoding detector '{}': {}", detectorName, e.getMessage()); - } - } - } - } - } + excludedDetectorClasses.addAll(parseExclusions(entry.getValue())); break; } } @@ -133,8 +97,8 @@ public class EncodingDetectorLoader { continue; } - JsonNode configNode = entry.getValue(); - EncodingDetector detector = loadConfiguredEncodingDetector(name, configNode, registry); + // Use Jackson with mixins to deserialize + EncodingDetector detector = deserializeEncodingDetector(name, entry.getValue()); detectorList.add(detector); @SuppressWarnings("unchecked") Class<? extends EncodingDetector> detectorClass = @@ -146,8 +110,6 @@ public class EncodingDetectorLoader { configuredDetectorClasses.addAll(excludedDetectorClasses); // Add SPI-discovered detectors only if "default-encoding-detector" is in config - // If "default-encoding-detector" is present, use SPI fallback for unlisted detectors - // If "default-encoding-detector" is NOT present, only load explicitly configured detectors if (hasDefaultEncodingDetector) { DefaultEncodingDetector defaultDetector = createDefaultEncodingDetector(configuredDetectorClasses); LOG.debug("Loading SPI encoding detectors because 'default-encoding-detector' is in config"); @@ -166,45 +128,51 @@ public class EncodingDetectorLoader { } } - private EncodingDetector loadConfiguredEncodingDetector(String name, JsonNode configNode, - ComponentRegistry registry) + /** + * Deserializes an encoding detector, trying JsonConfig constructor first, then Jackson bean deserialization. + */ + private EncodingDetector deserializeEncodingDetector(String name, JsonNode configNode) throws TikaConfigException { - try { - // Get encoding detector class - try component name first, then FQCN fallback - Class<?> detectorClass; - try { - detectorClass = registry.getComponentClass(name); - } catch (TikaConfigException e) { - // If not found as component name, try as fully qualified class name - try { - detectorClass = Class.forName(name, false, classLoader); - LOG.debug("Loaded encoding detector by FQCN: {}", name); - } catch (ClassNotFoundException ex) { - throw new TikaConfigException("Unknown encoding detector: '" + name + - "'. Not found as component name or FQCN.", e); - } - } - - // Extract framework config - FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper); + return ComponentInstantiator.instantiate(name, configNode, objectMapper, classLoader); + } - // Instantiate encoding detector - EncodingDetector detector = instantiateEncodingDetector(detectorClass, - frameworkConfig.getComponentConfigJson()); + /** + * Parses exclusion list from default-encoding-detector config. + */ + @SuppressWarnings("unchecked") + private Set<Class<? extends EncodingDetector>> parseExclusions(JsonNode configNode) { + Set<Class<? extends EncodingDetector>> excluded = new HashSet<>(); + if (configNode == null || !configNode.has("_exclude")) { + return excluded; + } - return detector; + JsonNode excludeNode = configNode.get("_exclude"); + if (!excludeNode.isArray()) { + return excluded; + } - } catch (TikaConfigException e) { - throw e; - } catch (Exception e) { - throw new TikaConfigException("Failed to load encoding detector '" + name + "'", e); + for (JsonNode excludeName : excludeNode) { + if (!excludeName.isTextual()) { + continue; + } + String detectorName = excludeName.asText(); + try { + Class<?> detectorClass = resolveClass(detectorName); + excluded.add((Class<? extends EncodingDetector>) detectorClass); + LOG.debug("Excluding encoding detector from SPI: {}", detectorName); + } catch (Exception e) { + LOG.warn("Unknown encoding detector in exclude list: {}", detectorName); + } } + return excluded; } - private EncodingDetector instantiateEncodingDetector(Class<?> detectorClass, JsonConfig jsonConfig) - throws TikaConfigException { - return ComponentInstantiator.instantiate(detectorClass, jsonConfig, classLoader, - "EncodingDetector", objectMapper); + /** + * Resolves a name to a class, trying friendly name lookup first then FQCN. + */ + private Class<?> resolveClass(String name) throws ClassNotFoundException { + return org.apache.tika.serialization.ComponentNameResolver + .resolveClass(name, classLoader); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java index 96a101d34a..34952ee396 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java @@ -44,10 +44,13 @@ public class FrameworkConfig { private final ParserDecoration decoration; private final JsonConfig componentConfigJson; + private final JsonNode componentConfigNode; - private FrameworkConfig(ParserDecoration decoration, JsonConfig componentConfigJson) { + private FrameworkConfig(ParserDecoration decoration, JsonConfig componentConfigJson, + JsonNode componentConfigNode) { this.decoration = decoration; this.componentConfigJson = componentConfigJson; + this.componentConfigNode = componentConfigNode; } /** @@ -63,7 +66,7 @@ public class FrameworkConfig { if (configNode == null || !configNode.isObject()) { String jsonString = objectMapper.writeValueAsString(configNode); JsonConfig jsonConfig = () -> jsonString; - return new FrameworkConfig(null, jsonConfig); + return new FrameworkConfig(null, jsonConfig, configNode); } ObjectNode objNode = (ObjectNode) configNode.deepCopy(); @@ -81,7 +84,7 @@ public class FrameworkConfig { String jsonString = objectMapper.writeValueAsString(objNode); JsonConfig componentConfigJson = () -> jsonString; - return new FrameworkConfig(decoration, componentConfigJson); + return new FrameworkConfig(decoration, componentConfigJson, objNode); } private static List<String> parseStringList(JsonNode node) { @@ -111,6 +114,10 @@ public class FrameworkConfig { return componentConfigJson; } + public JsonNode getComponentConfigNode() { + return componentConfigNode; + } + /** * Parser decoration configuration for mime type filtering. */ diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java index aa19032f93..5d1f60cdc4 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java @@ -16,8 +16,6 @@ */ package org.apache.tika.config.loader; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; @@ -33,7 +31,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.JsonConfig; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.mime.MediaType; @@ -43,7 +40,6 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.RenderingParser; import org.apache.tika.renderer.Renderer; -import org.apache.tika.utils.ServiceLoaderUtils; /** * Loader for parsers with support for decoration (mime type filtering). @@ -95,7 +91,6 @@ public class ParserLoader { // Load configured parsers if (config.hasComponentSection("parsers")) { - ComponentRegistry registry = new ComponentRegistry("parsers", classLoader); List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers"); // Check if "default-parser" is in the list and extract exclusions and decorations @@ -125,23 +120,11 @@ public class ParserLoader { if (excludeName.isTextual()) { String parserName = excludeName.asText(); try { - Class<?> parserClass; - // Try as component name first - try { - parserClass = registry.getComponentClass(parserName); - } catch (TikaConfigException e) { - // If not found as component name, try as FQCN - try { - parserClass = Class.forName(parserName, false, classLoader); - } catch (ClassNotFoundException ex) { - LOG.warn("Unknown parser in default-parser exclude list: {}", parserName); - continue; - } - } + Class<?> parserClass = resolveClass(parserName); excludedParserClasses.add(parserClass); LOG.debug("Excluding parser from SPI: {}", parserName); } catch (Exception e) { - LOG.warn("Failed to exclude parser '{}': {}", parserName, e.getMessage()); + LOG.warn("Unknown parser in default-parser exclude list: {}", parserName); } } } @@ -173,7 +156,7 @@ public class ParserLoader { } JsonNode configNode = entry.getValue(); - ParsedParserConfig parsed = loadConfiguredParser(name, configNode, registry); + ParsedParserConfig parsed = loadConfiguredParser(name, configNode); parsedConfigs.put(name, parsed); } @@ -229,30 +212,24 @@ public class ParserLoader { return new CompositeParser(TikaLoader.getMediaTypeRegistry(), parserList); } - private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode, - ComponentRegistry registry) + private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode) throws TikaConfigException { try { - // Get parser class - try component name first, then FQCN fallback - Class<?> parserClass; - try { - parserClass = registry.getComponentClass(name); - } catch (TikaConfigException e) { - // If not found as component name, try as fully qualified class name - try { - parserClass = Class.forName(name, false, classLoader); - LOG.debug("Loaded parser by FQCN: {}", name); - } catch (ClassNotFoundException ex) { - throw new TikaConfigException("Unknown parser: '" + name + - "'. Not found as component name or FQCN.", e); - } - } - - // Extract framework config + // Extract framework config (decorations like mimeInclude/mimeExclude) FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper); - // Instantiate parser - Parser parser = instantiateParser(parserClass, frameworkConfig.getComponentConfigJson()); + // Use Jackson with mixins to deserialize - the TypeIdResolver handles name resolution + Parser parser = deserializeParser(name, frameworkConfig.getComponentConfigNode()); + + // Post-process: inject EncodingDetector for AbstractEncodingDetectorParser + if (parser instanceof AbstractEncodingDetectorParser) { + ((AbstractEncodingDetectorParser) parser).setEncodingDetector(encodingDetector); + } + + // Post-process: inject Renderer for RenderingParser + if (parser instanceof RenderingParser && renderer != null) { + ((RenderingParser) parser).setRenderer(renderer); + } return new ParsedParserConfig(name, parser, frameworkConfig.getDecoration()); @@ -263,53 +240,19 @@ public class ParserLoader { } } - @SuppressWarnings("unchecked") - private Parser instantiateParser(Class<?> parserClass, JsonConfig jsonConfig) - throws TikaConfigException { - - try { - Parser parser; - - // Try constructor with JsonConfig parameter - try { - Constructor<?> constructor = parserClass.getConstructor(JsonConfig.class); - parser = (Parser) constructor.newInstance(jsonConfig); - } catch (NoSuchMethodException e) { - // Check if JSON config has actual configuration - if (ComponentInstantiator.hasConfiguration(jsonConfig, objectMapper)) { - throw new TikaConfigException( - "Parser '" + parserClass.getName() + "' has configuration in JSON, " + - "but does not have a constructor that accepts JsonConfig. " + - "Please add a constructor: public " + parserClass.getSimpleName() + "(JsonConfig jsonConfig)"); - } - - // Try constructor with EncodingDetector parameter (for AbstractEncodingDetectorParser) - if (AbstractEncodingDetectorParser.class.isAssignableFrom(parserClass)) { - try { - Constructor<?> constructor = parserClass.getConstructor(EncodingDetector.class); - parser = (Parser) constructor.newInstance(encodingDetector); - } catch (NoSuchMethodException ex) { - // Fall back to zero-arg constructor - parser = (Parser) ServiceLoaderUtils.newInstance(parserClass, - new org.apache.tika.config.ServiceLoader(classLoader)); - } - } else { - // Fall back to zero-arg constructor - parser = (Parser) ServiceLoaderUtils.newInstance(parserClass, - new org.apache.tika.config.ServiceLoader(classLoader)); - } - } - - // Inject renderer for RenderingParser instances - if (parser instanceof RenderingParser && renderer != null) { - ((RenderingParser) parser).setRenderer(renderer); - } + /** + * Deserializes a parser, trying JsonConfig constructor first, then Jackson bean deserialization. + */ + private Parser deserializeParser(String name, JsonNode configNode) throws TikaConfigException { + return ComponentInstantiator.instantiate(name, configNode, objectMapper, classLoader); + } - return parser; - } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("Failed to instantiate parser: " + - parserClass.getName(), e); - } + /** + * Resolves a name to a class, trying friendly name lookup first then FQCN. + */ + private Class<?> resolveClass(String name) throws ClassNotFoundException { + return org.apache.tika.serialization.ComponentNameResolver + .resolveClass(name, classLoader); } private Parser applyMimeFiltering(Parser parser, FrameworkConfig.ParserDecoration decoration) { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java deleted file mode 100644 index b920d7cd0a..0000000000 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.config.loader; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Enumeration; -import java.util.List; - -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; -import com.fasterxml.jackson.databind.jsontype.BasicPolymorphicTypeValidator; -import com.fasterxml.jackson.databind.jsontype.PolymorphicTypeValidator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Factory for creating ObjectMappers with consistent polymorphic type handling - * across Tika configuration and ParseContext serialization. - */ -public class PolymorphicObjectMapperFactory { - - private static final Logger LOG = LoggerFactory.getLogger(PolymorphicObjectMapperFactory.class); - - /** - * Classpath resource file where users can specify additional package prefixes - * to allow for polymorphic deserialization. One package prefix per line. - * Comments (lines starting with #) and blank lines are ignored. - * - * Example content: - * <pre> - * # Allow com.acme classes - * com.acme - * # Allow com.example classes - * com.example - * </pre> - */ - public static final String ALLOWED_PACKAGES_RESOURCE = "META-INF/tika-serialization-allowlist.txt"; - - private static ObjectMapper MAPPER = null; - - public static synchronized ObjectMapper getMapper() { - if (MAPPER == null) { - MAPPER = createPolymorphicMapper(); - } - return MAPPER; - } - - /** - * Creates an ObjectMapper with polymorphic type handling for Tika configuration. - * Configures security validation to allow Tika classes and any additional - * packages specified via {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath. - * - * @return configured ObjectMapper - */ - public static ObjectMapper createPolymorphicMapper() { - ObjectMapper mapper = new ObjectMapper(); - - // Fail on unknown properties to catch configuration errors early - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true); - - // Prevent null values being assigned to primitive fields (int, boolean, etc.) - mapper.configure(DeserializationFeature.FAIL_ON_NULL_FOR_PRIMITIVES, true); - - // Ensure enums are properly validated (not just numeric values) - mapper.configure(DeserializationFeature.FAIL_ON_NUMBERS_FOR_ENUMS, true); - - // Catch duplicate keys in JSON objects - mapper.configure(DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY, true); - - //Need to allow creation of classes without setters/getters -- we may want to revisit this - mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); - - // Build polymorphic type validator - BasicPolymorphicTypeValidator.Builder builder = BasicPolymorphicTypeValidator.builder() - .allowIfSubType("org.apache.tika.") - .allowIfSubType("java.util.") - .allowIfSubType("java.nio.file."); - - // Add user-specified packages from classpath - List<String> additionalPackages = loadAllowedPackages(); - for (String packagePrefix : additionalPackages) { - builder.allowIfSubType(packagePrefix); - } - - PolymorphicTypeValidator typeValidator = builder.build(); - - // Use OBJECT_AND_NON_CONCRETE to add type info when static type is: - // - Object.class (for objects in maps) - // - Abstract classes or interfaces (for polymorphic fields) - mapper.activateDefaultTyping(typeValidator, ObjectMapper.DefaultTyping.OBJECT_AND_NON_CONCRETE, JsonTypeInfo.As.PROPERTY); - - return mapper; - } - - /** - * Loads additional package prefixes from classpath resources. - * Scans all {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath. - * - * @return list of additional package prefixes to allow - */ - private static List<String> loadAllowedPackages() { - List<String> packages = new ArrayList<>(); - try { - Enumeration<URL> resources = PolymorphicObjectMapperFactory.class.getClassLoader() - .getResources(ALLOWED_PACKAGES_RESOURCE); - - while (resources.hasMoreElements()) { - URL resource = resources.nextElement(); - LOG.debug("Loading allowed packages from: {}", resource); - - try (InputStream is = resource.openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { - - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - // Skip comments and empty lines - if (line.isEmpty() || line.startsWith("#")) { - continue; - } - packages.add(line); - LOG.info("Allowing polymorphic deserialization for package: {}", line); - } - } catch (IOException e) { - LOG.warn("Failed to read allowed packages from: {}", resource, e); - } - } - } catch (IOException e) { - LOG.warn("Failed to load allowed packages resources", e); - } - return packages; - } -} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index 06da0f3175..2eeb8bc7a2 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -126,7 +126,7 @@ public class TikaJsonConfig { ); private static final ObjectMapper OBJECT_MAPPER = - PolymorphicObjectMapperFactory.getMapper(); + TikaObjectMapperFactory.getMapper(); private final JsonNode rootNode; private final Map<String, Map<String, JsonNode>> componentsByType; diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 01aa21e0f6..acf7be7161 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -92,7 +92,7 @@ public class TikaLoader { private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) { this.config = config; this.classLoader = classLoader; - this.objectMapper = PolymorphicObjectMapperFactory.getMapper(); + this.objectMapper = TikaObjectMapperFactory.getMapper(); } /** @@ -250,8 +250,7 @@ public class TikaLoader { if (config.hasComponentSection("metadata-filters")) { // Load explicitly configured filters (no SPI fallback) CompositeComponentLoader<MetadataFilter> loader = new CompositeComponentLoader<>( - MetadataFilter.class, "metadata-filters", "metadata-filters", - classLoader, objectMapper); + MetadataFilter.class, "metadata-filters", classLoader, objectMapper); filterList = loader.loadFromArray(config); } else { // No config section - metadata filters are opt-in only, don't load from SPI @@ -278,7 +277,7 @@ public class TikaLoader { public synchronized Renderer loadRenderers() throws TikaConfigException { if (renderers == null) { CompositeComponentLoader<Renderer> loader = new CompositeComponentLoader<>( - Renderer.class, "renderers", "renderers", classLoader, objectMapper); + Renderer.class, "renderers", classLoader, objectMapper); List<Renderer> rendererList = loader.loadFromArray(config); renderers = new CompositeRenderer(rendererList); } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java new file mode 100644 index 0000000000..b45a20afc4 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.serialization.ComponentNameResolver; +import org.apache.tika.serialization.TikaAbstractTypeMixins; + +/** + * Factory for creating ObjectMappers configured for Tika serialization. + * <p> + * Configures strict validation settings and loads component registries + * for friendly name resolution. + */ +public class TikaObjectMapperFactory { + + private static final Logger LOG = LoggerFactory.getLogger(TikaObjectMapperFactory.class); + + /** + * Index file names for component registries. + */ + private static final String[] REGISTRY_INDEX_FILES = { + "parsers", + "detectors", + "encoding-detectors", + "metadata-filters", + "renderers", + "translators", + "other-configs" + }; + + private static ObjectMapper MAPPER = null; + + public static synchronized ObjectMapper getMapper() { + if (MAPPER == null) { + MAPPER = createMapper(); + } + return MAPPER; + } + + /** + * Creates an ObjectMapper configured for Tika serialization. + * + * @return configured ObjectMapper + */ + public static ObjectMapper createMapper() { + ObjectMapper mapper = new ObjectMapper(); + + // Fail on unknown properties to catch configuration errors early + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true); + + // Prevent null values being assigned to primitive fields (int, boolean, etc.) + mapper.configure(DeserializationFeature.FAIL_ON_NULL_FOR_PRIMITIVES, true); + + // Ensure enums are properly validated (not just numeric values) + mapper.configure(DeserializationFeature.FAIL_ON_NUMBERS_FOR_ENUMS, true); + + // Catch duplicate keys in JSON objects + mapper.configure(DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY, true); + + // Need to allow creation of classes without setters/getters -- we may want to revisit this + mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); + + // Load component registries for name resolution + loadComponentRegistries(); + + // Register deserializers for abstract types using wrapper object format + TikaAbstractTypeMixins.registerDeserializers(mapper); + + return mapper; + } + + /** + * Loads component registries for name resolution. + * Registries are loaded from index files and registered with the ComponentNameResolver. + * Missing registries are silently ignored (may not be on classpath). + */ + private static void loadComponentRegistries() { + ClassLoader classLoader = TikaObjectMapperFactory.class.getClassLoader(); + + for (String indexFile : REGISTRY_INDEX_FILES) { + try { + ComponentRegistry registry = new ComponentRegistry(indexFile, classLoader); + ComponentNameResolver.registerRegistry(indexFile, registry); + LOG.debug("Loaded component registry: {}", indexFile); + } catch (TikaConfigException e) { + // Registry not available - this is expected if the module isn't on classpath + LOG.debug("Component registry not available: {} - {}", indexFile, e.getMessage()); + } + } + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java index b84905e2c3..55b7f745a7 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java @@ -16,13 +16,14 @@ */ package org.apache.tika.config.loader; +import java.util.Iterator; +import java.util.Map; + import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ObjectNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.JsonConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; @@ -30,6 +31,17 @@ import org.apache.tika.language.translate.Translator; /** * Loader for translators. * Only one translator is supported at a time. + * <p> + * JSON format uses wrapper object style: + * <pre> + * { + * "translator": { + * "google-translator": { + * "apiKey": "..." + * } + * } + * } + * </pre> */ public class TranslatorLoader { @@ -66,35 +78,32 @@ public class TranslatorLoader { private Translator loadConfiguredTranslator(JsonNode translatorNode) throws TikaConfigException { - try { - // The translator node should be an object with a "class" field - if (!translatorNode.has("class")) { - throw new TikaConfigException("Translator configuration must have a 'class' field"); - } - - String className = translatorNode.get("class").asText(); - ComponentRegistry registry = new ComponentRegistry("translators", classLoader); - Class<?> translatorClass = registry.getComponentClass(className); + if (!translatorNode.isObject() || translatorNode.isEmpty()) { + throw new TikaConfigException( + "Translator configuration must be an object with translator type as key"); + } - // Remove "class" field from config before extraction - ObjectNode configCopy = ((ObjectNode) translatorNode).deepCopy(); - configCopy.remove("class"); + // Get the single field name (translator type) and its config + Iterator<Map.Entry<String, JsonNode>> fields = translatorNode.fields(); + Map.Entry<String, JsonNode> entry = fields.next(); - // Extract framework config (e.g., _decorate if present) - FrameworkConfig frameworkConfig = FrameworkConfig.extract(configCopy, objectMapper); + if (fields.hasNext()) { + throw new TikaConfigException( + "Translator configuration must have exactly one translator type"); + } - // Instantiate translator - return instantiateTranslator(translatorClass, frameworkConfig.getComponentConfigJson()); + String typeName = entry.getKey(); + JsonNode configNode = entry.getValue(); - } catch (Exception e) { - throw new TikaConfigException("Failed to load translator", e); - } + return deserializeTranslator(typeName, configNode); } - private Translator instantiateTranslator(Class<?> translatorClass, JsonConfig jsonConfig) + /** + * Deserializes a translator, trying JsonConfig constructor first, then Jackson bean deserialization. + */ + private Translator deserializeTranslator(String name, JsonNode configNode) throws TikaConfigException { - return ComponentInstantiator.instantiate(translatorClass, jsonConfig, classLoader, - "Translator", objectMapper); + return ComponentInstantiator.instantiate(name, configNode, objectMapper, classLoader); } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java new file mode 100644 index 0000000000..739ed9944b --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.serialization; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.tika.config.loader.ComponentRegistry; +import org.apache.tika.exception.TikaConfigException; + +/** + * Utility class that resolves friendly component names to classes using ComponentRegistry. + * <p> + * Supports friendly names like "pdf-parser" as well as fully qualified class names. + * Registries must be registered via {@link #registerRegistry(String, ComponentRegistry)} + * before use. + */ +public final class ComponentNameResolver { + + private static final Map<String, ComponentRegistry> REGISTRIES = new ConcurrentHashMap<>(); + + private ComponentNameResolver() { + // Utility class + } + + /** + * Registers a ComponentRegistry for name resolution. + * + * @param indexName the index file name (e.g., "parsers", "detectors") + * @param registry the registry to register + */ + public static void registerRegistry(String indexName, ComponentRegistry registry) { + REGISTRIES.put(indexName, registry); + } + + /** + * Resolves a friendly name or FQCN to a Class. + * Searches all registered component registries, falling back to Class.forName. + * + * @param name friendly name or fully qualified class name + * @param classLoader the class loader to use for FQCN fallback + * @return the resolved class + * @throws ClassNotFoundException if not found in any registry and not a valid FQCN + */ + public static Class<?> resolveClass(String name, ClassLoader classLoader) + throws ClassNotFoundException { + for (ComponentRegistry registry : REGISTRIES.values()) { + if (registry.hasComponent(name)) { + try { + return registry.getComponentClass(name); + } catch (TikaConfigException e) { + // continue to next registry + } + } + } + return Class.forName(name, false, classLoader); + } + + /** + * Gets the friendly name for a class, or null if not registered. + * + * @param clazz the class to look up + * @return the friendly name, or null if not found + */ + public static String getFriendlyName(Class<?> clazz) { + for (ComponentRegistry registry : REGISTRIES.values()) { + String friendlyName = registry.getFriendlyName(clazz); + if (friendlyName != null) { + return friendlyName; + } + } + return null; + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java index c8feb39919..507a182110 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java @@ -23,7 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.JsonConfig; import org.apache.tika.config.loader.JsonMergeUtils; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.parser.ParseContext; /** @@ -52,7 +52,7 @@ import org.apache.tika.parser.ParseContext; */ public class ConfigDeserializer { - private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); /** * Retrieves and deserializes a parser configuration from the ConfigContainer in ParseContext. diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index 518bffd160..437bca9332 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -31,7 +31,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.SelfConfiguring; +import org.apache.tika.config.loader.ComponentInfo; +import org.apache.tika.config.loader.ComponentRegistry; +import org.apache.tika.config.loader.TikaObjectMapperFactory; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; /** @@ -44,15 +48,33 @@ import org.apache.tika.parser.ParseContext; * <pre> * { * "pdf-parser": {"extractActions": true}, - * "tika-task-timeout": {"timeoutMillis": 5000}, - * "org.apache.tika.metadata.filter.MetadataFilter": {"@class": "...", ...} + * "tika-task-timeout": {"timeoutMillis": 5000} * } * </pre> */ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { private static final Logger LOG = LoggerFactory.getLogger(ParseContextDeserializer.class); - private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); + + // Lazily loaded registry for looking up friendly names + private static volatile ComponentRegistry registry; + + private static ComponentRegistry getRegistry() { + if (registry == null) { + synchronized (ParseContextDeserializer.class) { + if (registry == null) { + try { + registry = new ComponentRegistry("other-configs", + ParseContextDeserializer.class.getClassLoader()); + } catch (TikaConfigException e) { + LOG.warn("Failed to load component registry for deserialization", e); + } + } + } + } + return registry; + } @Override public ParseContext deserialize(JsonParser jsonParser, @@ -93,18 +115,77 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { String fieldName = it.next(); JsonNode fieldValue = contextNode.get(fieldName); + // Try to resolve fieldName - either as FQCN or friendly name from registry + Class<?> keyClass = null; + // Check if fieldName is a full class name (for directly serialized Tika types) if (fieldName.startsWith("org.apache.tika.")) { try { - Class<?> keyClass = Class.forName(fieldName); - // Deserialize using the key class as the target type - Object value = MAPPER.treeToValue(fieldValue, keyClass); - parseContext.set((Class) keyClass, value); - continue; + keyClass = Class.forName(fieldName); } catch (ClassNotFoundException e) { - LOG.debug("Class not found for key '{}', storing in ConfigContainer", fieldName); - } catch (Exception e) { - throw new IOException("Failed to deserialize '" + fieldName + "': " + e.getMessage(), e); + LOG.debug("Class not found for key '{}', will check registry", fieldName); + } + } + + // If not found as FQCN, check registry for friendly name + boolean isSelfConfiguring = false; + Class<?> contextKey = null; // The key to use when adding to ParseContext + if (keyClass == null) { + ComponentRegistry reg = getRegistry(); + if (reg != null && reg.hasComponent(fieldName)) { + try { + ComponentInfo info = reg.getComponentInfo(fieldName); + keyClass = info.componentClass(); + isSelfConfiguring = info.selfConfiguring(); + contextKey = info.contextKey(); + LOG.debug("Resolved friendly name '{}' to class {} (selfConfiguring={}, contextKey={})", + fieldName, keyClass.getName(), isSelfConfiguring, + contextKey != null ? contextKey.getName() : "null"); + } catch (TikaConfigException e) { + LOG.debug("Failed to get component info for '{}': {}", fieldName, e.getMessage()); + } + } + } else { + // For FQCN resolution, check SelfConfiguring directly + isSelfConfiguring = SelfConfiguring.class.isAssignableFrom(keyClass); + } + + // If we found a class, check if it's SelfConfiguring + if (keyClass != null) { + // SelfConfiguring components (Parsers, Detectors, etc.) handle their own config + // at runtime - keep their config in ConfigContainer for later access + if (isSelfConfiguring) { + LOG.debug("'{}' maps to SelfConfiguring class {}, keeping in ConfigContainer", + fieldName, keyClass.getName()); + // Fall through to ConfigContainer storage below + } else { + // Non-SelfConfiguring - deserialize directly into ParseContext + try { + // Check if fieldValue is a wrapper object format: {"concrete-class": {props}} + Object value; + if (fieldValue.isObject() && fieldValue.size() == 1) { + String typeName = fieldValue.fieldNames().next(); + JsonNode configNode = fieldValue.get(typeName); + // Try to resolve the concrete class + try { + Class<?> concreteClass = ComponentNameResolver.resolveClass(typeName, + ParseContextDeserializer.class.getClassLoader()); + value = MAPPER.treeToValue(configNode, concreteClass); + } catch (ClassNotFoundException ex) { + // Fall back to key class + value = MAPPER.treeToValue(configNode, keyClass); + } + } else { + // Not wrapper format, deserialize directly + value = MAPPER.treeToValue(fieldValue, keyClass); + } + // Use contextKey if specified, otherwise use the component class + Class<?> parseContextKey = (contextKey != null) ? contextKey : keyClass; + parseContext.set((Class) parseContextKey, value); + continue; + } catch (Exception e) { + throw new IOException("Failed to deserialize '" + fieldName + "': " + e.getMessage(), e); + } } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java index a8df312dc6..bca2ef54a4 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java @@ -30,7 +30,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.loader.ComponentRegistry; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; @@ -57,7 +57,7 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { private static final Logger LOG = LoggerFactory.getLogger(ParseContextSerializer.class); public static final String PARSE_CONTEXT = "parseContext"; - private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); // Lazily loaded registry for looking up friendly names private static volatile ComponentRegistry registry; @@ -114,13 +114,13 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { // Try to get friendly name for this object's class String friendlyName = (reg != null) ? reg.getFriendlyName(value.getClass()) : null; + // Determine key: prefer friendly name, fall back to FQCN for Tika types String key; if (friendlyName != null) { - // Use friendly name if available + // Use friendly name if available (deserializer will resolve via registry) key = friendlyName; } else if (entry.getKey().startsWith("org.apache.tika.")) { - // For Tika types without friendly names (e.g., custom MetadataFilter subclasses), - // use the context key - polymorphic mapper will add @class for the concrete type + // For Tika types without friendly names, use the context key (FQCN) key = entry.getKey(); } else { // Skip non-Tika types without friendly names (e.g., String, custom non-Tika classes) @@ -129,19 +129,17 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { if (!writtenKeys.contains(key)) { jsonGenerator.writeFieldName(key); - // If using the context key (not friendly name), we need to serialize - // with the base type to get polymorphic @class info for custom subclasses - if (friendlyName == null) { - try { - Class<?> contextKeyClass = Class.forName(entry.getKey()); - MAPPER.writerFor(contextKeyClass).writeValue(jsonGenerator, value); - } catch (ClassNotFoundException e) { - // Fallback to default serialization - MAPPER.writeValue(jsonGenerator, value); - } - } else { - MAPPER.writeValue(jsonGenerator, value); + // Write wrapper object format with type info for polymorphic deserialization + // Format: {"concrete-class-name": {properties...}} + jsonGenerator.writeStartObject(); + String typeName = (friendlyName != null) ? friendlyName : + ComponentNameResolver.getFriendlyName(value.getClass()); + if (typeName == null) { + typeName = value.getClass().getName(); } + jsonGenerator.writeFieldName(typeName); + MAPPER.writeValue(jsonGenerator, value); + jsonGenerator.writeEndObject(); writtenKeys.add(key); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java index d06b63f58e..354e8d1781 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -28,7 +28,7 @@ import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.JsonConfig; import org.apache.tika.config.loader.ComponentInfo; import org.apache.tika.config.loader.ComponentRegistry; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; @@ -36,22 +36,14 @@ import org.apache.tika.parser.ParseContext; /** * Utility methods for working with ParseContext objects in JSON-based configurations. * <p> - * Supports both legacy verbose format and new friendly-name format: + * Uses friendly-name format for configuration: * <pre> - * // Legacy format: - * "parse-context": { - * "objects": { - * "org.apache.tika.config.TikaTaskTimeout": { - * "@class": "org.apache.tika.config.TikaTaskTimeout", - * "timeoutMillis": 30000 - * } - * } - * } - * - * // New friendly-name format: * "parse-context": { * "tika-task-timeout": { * "timeoutMillis": 30000 + * }, + * "pdf-parser": { + * "extractInlineImages": true * } * } * </pre> @@ -62,7 +54,7 @@ import org.apache.tika.parser.ParseContext; public class ParseContextUtils { private static final Logger LOG = LoggerFactory.getLogger(ParseContextUtils.class); - private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); /** * Known interfaces that should be used as ParseContext keys. diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java new file mode 100644 index 0000000000..f6b1555dec --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.serialization; + +import java.io.IOException; +import java.lang.reflect.Modifier; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.BeanDescription; +import com.fasterxml.jackson.databind.DeserializationConfig; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.deser.BeanDeserializerModifier; +import com.fasterxml.jackson.databind.module.SimpleModule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.loader.ComponentInstantiator; +import org.apache.tika.exception.TikaConfigException; + +/** + * Jackson module that handles deserialization of abstract types using wrapper object format. + * <p> + * Automatically applies to ANY abstract type (interface or abstract class) without + * requiring hardcoded type lists. Supports both formats: + * <ul> + * <li>Wrapper format: {@code {"type-name": {"prop": "value"}}}</li> + * <li>Legacy @class format: {@code {"@class": "fqcn", "prop": "value"}}</li> + * </ul> + * <p> + * Example: + * <pre> + * "digesterFactory": { + * "commons-digester-factory": { + * "markLimit": 100000 + * } + * } + * </pre> + */ +public final class TikaAbstractTypeMixins { + + private static final Logger LOG = LoggerFactory.getLogger(TikaAbstractTypeMixins.class); + + private TikaAbstractTypeMixins() { + // Utility class + } + + /** + * Registers the abstract type handling module on the given ObjectMapper. + * + * @param mapper the ObjectMapper to configure + */ + public static void registerDeserializers(ObjectMapper mapper) { + SimpleModule module = new SimpleModule("TikaAbstractTypes"); + module.setDeserializerModifier(new AbstractTypeDeserializerModifier(mapper)); + mapper.registerModule(module); + } + + /** + * Modifier that intercepts deserialization of abstract types and applies + * wrapper object handling. + */ + private static class AbstractTypeDeserializerModifier extends BeanDeserializerModifier { + + private final ObjectMapper mapper; + + AbstractTypeDeserializerModifier(ObjectMapper mapper) { + this.mapper = mapper; + } + + @Override + public JsonDeserializer<?> modifyDeserializer(DeserializationConfig config, + BeanDescription beanDesc, + JsonDeserializer<?> deserializer) { + Class<?> beanClass = beanDesc.getBeanClass(); + + // Skip types that shouldn't use wrapper format + if (shouldSkip(beanClass)) { + return deserializer; + } + + // Only handle abstract types (interfaces or abstract classes) + if (beanClass.isInterface() || Modifier.isAbstract(beanClass.getModifiers())) { + LOG.debug("Registering wrapper deserializer for abstract type: {}", + beanClass.getName()); + return new WrapperObjectDeserializer<>(beanClass, mapper); + } + + return deserializer; + } + + private boolean shouldSkip(Class<?> beanClass) { + // Skip primitives and their wrappers + if (beanClass.isPrimitive()) { + return true; + } + + // Skip common JDK types + String name = beanClass.getName(); + if (name.startsWith("java.") || name.startsWith("javax.")) { + return true; + } + + // Skip arrays + if (beanClass.isArray()) { + return true; + } + + return false; + } + } + + /** + * Deserializer that handles wrapper object format for abstract types. + */ + private static class WrapperObjectDeserializer<T> extends JsonDeserializer<T> { + + private final Class<?> abstractType; + private final ObjectMapper mapper; + + WrapperObjectDeserializer(Class<?> abstractType, ObjectMapper mapper) { + this.abstractType = abstractType; + this.mapper = mapper; + } + + @Override + @SuppressWarnings("unchecked") + public T deserialize(JsonParser p, DeserializationContext ctxt) throws IOException { + JsonNode node = p.readValueAsTree(); + + if (!node.isObject() || node.isEmpty()) { + // Let Jackson's default handling fail appropriately + return (T) ctxt.handleUnexpectedToken(abstractType, p); + } + + // Check for legacy "@class" format + if (node.has("@class")) { + String typeName = node.get("@class").asText(); + // Create config node without @class + com.fasterxml.jackson.databind.node.ObjectNode configObj = + mapper.createObjectNode(); + node.fields().forEachRemaining(entry -> { + if (!"@class".equals(entry.getKey())) { + configObj.set(entry.getKey(), entry.getValue()); + } + }); + return instantiateType(typeName, configObj, ctxt); + } + + // Check for wrapper format: single field whose value is an object + // e.g., {"commons-digester-factory": {"markLimit": 100000}} + if (node.size() == 1) { + String typeName = node.fieldNames().next(); + JsonNode configNode = node.get(typeName); + // Only treat as wrapper if the value is an object (not primitive/array) + if (configNode.isObject()) { + return instantiateType(typeName, configNode, ctxt); + } + } + + // Not wrapper format - this is likely an error (can't instantiate abstract type) + // Throw JsonMappingException so ConfigLoader wraps it in TikaConfigException + throw JsonMappingException.from(p, + "Cannot deserialize abstract type " + abstractType.getSimpleName() + + ". Use wrapper format: {\"concrete-type-name\": {...}} or " + + "legacy format: {\"@class\": \"fully.qualified.ClassName\", ...}"); + } + + private T instantiateType(String typeName, JsonNode configNode, + DeserializationContext ctxt) throws IOException { + try { + Class<?> concreteClass = ComponentNameResolver.resolveClass(typeName, + TikaAbstractTypeMixins.class.getClassLoader()); + return ComponentInstantiator.instantiate(concreteClass, configNode, mapper); + } catch (ClassNotFoundException e) { + throw JsonMappingException.from(ctxt.getParser(), + "Unknown type '" + typeName + "' for " + abstractType.getSimpleName()); + } catch (TikaConfigException e) { + throw JsonMappingException.from(ctxt.getParser(), + "Failed to instantiate " + typeName + ": " + e.getMessage()); + } + } + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java index 96b58dac06..80063b151d 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java @@ -252,22 +252,22 @@ public class ConfigLoaderTest { } @Test - public void testLoadInterfaceWithAtClassAndProperties() throws Exception { - // JSON: "configured-handler": { "@class": "...", "maxSize": 100000, ... } - TestHandler handler = configLoader.load("configured-handler", TestHandler.class); - - assertNotNull(handler); - assertTrue(handler instanceof ConfiguredHandlerImpl); - assertEquals("configured", handler.getName()); - - ConfiguredHandlerImpl impl = (ConfiguredHandlerImpl) handler; + public void testLoadConcreteClassWithProperties() throws Exception { + // JSON: "configured-handler-impl": { "maxSize": 100000, ... } + // Load directly as concrete class (kebab-case matches class name) + ConfiguredHandlerImpl impl = configLoader.load("configured-handler-impl", + ConfiguredHandlerImpl.class); + + assertNotNull(impl); + assertEquals("configured", impl.getName()); assertEquals(100000, impl.getMaxSize()); assertEquals("test-", impl.getPrefix()); } @Test - public void testLoadInterfaceWithoutTypeInfoFails() throws Exception { - // Create a minimal config with just properties, no @class + public void testLoadInterfaceWithoutClassNameFails() throws Exception { + // Loading an interface with properties (not a class name string) should fail + // because Jackson can't instantiate interfaces directly Path configPath = Paths.get( getClass().getResource("/configs/test-interface-no-type.json").toURI()); TikaLoader loader = TikaLoader.load(configPath); diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java index 2515b3b29a..bd530a6395 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java @@ -29,7 +29,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; import org.junit.jupiter.api.Test; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.loader.TikaObjectMapperFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; @@ -104,7 +104,7 @@ public class CustomClassSerializationTest { } private ObjectMapper createMapper() { - ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper(); + ObjectMapper mapper = TikaObjectMapperFactory.getMapper(); SimpleModule module = new SimpleModule(); module.addDeserializer(ParseContext.class, new ParseContextDeserializer()); module.addSerializer(ParseContext.class, new ParseContextSerializer()); diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index fdd1ecbff5..0c9c457b52 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -31,7 +31,10 @@ import com.fasterxml.jackson.databind.module.SimpleModule; import org.junit.jupiter.api.Test; import org.apache.tika.config.ConfigContainer; -import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.config.TikaTaskTimeout; +import org.apache.tika.config.loader.TikaObjectMapperFactory; +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.extractor.SkipEmbeddedDocumentSelector; import org.apache.tika.parser.ParseContext; /** @@ -44,7 +47,7 @@ public class TestParseContextSerialization { private ObjectMapper createMapper() { // Start with the properly configured mapper that has polymorphic type handling - ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper(); + ObjectMapper mapper = TikaObjectMapperFactory.getMapper(); // Register our custom serializer/deserializer on top SimpleModule module = new SimpleModule(); @@ -144,12 +147,12 @@ public class TestParseContextSerialization { .get("timeoutMillis") .asInt()); - // Verify round-trip + // Verify round-trip - TikaTaskTimeout is NOT SelfConfiguring, + // so it gets resolved directly into ParseContext (not ConfigContainer) ParseContext deserialized = mapper.readValue(json, ParseContext.class); - ConfigContainer deserializedConfig = deserialized.get(ConfigContainer.class); - assertTrue(deserializedConfig - .get("tika-task-timeout") - .isPresent()); + TikaTaskTimeout timeout = deserialized.get(TikaTaskTimeout.class); + assertNotNull(timeout, "TikaTaskTimeout should be resolved directly into ParseContext"); + assertEquals(30000, timeout.getTimeoutMillis()); } @Test @@ -259,9 +262,22 @@ public class TestParseContextSerialization { assertTrue(root.has("my-custom-config")); // Verify round-trip + // After deserialization: + // - pdf-parser, html-parser → Parsers are SelfConfiguring → stay in ConfigContainer + // - my-custom-config → not in registry → stays in ConfigContainer + // - tika-task-timeout → TikaTaskTimeout is NOT SelfConfiguring → resolved directly ParseContext deserialized = mapper.readValue(json, ParseContext.class); ConfigContainer deserializedConfig = deserialized.get(ConfigContainer.class); - assertEquals(4, deserializedConfig.getKeys().size()); + assertEquals(3, deserializedConfig.getKeys().size(), + "Should have 3 configs in ConfigContainer (SelfConfiguring + unknown)"); + assertTrue(deserializedConfig.get("pdf-parser").isPresent()); + assertTrue(deserializedConfig.get("html-parser").isPresent()); + assertTrue(deserializedConfig.get("my-custom-config").isPresent()); + + // TikaTaskTimeout should be resolved directly into ParseContext + TikaTaskTimeout timeout = deserialized.get(TikaTaskTimeout.class); + assertNotNull(timeout, "TikaTaskTimeout should be resolved directly"); + assertEquals(5000, timeout.getTimeoutMillis()); } @Test @@ -279,4 +295,29 @@ public class TestParseContextSerialization { JsonNode root = mapper.readTree(json); assertEquals(0, root.size(), "Objects without friendly names should not be serialized"); } + + @Test + public void testContextKeyDeserialization() throws Exception { + // Test that components with @TikaComponent(contextKey=...) are stored + // in ParseContext with the contextKey, not the component class. + // SkipEmbeddedDocumentSelector has contextKey=DocumentSelector.class + String json = """ + { + "skip-embedded-document-selector": {} + } + """; + + ObjectMapper mapper = createMapper(); + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + + // Should be accessible via DocumentSelector.class (the contextKey) + DocumentSelector selector = deserialized.get(DocumentSelector.class); + assertNotNull(selector, "DocumentSelector should be found via contextKey"); + assertTrue(selector instanceof SkipEmbeddedDocumentSelector, + "Should be SkipEmbeddedDocumentSelector instance"); + + // The selector should skip all embedded documents (return false) + assertFalse(selector.select(new org.apache.tika.metadata.Metadata()), + "SkipEmbeddedDocumentSelector should return false for all documents"); + } } diff --git a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json index cb6264c919..5305f2a43a 100644 --- a/tika-serialization/src/test/resources/configs/test-config-loader.json +++ b/tika-serialization/src/test/resources/configs/test-config-loader.json @@ -12,8 +12,7 @@ "simple-handler": "org.apache.tika.config.loader.ConfigLoaderTest$SimpleHandlerImpl", - "configured-handler": { - "@class": "org.apache.tika.config.loader.ConfigLoaderTest$ConfiguredHandlerImpl", + "configured-handler-impl": { "maxSize": 100000, "prefix": "test-" }, diff --git a/tika-serialization/src/test/resources/configs/test-translator-config.json b/tika-serialization/src/test/resources/configs/test-translator-config.json index 4e4b88fcc8..73ad08c224 100644 --- a/tika-serialization/src/test/resources/configs/test-translator-config.json +++ b/tika-serialization/src/test/resources/configs/test-translator-config.json @@ -1,5 +1,5 @@ { "translator": { - "class": "empty-translator" + "empty-translator": {} } }
