This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3caaace346 TIKA-4545-mixins (#2444)
3caaace346 is described below
commit 3caaace346cdd018efe4a971207bb058e3690a29
Author: Tim Allison <[email protected]>
AuthorDate: Fri Dec 12 10:43:36 2025 -0500
TIKA-4545-mixins (#2444)
---
.../org/apache/tika/config/ConfigDeserializer.java | 23 ++-
.../main/java/org/apache/tika/detect/Detector.java | 3 +-
.../org/apache/tika/detect/EncodingDetector.java | 3 +-
.../ParsingEmbeddedDocumentExtractorFactory.java | 2 +
.../tika/extractor/RUnpackExtractorFactory.java | 27 ++-
.../apache/tika/language/translate/Translator.java | 3 +-
.../writefilter/StandardWriteFilterFactory.java | 17 +-
.../main/java/org/apache/tika/parser/Parser.java | 3 +-
.../java/org/apache/tika/renderer/Renderer.java | 3 +-
.../parser/digestutils/CommonsDigesterFactory.java | 2 +
.../tika/parser/ocr/TesseractOCRParserTest.java | 4 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +-
.../src/test/resources/configs/tika-4533.json | 7 +-
.../configs/tika-config-digests-pdf-only.json | 7 +-
.../tika-config-digests-skip-container.json | 9 +-
.../resources/configs/tika-config-digests.json | 7 +-
.../test/resources/configs/tika-unrar-config.json | 2 +-
.../tika/config/TIKA-1702-translator-default.json | 2 +-
.../config/TIKA-1702-translator-empty-default.json | 2 +-
.../tika/config/TIKA-1702-translator-empty.json | 2 +-
.../org/apache/tika/async/cli/PluginsWriter.java | 4 +-
.../tika/pipes/core/AbstractComponentManager.java | 4 +-
.../pipes/core/emitter/EmitterManagerTest.java | 2 +-
.../pipes/core/fetcher/FetcherManagerTest.java | 2 +-
.../tika/config/loader/ComponentInstantiator.java | 73 ++++++++
.../config/loader/CompositeComponentLoader.java | 82 +--------
.../apache/tika/config/loader/ConfigLoader.java | 18 +-
.../apache/tika/config/loader/DetectorLoader.java | 128 +++++--------
.../tika/config/loader/EncodingDetectorLoader.java | 114 +++++-------
.../apache/tika/config/loader/FrameworkConfig.java | 13 +-
.../apache/tika/config/loader/ParserLoader.java | 115 +++---------
.../loader/PolymorphicObjectMapperFactory.java | 155 ----------------
.../apache/tika/config/loader/TikaJsonConfig.java | 2 +-
.../org/apache/tika/config/loader/TikaLoader.java | 7 +-
.../config/loader/TikaObjectMapperFactory.java | 112 ++++++++++++
.../tika/config/loader/TranslatorLoader.java | 57 +++---
.../tika/serialization/ComponentNameResolver.java | 88 +++++++++
.../tika/serialization/ConfigDeserializer.java | 4 +-
.../serialization/ParseContextDeserializer.java | 105 +++++++++--
.../tika/serialization/ParseContextSerializer.java | 32 ++--
.../tika/serialization/ParseContextUtils.java | 20 +-
.../tika/serialization/TikaAbstractTypeMixins.java | 201 +++++++++++++++++++++
.../tika/config/loader/ConfigLoaderTest.java | 22 +--
.../CustomClassSerializationTest.java | 4 +-
.../TestParseContextSerialization.java | 57 +++++-
.../test/resources/configs/test-config-loader.json | 3 +-
.../resources/configs/test-translator-config.json | 2 +-
47 files changed, 925 insertions(+), 633 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
index 18ab2ff12a..dd5d12d778 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ConfigDeserializer.java
@@ -22,6 +22,10 @@ import java.lang.reflect.Method;
* Utility for deserializing JSON configuration without compile-time dependency on Jackson.
* <p>
* This class uses reflection to call Jackson's ObjectMapper when available on the classpath.
+ * If tika-serialization is available, it uses the configured ObjectMapper from
+ * {@code TikaObjectMapperFactory} to ensure consistent behavior with ParseContext
+ * serialization. Otherwise, it falls back to a plain ObjectMapper.
+ * <p>
* If Jackson is not available and JSON deserialization is attempted, it throws a clear error message.
* <p>
* Usage pattern in parsers, detectors, and other Tika components:
@@ -62,12 +66,19 @@ public class ConfigDeserializer {
Method method = null;
try {
clazz =
Class.forName("com.fasterxml.jackson.databind.ObjectMapper");
- // Use a plain ObjectMapper for simple config deserialization.
- // The polymorphic mapper from tika-serialization is meant for
ParseContext
- // serialization with actual polymorphic types, not for simple
config classes.
- //TODO -- we need to revisit this. We should be using the same
object mapper for
- //config files and for runtime configs
- instance = clazz.getDeclaredConstructor().newInstance();
+
+ // Try to use TikaObjectMapperFactory from tika-serialization if available.
+ // This ensures we use the same configured ObjectMapper as ParseContext serialization.
+ try {
+ Class<?> factoryClass = Class.forName(
+
"org.apache.tika.config.loader.TikaObjectMapperFactory");
+ Method getMapperMethod = factoryClass.getMethod("getMapper");
+ instance = getMapperMethod.invoke(null);
+ } catch (Exception e) {
+ // tika-serialization not on classpath, fall back to plain
ObjectMapper
+ instance = clazz.getDeclaredConstructor().newInstance();
+ }
+
method = clazz.getMethod("readValue", String.class, Class.class);
} catch (Exception e) {
// Jackson not on classpath - will fail at runtime if JSON
deserialization is attempted
diff --git a/tika-core/src/main/java/org/apache/tika/detect/Detector.java
b/tika-core/src/main/java/org/apache/tika/detect/Detector.java
index fc237aa5aa..3d513f042a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/Detector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/Detector.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
+import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -30,7 +31,7 @@ import org.apache.tika.mime.MediaType;
*
* @since Apache Tika 0.3
*/
-public interface Detector extends Serializable {
+public interface Detector extends Serializable, SelfConfiguring {
/**
* Detects the content type of the given input document. Returns
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
index 9dbad4c277..25f7bfa9d5 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
@@ -21,6 +21,7 @@ import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
+import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.metadata.Metadata;
/**
@@ -30,7 +31,7 @@ import org.apache.tika.metadata.Metadata;
*
* @since Apache Tika 0.4
*/
-public interface EncodingDetector extends Serializable {
+public interface EncodingDetector extends Serializable, SelfConfiguring {
/**
* Detects the character encoding of the given text document, or
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 9136228c4a..f1dfa071fe 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -17,9 +17,11 @@
package org.apache.tika.extractor;
import org.apache.tika.config.Field;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+@TikaComponent
public class ParsingEmbeddedDocumentExtractorFactory
implements EmbeddedDocumentExtractorFactory {
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
index 5813ed3abb..ef46771ec8 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
@@ -17,6 +17,7 @@
package org.apache.tika.extractor;
import java.util.HashSet;
+import java.util.Set;
import org.apache.tika.config.Field;
import org.apache.tika.config.TikaComponent;
@@ -30,11 +31,10 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l
* 1024l * 1024l;
private boolean writeFileNameToContent = true;
- //concrete HashSet class for the sake of Jackson
- private HashSet<String> embeddedBytesIncludeMimeTypes = new HashSet<>();
- private HashSet<String> embeddedBytesExcludeMimeTypes = new HashSet<>();
- private HashSet<String> embeddedBytesIncludeEmbeddedResourceTypes = new
HashSet<>();
- private HashSet<String> embeddedBytesExcludeEmbeddedResourceTypes = new
HashSet<>();
+ private Set<String> embeddedBytesIncludeMimeTypes = new HashSet<>();
+ private Set<String> embeddedBytesExcludeMimeTypes = new HashSet<>();
+ private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = new
HashSet<>();
+ private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = new
HashSet<>();
private long maxEmbeddedBytesForExtraction =
DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION;
@Field
@@ -43,23 +43,22 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
}
@Field
- public void setEmbeddedBytesIncludeMimeTypes(HashSet<String>
includeMimeTypes) {
+ public void setEmbeddedBytesIncludeMimeTypes(Set<String> includeMimeTypes)
{
embeddedBytesIncludeMimeTypes = new HashSet<>(includeMimeTypes);
}
@Field
- public void setEmbeddedBytesExcludeMimeTypes(HashSet<String>
excludeMimeTypes) {
+ public void setEmbeddedBytesExcludeMimeTypes(Set<String> excludeMimeTypes)
{
embeddedBytesExcludeMimeTypes = new HashSet<>(excludeMimeTypes);
-
}
@Field
- public void setEmbeddedBytesIncludeEmbeddedResourceTypes(HashSet<String>
includeAttachmentTypes) {
+ public void setEmbeddedBytesIncludeEmbeddedResourceTypes(Set<String>
includeAttachmentTypes) {
embeddedBytesIncludeEmbeddedResourceTypes = new
HashSet<>(includeAttachmentTypes);
}
@Field
- public void setEmbeddedBytesExcludeEmbeddedResourceTypes(HashSet<String>
excludeAttachmentTypes) {
+ public void setEmbeddedBytesExcludeEmbeddedResourceTypes(Set<String>
excludeAttachmentTypes) {
embeddedBytesExcludeEmbeddedResourceTypes = new
HashSet<>(excludeAttachmentTypes);
}
@@ -84,19 +83,19 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
return writeFileNameToContent;
}
- public HashSet<String> getEmbeddedBytesIncludeMimeTypes() {
+ public Set<String> getEmbeddedBytesIncludeMimeTypes() {
return embeddedBytesIncludeMimeTypes;
}
- public HashSet<String> getEmbeddedBytesExcludeMimeTypes() {
+ public Set<String> getEmbeddedBytesExcludeMimeTypes() {
return embeddedBytesExcludeMimeTypes;
}
- public HashSet<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() {
+ public Set<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() {
return embeddedBytesIncludeEmbeddedResourceTypes;
}
- public HashSet<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() {
+ public Set<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() {
return embeddedBytesExcludeEmbeddedResourceTypes;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
index 563e6c4fc5..a349d9a16d 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
@@ -18,6 +18,7 @@ package org.apache.tika.language.translate;
import java.io.IOException;
+import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.exception.TikaException;
/**
@@ -25,7 +26,7 @@ import org.apache.tika.exception.TikaException;
*
* @since Tika 1.6
*/
-public interface Translator {
+public interface Translator extends SelfConfiguring {
/**
* Translate text between given languages.
*
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b5d8a0288b..877dc47a0e 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -17,11 +17,15 @@
package org.apache.tika.metadata.writefilter;
import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.config.TikaComponent;
/**
* Factory class for {@link StandardWriteFilter}. See that class
* for how the estimated sizes are calculated on Strings.
*/
+@TikaComponent
public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
@@ -30,9 +34,8 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
- //concrete classes here and in the setters/getters for the sake of Jackson
- private HashSet<String> includeFields = new HashSet<>();
- private HashSet<String> excludeFields = new HashSet<>();
+ private Set<String> includeFields = new HashSet<>();
+ private Set<String> excludeFields = new HashSet<>();
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
@@ -58,11 +61,11 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
excludeFields, includeEmpty);
}
- public void setIncludeFields(HashSet<String> includeFields) {
+ public void setIncludeFields(Set<String> includeFields) {
this.includeFields = new HashSet<>(includeFields);
}
- public void setExcludeFields(HashSet<String> excludeFields) {
+ public void setExcludeFields(Set<String> excludeFields) {
this.excludeFields = new HashSet<>(excludeFields);
}
@@ -86,11 +89,11 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
this.maxValuesPerField = maxValuesPerField;
}
- public HashSet<String> getIncludeFields() {
+ public Set<String> getIncludeFields() {
return includeFields;
}
- public HashSet<String> getExcludeFields() {
+ public Set<String> getExcludeFields() {
return excludeFields;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java
b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
index 44882883a4..ef5299ba56 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
@@ -24,6 +24,7 @@ import java.util.Set;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -31,7 +32,7 @@ import org.apache.tika.mime.MediaType;
/**
* Tika parser interface.
*/
-public interface Parser extends Serializable {
+public interface Parser extends Serializable, SelfConfiguring {
/**
* Returns the set of media types supported by this parser when used
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
index bc4261f521..ff0ad40b63 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
@@ -21,6 +21,7 @@ import java.io.InputStream;
import java.io.Serializable;
import java.util.Set;
+import org.apache.tika.config.SelfConfiguring;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -31,7 +32,7 @@ import org.apache.tika.parser.ParseContext;
* but also on portions of PDF pages as well as on other document types.
*
*/
-public interface Renderer extends Serializable {
+public interface Renderer extends Serializable, SelfConfiguring {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index d37f7acb10..56bad7352e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -17,12 +17,14 @@
package org.apache.tika.parser.digestutils;
import org.apache.tika.config.Field;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.parser.DigestingParser;
/**
* Simple factory for {@link CommonsDigester} with
* default markLimit = 1000000 and md5 digester.
*/
+@TikaComponent
public class CommonsDigesterFactory implements DigestingParser.DigesterFactory
{
private int markLimit = 1000000;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index d660a61728..4f14f0cdef 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -35,8 +35,8 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.ParseContextConfig;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
@@ -280,7 +280,7 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testUpdatingConfigs() throws Exception {
- ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper();
+ ObjectMapper mapper = TikaObjectMapperFactory.getMapper();
// Create default config (simulating parser initialization)
TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index d89beadcfd..7bfd3a44a2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -46,7 +46,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -589,7 +589,7 @@ public class PDFParserTest extends TikaTest {
parseContext.set(PDFParserConfig.class, config);
// Serialize using ParseContextSerializer
- com.fasterxml.jackson.databind.ObjectMapper mapper =
PolymorphicObjectMapperFactory.getMapper();
+ com.fasterxml.jackson.databind.ObjectMapper mapper =
TikaObjectMapperFactory.getMapper();
com.fasterxml.jackson.databind.module.SimpleModule module = new
com.fasterxml.jackson.databind.module.SimpleModule();
module.addSerializer(ParseContext.class, new
org.apache.tika.serialization.ParseContextSerializer());
module.addDeserializer(ParseContext.class, new
org.apache.tika.serialization.ParseContextDeserializer());
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
index bab3af07e7..96729b9b30 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
@@ -5,9 +5,10 @@
"maximumPackageEntryDepth": 100,
"throwOnZeroBytes": false,
"digesterFactory": {
- "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory",
- "markLimit": 100000,
- "algorithmString": "sha256"
+ "commons-digester-factory": {
+ "markLimit": 100000,
+ "algorithmString": "sha256"
+ }
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index 34e5248c7c..8472ae4843 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -12,9 +12,10 @@
"spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
- "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory",
- "markLimit": 100000,
- "algorithmString": "sha256:32,md5"
+ "commons-digester-factory": {
+ "markLimit": 100000,
+ "algorithmString": "sha256:32,md5"
+ }
},
"throwOnZeroBytes": false
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
index 5fa5e78975..6332107097 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
@@ -3,10 +3,11 @@
"spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
- "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory",
- "markLimit": 100000,
- "algorithmString": "sha256:32,md5",
- "skipContainerDocument": true
+ "commons-digester-factory": {
+ "markLimit": 100000,
+ "algorithmString": "sha256:32,md5",
+ "skipContainerDocument": true
+ }
},
"throwOnZeroBytes": false
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
index bf12e17d7a..559542f8e1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
@@ -3,9 +3,10 @@
"spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
- "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory",
- "markLimit": 100000,
- "algorithmString": "sha256:32,md5"
+ "commons-digester-factory": {
+ "markLimit": 100000,
+ "algorithmString": "sha256:32,md5"
+ }
},
"throwOnZeroBytes": false
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
index 5511b90b7a..de3fd5b32b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": ["rar-parser"]
+ "_exclude": ["rar-parser"]
}
},
{
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
index 69f20d6784..aa268b6c4c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
@@ -1,5 +1,5 @@
{
"translator": {
- "class": "default-translator"
+ "default-translator": {}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
index 4e4b88fcc8..73ad08c224 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
@@ -1,5 +1,5 @@
{
"translator": {
- "class": "empty-translator"
+ "empty-translator": {}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
index 4e4b88fcc8..73ad08c224 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
@@ -1,5 +1,5 @@
{
"translator": {
- "class": "empty-translator"
+ "empty-translator": {}
}
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index 3093b59b69..c6e7a30af8 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -27,7 +27,7 @@ import java.util.List;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.pipes.core.PipesConfig;
import org.apache.tika.utils.StringUtils;
@@ -71,7 +71,7 @@ public class PluginsWriter {
if (simpleAsyncConfig.getTimeoutMs() != null) {
pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs());
}
- ObjectMapper objectMapper =
PolymorphicObjectMapperFactory.getMapper();
+ ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper();
ObjectNode root = (ObjectNode)
objectMapper.readTree(json.getBytes(StandardCharsets.UTF_8));
root.set("pipes", objectMapper.valueToTree(pipesConfig));
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java
index 603248ea01..02c77d4d28 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/AbstractComponentManager.java
@@ -30,7 +30,7 @@ import org.pf4j.PluginManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.plugins.ExtensionConfig;
@@ -177,7 +177,7 @@ public abstract class AbstractComponentManager<T extends
TikaExtension,
private static String toJsonString(final JsonNode node) throws
TikaConfigException {
try {
- return
PolymorphicObjectMapperFactory.getMapper().writeValueAsString(node);
+ return
TikaObjectMapperFactory.getMapper().writeValueAsString(node);
} catch (JsonProcessingException e) {
throw new TikaConfigException("Failed to serialize config to JSON
string", e);
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java
index b2f7a9c62f..e2990abd96 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/emitter/EmitterManagerTest.java
@@ -279,7 +279,7 @@ public class EmitterManagerTest {
Path configPath = tmpDir.resolve("config.json");
Files.writeString(configPath, configJson, StandardCharsets.UTF_8);
- // PolymorphicObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY
enabled
+ // TikaObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled
// so duplicate keys are caught during JSON parsing
TikaConfigException exception =
assertThrows(TikaConfigException.class, () -> {
TikaJsonConfig.load(configPath);
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java
index 192646f0d8..f41fb4a199 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/fetcher/FetcherManagerTest.java
@@ -275,7 +275,7 @@ public class FetcherManagerTest {
Path configPath = tmpDir.resolve("config.json");
Files.writeString(configPath, configJson, StandardCharsets.UTF_8);
- // PolymorphicObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY
enabled
+ // TikaObjectMapperFactory has FAIL_ON_READING_DUP_TREE_KEY enabled
// so duplicate keys are caught during JSON parsing
TikaConfigException exception =
assertThrows(TikaConfigException.class, () -> {
TikaJsonConfig.load(configPath);
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
index 2f9a66e4cc..93d2a334fd 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstantiator.java
@@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.serialization.ComponentNameResolver;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
@@ -85,6 +86,78 @@ public class ComponentInstantiator {
}
}
+ /**
+ * Instantiates a component from a JsonNode configuration.
+ * <p>
+ * Instantiation strategy:
+ * <ol>
+ * <li>Try constructor with JsonConfig parameter</li>
+ * <li>Fall back to Jackson bean deserialization if config is
provided</li>
+ * <li>Fall back to zero-arg constructor if no config</li>
+ * </ol>
+ *
+ * @param componentClass the component class to instantiate
+ * @param configNode the JSON configuration node (may be null or empty)
+ * @param objectMapper the Jackson ObjectMapper for deserialization
+ * @param <T> the component type
+ * @return the instantiated component
+ * @throws TikaConfigException if instantiation fails
+ */
+ @SuppressWarnings("unchecked")
+ public static <T> T instantiate(Class<?> componentClass,
+ JsonNode configNode,
+ ObjectMapper objectMapper)
+ throws TikaConfigException {
+ try {
+ // Try JsonConfig constructor first
+ try {
+ Constructor<?> constructor =
componentClass.getConstructor(JsonConfig.class);
+ String jsonString = configNode != null ? configNode.toString()
: "{}";
+ JsonConfig jsonConfig = () -> jsonString;
+ return (T) constructor.newInstance(jsonConfig);
+ } catch (NoSuchMethodException e) {
+ // No JsonConfig constructor, fall back to other methods
+ }
+
+ // Fall back to Jackson bean deserialization or zero-arg
constructor
+ if (configNode == null || configNode.isEmpty()) {
+ return (T)
componentClass.getDeclaredConstructor().newInstance();
+ }
+
+ return (T) objectMapper.treeToValue(configNode, componentClass);
+
+ } catch (Exception e) {
+ throw new TikaConfigException(
+ "Failed to instantiate component '" +
componentClass.getName() + "': " + e.getMessage(), e);
+ }
+ }
+
+ /**
+ * Instantiates a component by resolving a friendly name or FQCN to a
class.
+ * <p>
+ * This is a convenience method that combines name resolution with
instantiation.
+ *
+ * @param typeName the component type name (friendly name like
"pdf-parser" or FQCN)
+ * @param configNode the JSON configuration node (may be null or empty)
+ * @param objectMapper the Jackson ObjectMapper for deserialization
+ * @param classLoader the class loader for name resolution
+ * @param <T> the component type
+ * @return the instantiated component
+ * @throws TikaConfigException if instantiation fails or type name is
unknown
+ */
+ public static <T> T instantiate(String typeName,
+ JsonNode configNode,
+ ObjectMapper objectMapper,
+ ClassLoader classLoader)
+ throws TikaConfigException {
+ try {
+ Class<?> componentClass =
ComponentNameResolver.resolveClass(typeName, classLoader);
+ return instantiate(componentClass, configNode, objectMapper);
+ } catch (ClassNotFoundException e) {
+ throw new TikaConfigException("Unknown component type: '" +
typeName + "'", e);
+ }
+ }
+
/**
* Checks if the JsonConfig contains actual configuration (non-empty JSON
object with fields).
*
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
index da5d5f59e2..6b65658abb 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
@@ -28,7 +28,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.JsonConfig;
import org.apache.tika.exception.TikaConfigException;
/**
@@ -47,7 +46,6 @@ public class CompositeComponentLoader<T> {
private final Class<T> componentInterface;
private final String componentTypeName;
- private final String indexFileName;
private final ClassLoader classLoader;
private final ObjectMapper objectMapper;
@@ -56,16 +54,13 @@ public class CompositeComponentLoader<T> {
*
* @param componentInterface the component interface (e.g., Detector.class)
* @param componentTypeName the JSON config key (e.g., "detectors")
- * @param indexFileName the index file name (e.g., "detectors")
* @param classLoader the class loader
* @param objectMapper the Jackson ObjectMapper
*/
public CompositeComponentLoader(Class<T> componentInterface, String
componentTypeName,
- String indexFileName, ClassLoader
classLoader,
- ObjectMapper objectMapper) {
+ ClassLoader classLoader, ObjectMapper
objectMapper) {
this.componentInterface = componentInterface;
this.componentTypeName = componentTypeName;
- this.indexFileName = indexFileName;
this.classLoader = classLoader;
this.objectMapper = objectMapper;
}
@@ -103,14 +98,13 @@ public class CompositeComponentLoader<T> {
return Collections.emptyList();
}
- ComponentRegistry registry = new ComponentRegistry(indexFileName,
classLoader);
List<T> instances = new ArrayList<>();
for (Map.Entry<String, JsonNode> entry : arrayComponents) {
String name = entry.getKey();
JsonNode configNode = entry.getValue();
- T instance = loadComponent(name, configNode, registry);
+ T instance = deserializeComponent(name, configNode);
instances.add(instance);
}
@@ -129,87 +123,29 @@ public class CompositeComponentLoader<T> {
// Load configured components
if (config.hasComponents(componentTypeName)) {
- ComponentRegistry registry = new ComponentRegistry(indexFileName,
classLoader);
Map<String, JsonNode> components =
config.getComponents(componentTypeName);
for (Map.Entry<String, JsonNode> entry : components.entrySet()) {
String name = entry.getKey();
JsonNode configNode = entry.getValue();
- T instance = loadConfiguredComponent(name, configNode,
registry);
+ T instance = deserializeComponent(name, configNode);
instances.add(instance);
}
}
// Add SPI-discovered components
- List<T> spiComponents = loadSpiComponents();
+ List<T> spiComponents = loadAllFromSpi();
instances.addAll(spiComponents);
return instances;
}
- private T loadConfiguredComponent(String name, JsonNode configNode,
- ComponentRegistry registry)
- throws TikaConfigException {
- try {
- // Get component class
- Class<?> componentClass = registry.getComponentClass(name);
-
- // Extract framework config
- FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configNode, objectMapper);
-
- // Instantiate component
- T instance = instantiateComponent(componentClass,
frameworkConfig.getComponentConfigJson());
-
- return instance;
-
- } catch (Exception e) {
- throw new TikaConfigException("Failed to load component '" + name
+ "' of type " +
- componentTypeName, e);
- }
- }
-
- private T instantiateComponent(Class<?> componentClass, JsonConfig
configJson)
- throws TikaConfigException {
- return ComponentInstantiator.instantiate(componentClass, configJson,
classLoader,
- componentTypeName, objectMapper);
- }
-
- private List<T> loadSpiComponents() {
- List<T> result = new ArrayList<>();
- ServiceLoader<T> serviceLoader =
ServiceLoader.load(componentInterface, classLoader);
-
- Iterator<T> iterator = serviceLoader.iterator();
- while (iterator.hasNext()) {
- try {
- T instance = iterator.next();
- result.add(instance);
- } catch (Exception e) {
- // Log and skip problematic SPI providers
- LOG.warn("Failed to load SPI component of type {}: {}",
componentTypeName, e.getMessage(), e);
- }
- }
-
- return result;
- }
-
- private T loadComponent(String name, JsonNode configNode,
ComponentRegistry registry)
- throws TikaConfigException {
- try {
- // Get component class
- Class<?> componentClass = registry.getComponentClass(name);
-
- // Wrap JSON string in JsonConfig
- String jsonString = objectMapper.writeValueAsString(configNode);
- JsonConfig jsonConfig = () -> jsonString;
-
- // Instantiate component
- return instantiateComponent(componentClass, jsonConfig);
-
- } catch (Exception e) {
- throw new TikaConfigException("Failed to load component '" + name
+ "' of type " +
- componentTypeName, e);
- }
+ /**
+ * Deserializes a component, trying JsonConfig constructor first, then
Jackson bean deserialization.
+ */
+ private T deserializeComponent(String name, JsonNode configNode) throws
TikaConfigException {
+ return ComponentInstantiator.instantiate(name, configNode,
objectMapper, classLoader);
}
private List<T> loadAllFromSpi() {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
index bb9e2b6d30..b57aae89ee 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java
@@ -125,12 +125,14 @@ public class ConfigLoader {
/**
* Loads a configuration object from the specified JSON key.
* <p>
- * Supports three formats for interfaces:
+ * Supports two formats:
* <ul>
- * <li>String value: treated as class name or component name to look
up</li>
- * <li>Object with "@class": explicit type specification</li>
- * <li>Object without "@class": attempts direct deserialization (works
for concrete classes)</li>
+ * <li>String value: treated as fully qualified class name to
instantiate</li>
+ * <li>Object: deserialized directly into the target class</li>
* </ul>
+ * <p>
+ * For tier-1 polymorphic types (Parser, Detector, MetadataFilter), use
the wrapper
+ * object format with friendly names: {@code {"pdf-parser": {...}}}
*
* @param key The JSON key to load from
* @param clazz The class to deserialize into (can be interface, abstract,
or concrete)
@@ -148,14 +150,14 @@ public class ConfigLoader {
}
try {
- // Strategy 1: String value - treat as class name
+ // Strategy 1: String value - treat as class name (for interfaces)
if (node.isTextual()) {
return loadFromClassName(node.asText(), clazz);
}
- // Strategy 2: Let Jackson handle everything else
- // Jackson's activateDefaultTyping will automatically handle
@class fields
- // for interfaces/abstract classes via the
PolymorphicObjectMapperFactory configuration
+ // Strategy 2: Direct deserialization
+ // For tier-1 types (Parser, Detector, MetadataFilter), mixins
handle polymorphism
+ // For concrete classes, Jackson deserializes directly
return objectMapper.treeToValue(node, clazz);
} catch (JsonProcessingException e) {
throw new TikaConfigException(
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
index 321cf878e5..35820c47b1 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
@@ -29,7 +29,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.JsonConfig;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -72,7 +71,6 @@ public class DetectorLoader {
// Load configured detectors
if (config.hasComponentSection("detectors")) {
List<Detector> detectorList = new ArrayList<>();
- ComponentRegistry registry = new ComponentRegistry("detectors",
classLoader);
List<Map.Entry<String, JsonNode>> detectors =
config.getArrayComponents("detectors");
// Check if "default-detector" is in the list and extract
exclusions
@@ -82,41 +80,7 @@ public class DetectorLoader {
for (Map.Entry<String, JsonNode> entry : detectors) {
if ("default-detector".equals(entry.getKey())) {
hasDefaultDetector = true;
-
- // Parse exclusions from default-detector config
- JsonNode configNode = entry.getValue();
- if (configNode != null && configNode.has("_exclude")) {
- JsonNode excludeNode = configNode.get("_exclude");
- if (excludeNode.isArray()) {
- for (JsonNode excludeName : excludeNode) {
- if (excludeName.isTextual()) {
- String detectorName = excludeName.asText();
- try {
- Class<?> detectorClass;
- // Try as component name first
- try {
- detectorClass =
registry.getComponentClass(detectorName);
- } catch (TikaConfigException e) {
- // If not found as component name,
try as FQCN
- try {
- detectorClass =
Class.forName(detectorName, false, classLoader);
- } catch (ClassNotFoundException
ex) {
- LOG.warn("Unknown detector in
default-detector exclude list: {}", detectorName);
- continue;
- }
- }
- @SuppressWarnings("unchecked")
- Class<? extends Detector>
detectorTyped =
- (Class<? extends Detector>)
detectorClass;
-
excludedDetectorClasses.add(detectorTyped);
- LOG.debug("Excluding detector from
SPI: {}", detectorName);
- } catch (Exception e) {
- LOG.warn("Failed to exclude detector
'{}': {}", detectorName, e.getMessage());
- }
- }
- }
- }
- }
+
excludedDetectorClasses.addAll(parseExclusions(entry.getValue()));
break;
}
}
@@ -133,8 +97,16 @@ public class DetectorLoader {
continue;
}
- JsonNode configNode = entry.getValue();
- Detector detector = loadConfiguredDetector(name, configNode,
registry);
+ // Special case: mime-types requires the initialized registry
from TikaLoader
+ if ("mime-types".equals(name)) {
+ LOG.debug("Using TikaLoader.getMimeTypes() for mime-types
detector");
+ detectorList.add(TikaLoader.getMimeTypes());
+
configuredDetectorClasses.add(TikaLoader.getMimeTypes().getClass());
+ continue;
+ }
+
+ // Instantiate via ComponentInstantiator, which also resolves the name
+ Detector detector = deserializeDetector(name,
entry.getValue());
detectorList.add(detector);
@SuppressWarnings("unchecked")
Class<? extends Detector> detectorClass =
@@ -167,51 +139,51 @@ public class DetectorLoader {
}
}
- private Detector loadConfiguredDetector(String name, JsonNode configNode,
- ComponentRegistry registry)
- throws TikaConfigException {
- try {
- // Special case: mime-types requires the initialized registry from
TikaLoader
- // The no-arg constructor creates an empty MimeTypes without the
XML-loaded types
- if ("mime-types".equals(name)) {
- LOG.debug("Using TikaLoader.getMimeTypes() for mime-types
detector");
- return TikaLoader.getMimeTypes();
- }
-
- // Get detector class - try component name first, then FQCN
fallback
- Class<?> detectorClass;
- try {
- detectorClass = registry.getComponentClass(name);
- } catch (TikaConfigException e) {
- // If not found as component name, try as fully qualified
class name
- try {
- detectorClass = Class.forName(name, false, classLoader);
- LOG.debug("Loaded detector by FQCN: {}", name);
- } catch (ClassNotFoundException ex) {
- throw new TikaConfigException("Unknown detector: '" + name
+
- "'. Not found as component name or FQCN.", e);
- }
- }
-
- // Extract framework config
- FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configNode, objectMapper);
+ /**
+ * Deserializes a detector, trying JsonConfig constructor first, then
Jackson bean deserialization.
+ */
+ private Detector deserializeDetector(String name, JsonNode configNode)
throws TikaConfigException {
+ return ComponentInstantiator.instantiate(name, configNode,
objectMapper, classLoader);
+ }
- // Instantiate detector
- Detector detector = instantiateDetector(detectorClass,
frameworkConfig.getComponentConfigJson());
+ /**
+ * Parses exclusion list from default-detector config.
+ */
+ @SuppressWarnings("unchecked")
+ private Set<Class<? extends Detector>> parseExclusions(JsonNode
configNode) {
+ Set<Class<? extends Detector>> excluded = new HashSet<>();
+ if (configNode == null || !configNode.has("_exclude")) {
+ return excluded;
+ }
- return detector;
+ JsonNode excludeNode = configNode.get("_exclude");
+ if (!excludeNode.isArray()) {
+ return excluded;
+ }
- } catch (TikaConfigException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaConfigException("Failed to load detector '" + name +
"'", e);
+ for (JsonNode excludeName : excludeNode) {
+ if (!excludeName.isTextual()) {
+ continue;
+ }
+ String detectorName = excludeName.asText();
+ try {
+ // Resolve via ComponentNameResolver (friendly name first, then FQCN)
+ Class<?> detectorClass = resolveClass(detectorName);
+ excluded.add((Class<? extends Detector>) detectorClass);
+ LOG.debug("Excluding detector from SPI: {}", detectorName);
+ } catch (Exception e) {
+ LOG.warn("Unknown detector in exclude list: {}", detectorName);
+ }
}
+ return excluded;
}
- private Detector instantiateDetector(Class<?> detectorClass, JsonConfig
jsonConfig)
- throws TikaConfigException {
- return ComponentInstantiator.instantiate(detectorClass, jsonConfig,
classLoader,
- "Detector", objectMapper);
+ /**
+ * Resolves a name to a class, trying friendly name lookup first then FQCN.
+ */
+ private Class<?> resolveClass(String name) throws ClassNotFoundException {
+ return org.apache.tika.serialization.ComponentNameResolver
+ .resolveClass(name, classLoader);
}
/**
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
index 66fa71adc8..25668d38eb 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
@@ -29,7 +29,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.JsonConfig;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
@@ -72,7 +71,6 @@ public class EncodingDetectorLoader {
// Load configured encoding detectors
if (config.hasComponentSection("encoding-detectors")) {
List<EncodingDetector> detectorList = new ArrayList<>();
- ComponentRegistry registry = new
ComponentRegistry("encoding-detectors", classLoader);
List<Map.Entry<String, JsonNode>> detectors =
config.getArrayComponents("encoding-detectors");
// Check if "default-encoding-detector" is in the list and extract
exclusions
@@ -82,41 +80,7 @@ public class EncodingDetectorLoader {
for (Map.Entry<String, JsonNode> entry : detectors) {
if ("default-encoding-detector".equals(entry.getKey())) {
hasDefaultEncodingDetector = true;
-
- // Parse exclusions from default-encoding-detector config
- JsonNode configNode = entry.getValue();
- if (configNode != null && configNode.has("_exclude")) {
- JsonNode excludeNode = configNode.get("_exclude");
- if (excludeNode.isArray()) {
- for (JsonNode excludeName : excludeNode) {
- if (excludeName.isTextual()) {
- String detectorName = excludeName.asText();
- try {
- Class<?> detectorClass;
- // Try as component name first
- try {
- detectorClass =
registry.getComponentClass(detectorName);
- } catch (TikaConfigException e) {
- // If not found as component name,
try as FQCN
- try {
- detectorClass =
Class.forName(detectorName, false, classLoader);
- } catch (ClassNotFoundException
ex) {
- LOG.warn("Unknown encoding
detector in default-encoding-detector exclude list: {}", detectorName);
- continue;
- }
- }
- @SuppressWarnings("unchecked")
- Class<? extends EncodingDetector>
detectorTyped =
- (Class<? extends
EncodingDetector>) detectorClass;
-
excludedDetectorClasses.add(detectorTyped);
- LOG.debug("Excluding encoding detector
from SPI: {}", detectorName);
- } catch (Exception e) {
- LOG.warn("Failed to exclude encoding
detector '{}': {}", detectorName, e.getMessage());
- }
- }
- }
- }
- }
+
excludedDetectorClasses.addAll(parseExclusions(entry.getValue()));
break;
}
}
@@ -133,8 +97,8 @@ public class EncodingDetectorLoader {
continue;
}
- JsonNode configNode = entry.getValue();
- EncodingDetector detector =
loadConfiguredEncodingDetector(name, configNode, registry);
+ // Instantiate via ComponentInstantiator, which also resolves the name
+ EncodingDetector detector = deserializeEncodingDetector(name,
entry.getValue());
detectorList.add(detector);
@SuppressWarnings("unchecked")
Class<? extends EncodingDetector> detectorClass =
@@ -146,8 +110,6 @@ public class EncodingDetectorLoader {
configuredDetectorClasses.addAll(excludedDetectorClasses);
// Add SPI-discovered detectors only if
"default-encoding-detector" is in config
- // If "default-encoding-detector" is present, use SPI fallback for
unlisted detectors
- // If "default-encoding-detector" is NOT present, only load
explicitly configured detectors
if (hasDefaultEncodingDetector) {
DefaultEncodingDetector defaultDetector =
createDefaultEncodingDetector(configuredDetectorClasses);
LOG.debug("Loading SPI encoding detectors because
'default-encoding-detector' is in config");
@@ -166,45 +128,51 @@ public class EncodingDetectorLoader {
}
}
- private EncodingDetector loadConfiguredEncodingDetector(String name,
JsonNode configNode,
- ComponentRegistry
registry)
+ /**
+ * Deserializes an encoding detector, trying JsonConfig constructor first,
then Jackson bean deserialization.
+ */
+ private EncodingDetector deserializeEncodingDetector(String name, JsonNode
configNode)
throws TikaConfigException {
- try {
- // Get encoding detector class - try component name first, then
FQCN fallback
- Class<?> detectorClass;
- try {
- detectorClass = registry.getComponentClass(name);
- } catch (TikaConfigException e) {
- // If not found as component name, try as fully qualified
class name
- try {
- detectorClass = Class.forName(name, false, classLoader);
- LOG.debug("Loaded encoding detector by FQCN: {}", name);
- } catch (ClassNotFoundException ex) {
- throw new TikaConfigException("Unknown encoding detector:
'" + name +
- "'. Not found as component name or FQCN.", e);
- }
- }
-
- // Extract framework config
- FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configNode, objectMapper);
+ return ComponentInstantiator.instantiate(name, configNode,
objectMapper, classLoader);
+ }
- // Instantiate encoding detector
- EncodingDetector detector =
instantiateEncodingDetector(detectorClass,
- frameworkConfig.getComponentConfigJson());
+ /**
+ * Parses exclusion list from default-encoding-detector config.
+ */
+ @SuppressWarnings("unchecked")
+ private Set<Class<? extends EncodingDetector>> parseExclusions(JsonNode
configNode) {
+ Set<Class<? extends EncodingDetector>> excluded = new HashSet<>();
+ if (configNode == null || !configNode.has("_exclude")) {
+ return excluded;
+ }
- return detector;
+ JsonNode excludeNode = configNode.get("_exclude");
+ if (!excludeNode.isArray()) {
+ return excluded;
+ }
- } catch (TikaConfigException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaConfigException("Failed to load encoding detector '"
+ name + "'", e);
+ for (JsonNode excludeName : excludeNode) {
+ if (!excludeName.isTextual()) {
+ continue;
+ }
+ String detectorName = excludeName.asText();
+ try {
+ Class<?> detectorClass = resolveClass(detectorName);
+ excluded.add((Class<? extends EncodingDetector>)
detectorClass);
+ LOG.debug("Excluding encoding detector from SPI: {}",
detectorName);
+ } catch (Exception e) {
+ LOG.warn("Unknown encoding detector in exclude list: {}",
detectorName);
+ }
}
+ return excluded;
}
- private EncodingDetector instantiateEncodingDetector(Class<?>
detectorClass, JsonConfig jsonConfig)
- throws TikaConfigException {
- return ComponentInstantiator.instantiate(detectorClass, jsonConfig,
classLoader,
- "EncodingDetector", objectMapper);
+ /**
+ * Resolves a name to a class, trying friendly name lookup first then FQCN.
+ */
+ private Class<?> resolveClass(String name) throws ClassNotFoundException {
+ return org.apache.tika.serialization.ComponentNameResolver
+ .resolveClass(name, classLoader);
}
/**
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
index 96a101d34a..34952ee396 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
@@ -44,10 +44,13 @@ public class FrameworkConfig {
private final ParserDecoration decoration;
private final JsonConfig componentConfigJson;
+ private final JsonNode componentConfigNode;
- private FrameworkConfig(ParserDecoration decoration, JsonConfig
componentConfigJson) {
+ private FrameworkConfig(ParserDecoration decoration, JsonConfig
componentConfigJson,
+ JsonNode componentConfigNode) {
this.decoration = decoration;
this.componentConfigJson = componentConfigJson;
+ this.componentConfigNode = componentConfigNode;
}
/**
@@ -63,7 +66,7 @@ public class FrameworkConfig {
if (configNode == null || !configNode.isObject()) {
String jsonString = objectMapper.writeValueAsString(configNode);
JsonConfig jsonConfig = () -> jsonString;
- return new FrameworkConfig(null, jsonConfig);
+ return new FrameworkConfig(null, jsonConfig, configNode);
}
ObjectNode objNode = (ObjectNode) configNode.deepCopy();
@@ -81,7 +84,7 @@ public class FrameworkConfig {
String jsonString = objectMapper.writeValueAsString(objNode);
JsonConfig componentConfigJson = () -> jsonString;
- return new FrameworkConfig(decoration, componentConfigJson);
+ return new FrameworkConfig(decoration, componentConfigJson, objNode);
}
private static List<String> parseStringList(JsonNode node) {
@@ -111,6 +114,10 @@ public class FrameworkConfig {
return componentConfigJson;
}
+ public JsonNode getComponentConfigNode() {
+ return componentConfigNode;
+ }
+
/**
* Parser decoration configuration for mime type filtering.
*/
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index aa19032f93..5d1f60cdc4 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -16,8 +16,6 @@
*/
package org.apache.tika.config.loader;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
@@ -33,7 +31,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.JsonConfig;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.mime.MediaType;
@@ -43,7 +40,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.RenderingParser;
import org.apache.tika.renderer.Renderer;
-import org.apache.tika.utils.ServiceLoaderUtils;
/**
* Loader for parsers with support for decoration (mime type filtering).
@@ -95,7 +91,6 @@ public class ParserLoader {
// Load configured parsers
if (config.hasComponentSection("parsers")) {
- ComponentRegistry registry = new ComponentRegistry("parsers",
classLoader);
List<Map.Entry<String, JsonNode>> parsers =
config.getArrayComponents("parsers");
// Check if "default-parser" is in the list and extract exclusions
and decorations
@@ -125,23 +120,11 @@ public class ParserLoader {
if (excludeName.isTextual()) {
String parserName = excludeName.asText();
try {
- Class<?> parserClass;
- // Try as component name first
- try {
- parserClass =
registry.getComponentClass(parserName);
- } catch (TikaConfigException e) {
- // If not found as component name,
try as FQCN
- try {
- parserClass =
Class.forName(parserName, false, classLoader);
- } catch (ClassNotFoundException
ex) {
- LOG.warn("Unknown parser in
default-parser exclude list: {}", parserName);
- continue;
- }
- }
+ Class<?> parserClass =
resolveClass(parserName);
excludedParserClasses.add(parserClass);
LOG.debug("Excluding parser from SPI:
{}", parserName);
} catch (Exception e) {
- LOG.warn("Failed to exclude parser
'{}': {}", parserName, e.getMessage());
+ LOG.warn("Unknown parser in
default-parser exclude list: {}", parserName);
}
}
}
@@ -173,7 +156,7 @@ public class ParserLoader {
}
JsonNode configNode = entry.getValue();
- ParsedParserConfig parsed = loadConfiguredParser(name,
configNode, registry);
+ ParsedParserConfig parsed = loadConfiguredParser(name,
configNode);
parsedConfigs.put(name, parsed);
}
@@ -229,30 +212,24 @@ public class ParserLoader {
return new CompositeParser(TikaLoader.getMediaTypeRegistry(),
parserList);
}
- private ParsedParserConfig loadConfiguredParser(String name, JsonNode
configNode,
- ComponentRegistry registry)
+ private ParsedParserConfig loadConfiguredParser(String name, JsonNode
configNode)
throws TikaConfigException {
try {
- // Get parser class - try component name first, then FQCN fallback
- Class<?> parserClass;
- try {
- parserClass = registry.getComponentClass(name);
- } catch (TikaConfigException e) {
- // If not found as component name, try as fully qualified
class name
- try {
- parserClass = Class.forName(name, false, classLoader);
- LOG.debug("Loaded parser by FQCN: {}", name);
- } catch (ClassNotFoundException ex) {
- throw new TikaConfigException("Unknown parser: '" + name +
- "'. Not found as component name or FQCN.", e);
- }
- }
-
- // Extract framework config
+ // Extract framework config (decorations like
mimeInclude/mimeExclude)
FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configNode, objectMapper);
- // Instantiate parser
- Parser parser = instantiateParser(parserClass,
frameworkConfig.getComponentConfigJson());
+ // Instantiate via ComponentInstantiator, which also resolves the name
+ Parser parser = deserializeParser(name,
frameworkConfig.getComponentConfigNode());
+
+ // Post-process: inject EncodingDetector for
AbstractEncodingDetectorParser
+ if (parser instanceof AbstractEncodingDetectorParser) {
+ ((AbstractEncodingDetectorParser)
parser).setEncodingDetector(encodingDetector);
+ }
+
+ // Post-process: inject Renderer for RenderingParser
+ if (parser instanceof RenderingParser && renderer != null) {
+ ((RenderingParser) parser).setRenderer(renderer);
+ }
return new ParsedParserConfig(name, parser,
frameworkConfig.getDecoration());
@@ -263,53 +240,19 @@ public class ParserLoader {
}
}
- @SuppressWarnings("unchecked")
- private Parser instantiateParser(Class<?> parserClass, JsonConfig
jsonConfig)
- throws TikaConfigException {
-
- try {
- Parser parser;
-
- // Try constructor with JsonConfig parameter
- try {
- Constructor<?> constructor =
parserClass.getConstructor(JsonConfig.class);
- parser = (Parser) constructor.newInstance(jsonConfig);
- } catch (NoSuchMethodException e) {
- // Check if JSON config has actual configuration
- if (ComponentInstantiator.hasConfiguration(jsonConfig,
objectMapper)) {
- throw new TikaConfigException(
- "Parser '" + parserClass.getName() + "' has
configuration in JSON, " +
- "but does not have a constructor that accepts
JsonConfig. " +
- "Please add a constructor: public " +
parserClass.getSimpleName() + "(JsonConfig jsonConfig)");
- }
-
- // Try constructor with EncodingDetector parameter (for
AbstractEncodingDetectorParser)
- if
(AbstractEncodingDetectorParser.class.isAssignableFrom(parserClass)) {
- try {
- Constructor<?> constructor =
parserClass.getConstructor(EncodingDetector.class);
- parser = (Parser)
constructor.newInstance(encodingDetector);
- } catch (NoSuchMethodException ex) {
- // Fall back to zero-arg constructor
- parser = (Parser)
ServiceLoaderUtils.newInstance(parserClass,
- new
org.apache.tika.config.ServiceLoader(classLoader));
- }
- } else {
- // Fall back to zero-arg constructor
- parser = (Parser)
ServiceLoaderUtils.newInstance(parserClass,
- new
org.apache.tika.config.ServiceLoader(classLoader));
- }
- }
-
- // Inject renderer for RenderingParser instances
- if (parser instanceof RenderingParser && renderer != null) {
- ((RenderingParser) parser).setRenderer(renderer);
- }
+ /**
+ * Deserializes a parser, trying JsonConfig constructor first, then
Jackson bean deserialization.
+ */
+ private Parser deserializeParser(String name, JsonNode configNode) throws
TikaConfigException {
+ return ComponentInstantiator.instantiate(name, configNode,
objectMapper, classLoader);
+ }
- return parser;
- } catch (InstantiationException | IllegalAccessException |
InvocationTargetException e) {
- throw new TikaConfigException("Failed to instantiate parser: " +
- parserClass.getName(), e);
- }
+ /**
+ * Resolves a name to a class, trying friendly name lookup first then FQCN.
+ */
+ private Class<?> resolveClass(String name) throws ClassNotFoundException {
+ return org.apache.tika.serialization.ComponentNameResolver
+ .resolveClass(name, classLoader);
}
private Parser applyMimeFiltering(Parser parser,
FrameworkConfig.ParserDecoration decoration) {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java
deleted file mode 100644
index b920d7cd0a..0000000000
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/PolymorphicObjectMapperFactory.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config.loader;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Enumeration;
-import java.util.List;
-
-import com.fasterxml.jackson.annotation.JsonTypeInfo;
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.SerializationFeature;
-import com.fasterxml.jackson.databind.jsontype.BasicPolymorphicTypeValidator;
-import com.fasterxml.jackson.databind.jsontype.PolymorphicTypeValidator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Factory for creating ObjectMappers with consistent polymorphic type handling
- * across Tika configuration and ParseContext serialization.
- */
-public class PolymorphicObjectMapperFactory {
-
- private static final Logger LOG =
LoggerFactory.getLogger(PolymorphicObjectMapperFactory.class);
-
- /**
- * Classpath resource file where users can specify additional package
prefixes
- * to allow for polymorphic deserialization. One package prefix per line.
- * Comments (lines starting with #) and blank lines are ignored.
- *
- * Example content:
- * <pre>
- * # Allow com.acme classes
- * com.acme
- * # Allow com.example classes
- * com.example
- * </pre>
- */
- public static final String ALLOWED_PACKAGES_RESOURCE =
"META-INF/tika-serialization-allowlist.txt";
-
- private static ObjectMapper MAPPER = null;
-
- public static synchronized ObjectMapper getMapper() {
- if (MAPPER == null) {
- MAPPER = createPolymorphicMapper();
- }
- return MAPPER;
- }
-
- /**
- * Creates an ObjectMapper with polymorphic type handling for Tika
configuration.
- * Configures security validation to allow Tika classes and any additional
- * packages specified via {@link #ALLOWED_PACKAGES_RESOURCE} files on the
classpath.
- *
- * @return configured ObjectMapper
- */
- public static ObjectMapper createPolymorphicMapper() {
- ObjectMapper mapper = new ObjectMapper();
-
- // Fail on unknown properties to catch configuration errors early
- mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,
true);
-
- // Prevent null values being assigned to primitive fields (int,
boolean, etc.)
- mapper.configure(DeserializationFeature.FAIL_ON_NULL_FOR_PRIMITIVES,
true);
-
- // Ensure enums are properly validated (not just numeric values)
- mapper.configure(DeserializationFeature.FAIL_ON_NUMBERS_FOR_ENUMS,
true);
-
- // Catch duplicate keys in JSON objects
- mapper.configure(DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY,
true);
-
- //Need to allow creation of classes without setters/getters -- we may
want to revisit this
- mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);
-
- // Build polymorphic type validator
- BasicPolymorphicTypeValidator.Builder builder =
BasicPolymorphicTypeValidator.builder()
- .allowIfSubType("org.apache.tika.")
- .allowIfSubType("java.util.")
- .allowIfSubType("java.nio.file.");
-
- // Add user-specified packages from classpath
- List<String> additionalPackages = loadAllowedPackages();
- for (String packagePrefix : additionalPackages) {
- builder.allowIfSubType(packagePrefix);
- }
-
- PolymorphicTypeValidator typeValidator = builder.build();
-
- // Use OBJECT_AND_NON_CONCRETE to add type info when static type is:
- // - Object.class (for objects in maps)
- // - Abstract classes or interfaces (for polymorphic fields)
- mapper.activateDefaultTyping(typeValidator,
ObjectMapper.DefaultTyping.OBJECT_AND_NON_CONCRETE, JsonTypeInfo.As.PROPERTY);
-
- return mapper;
- }
-
- /**
- * Loads additional package prefixes from classpath resources.
- * Scans all {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath.
- *
- * @return list of additional package prefixes to allow
- */
- private static List<String> loadAllowedPackages() {
- List<String> packages = new ArrayList<>();
- try {
- Enumeration<URL> resources =
PolymorphicObjectMapperFactory.class.getClassLoader()
- .getResources(ALLOWED_PACKAGES_RESOURCE);
-
- while (resources.hasMoreElements()) {
- URL resource = resources.nextElement();
- LOG.debug("Loading allowed packages from: {}", resource);
-
- try (InputStream is = resource.openStream();
- BufferedReader reader = new BufferedReader(new
InputStreamReader(is, StandardCharsets.UTF_8))) {
-
- String line;
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- // Skip comments and empty lines
- if (line.isEmpty() || line.startsWith("#")) {
- continue;
- }
- packages.add(line);
- LOG.info("Allowing polymorphic deserialization for
package: {}", line);
- }
- } catch (IOException e) {
- LOG.warn("Failed to read allowed packages from: {}",
resource, e);
- }
- }
- } catch (IOException e) {
- LOG.warn("Failed to load allowed packages resources", e);
- }
- return packages;
- }
-}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 06da0f3175..2eeb8bc7a2 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -126,7 +126,7 @@ public class TikaJsonConfig {
);
private static final ObjectMapper OBJECT_MAPPER =
- PolymorphicObjectMapperFactory.getMapper();
+ TikaObjectMapperFactory.getMapper();
private final JsonNode rootNode;
private final Map<String, Map<String, JsonNode>> componentsByType;
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 01aa21e0f6..acf7be7161 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -92,7 +92,7 @@ public class TikaLoader {
private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) {
this.config = config;
this.classLoader = classLoader;
- this.objectMapper = PolymorphicObjectMapperFactory.getMapper();
+ this.objectMapper = TikaObjectMapperFactory.getMapper();
}
/**
@@ -250,8 +250,7 @@ public class TikaLoader {
if (config.hasComponentSection("metadata-filters")) {
// Load explicitly configured filters (no SPI fallback)
CompositeComponentLoader<MetadataFilter> loader = new
CompositeComponentLoader<>(
- MetadataFilter.class, "metadata-filters",
"metadata-filters",
- classLoader, objectMapper);
+ MetadataFilter.class, "metadata-filters", classLoader,
objectMapper);
filterList = loader.loadFromArray(config);
} else {
// No config section - metadata filters are opt-in only, don't
load from SPI
@@ -278,7 +277,7 @@ public class TikaLoader {
public synchronized Renderer loadRenderers() throws TikaConfigException {
if (renderers == null) {
CompositeComponentLoader<Renderer> loader = new
CompositeComponentLoader<>(
- Renderer.class, "renderers", "renderers", classLoader,
objectMapper);
+ Renderer.class, "renderers", classLoader, objectMapper);
List<Renderer> rendererList = loader.loadFromArray(config);
renderers = new CompositeRenderer(rendererList);
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
new file mode 100644
index 0000000000..b45a20afc4
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.serialization.ComponentNameResolver;
+import org.apache.tika.serialization.TikaAbstractTypeMixins;
+
+/**
+ * Factory for creating ObjectMappers configured for Tika serialization.
+ * <p>
+ * Configures strict validation settings and loads component registries
+ * for friendly name resolution.
+ */
+public class TikaObjectMapperFactory {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TikaObjectMapperFactory.class);
+
+ /**
+ * Index file names for component registries.
+ */
+ private static final String[] REGISTRY_INDEX_FILES = {
+ "parsers",
+ "detectors",
+ "encoding-detectors",
+ "metadata-filters",
+ "renderers",
+ "translators",
+ "other-configs"
+ };
+
+ private static ObjectMapper MAPPER = null;
+
+ public static synchronized ObjectMapper getMapper() {
+ if (MAPPER == null) {
+ MAPPER = createMapper();
+ }
+ return MAPPER;
+ }
+
+ /**
+ * Creates an ObjectMapper configured for Tika serialization.
+ *
+ * @return configured ObjectMapper
+ */
+ public static ObjectMapper createMapper() {
+ ObjectMapper mapper = new ObjectMapper();
+
+ // Fail on unknown properties to catch configuration errors early
+ mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,
true);
+
+ // Prevent null values being assigned to primitive fields (int,
boolean, etc.)
+ mapper.configure(DeserializationFeature.FAIL_ON_NULL_FOR_PRIMITIVES,
true);
+
+ // Reject numeric (ordinal) values when deserializing enums
+ mapper.configure(DeserializationFeature.FAIL_ON_NUMBERS_FOR_ENUMS,
true);
+
+ // Catch duplicate keys in JSON objects
+ mapper.configure(DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY,
true);
+
+ // Allow serialization of classes with no discoverable properties -- we may
want to revisit this
+ mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);
+
+ // Load component registries for name resolution
+ loadComponentRegistries();
+
+ // Register deserializers for abstract types using wrapper object
format
+ TikaAbstractTypeMixins.registerDeserializers(mapper);
+
+ return mapper;
+ }
+
+ /**
+ * Loads component registries for name resolution.
+ * Registries are loaded from index files and registered with the
ComponentNameResolver.
+ * Missing registries are logged at debug level and otherwise ignored (may not be on classpath).
+ */
+ private static void loadComponentRegistries() {
+ ClassLoader classLoader =
TikaObjectMapperFactory.class.getClassLoader();
+
+ for (String indexFile : REGISTRY_INDEX_FILES) {
+ try {
+ ComponentRegistry registry = new ComponentRegistry(indexFile,
classLoader);
+ ComponentNameResolver.registerRegistry(indexFile, registry);
+ LOG.debug("Loaded component registry: {}", indexFile);
+ } catch (TikaConfigException e) {
+ // Registry not available - this is expected if the module
isn't on classpath
+ LOG.debug("Component registry not available: {} - {}",
indexFile, e.getMessage());
+ }
+ }
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java
index b84905e2c3..55b7f745a7 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TranslatorLoader.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.config.loader;
+import java.util.Iterator;
+import java.util.Map;
+
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ObjectNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.config.JsonConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
@@ -30,6 +31,17 @@ import org.apache.tika.language.translate.Translator;
/**
* Loader for translators.
* Only one translator is supported at a time.
+ * <p>
+ * JSON format uses wrapper object style:
+ * <pre>
+ * {
+ * "translator": {
+ * "google-translator": {
+ * "apiKey": "..."
+ * }
+ * }
+ * }
+ * </pre>
*/
public class TranslatorLoader {
@@ -66,35 +78,32 @@ public class TranslatorLoader {
private Translator loadConfiguredTranslator(JsonNode translatorNode)
throws TikaConfigException {
- try {
- // The translator node should be an object with a "class" field
- if (!translatorNode.has("class")) {
- throw new TikaConfigException("Translator configuration must
have a 'class' field");
- }
-
- String className = translatorNode.get("class").asText();
- ComponentRegistry registry = new ComponentRegistry("translators",
classLoader);
- Class<?> translatorClass = registry.getComponentClass(className);
+ if (!translatorNode.isObject() || translatorNode.isEmpty()) {
+ throw new TikaConfigException(
+ "Translator configuration must be an object with
translator type as key");
+ }
- // Remove "class" field from config before extraction
- ObjectNode configCopy = ((ObjectNode) translatorNode).deepCopy();
- configCopy.remove("class");
+ // Get the single field name (translator type) and its config
+ Iterator<Map.Entry<String, JsonNode>> fields = translatorNode.fields();
+ Map.Entry<String, JsonNode> entry = fields.next();
- // Extract framework config (e.g., _decorate if present)
- FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configCopy, objectMapper);
+ if (fields.hasNext()) {
+ throw new TikaConfigException(
+ "Translator configuration must have exactly one translator
type");
+ }
- // Instantiate translator
- return instantiateTranslator(translatorClass,
frameworkConfig.getComponentConfigJson());
+ String typeName = entry.getKey();
+ JsonNode configNode = entry.getValue();
- } catch (Exception e) {
- throw new TikaConfigException("Failed to load translator", e);
- }
+ return deserializeTranslator(typeName, configNode);
}
- private Translator instantiateTranslator(Class<?> translatorClass,
JsonConfig jsonConfig)
+ /**
+ * Deserializes a translator, trying JsonConfig constructor first, then
Jackson bean deserialization.
+ */
+ private Translator deserializeTranslator(String name, JsonNode configNode)
throws TikaConfigException {
- return ComponentInstantiator.instantiate(translatorClass, jsonConfig,
classLoader,
- "Translator", objectMapper);
+ return ComponentInstantiator.instantiate(name, configNode,
objectMapper, classLoader);
}
/**
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
new file mode 100644
index 0000000000..739ed9944b
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.serialization;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Utility class that resolves friendly component names to classes using
ComponentRegistry.
+ * <p>
+ * Supports friendly names like "pdf-parser" as well as fully qualified class
names.
+ * Registries must be registered via {@link #registerRegistry(String,
ComponentRegistry)}
+ * before use.
+ */
+public final class ComponentNameResolver {
+
+ private static final Map<String, ComponentRegistry> REGISTRIES = new
ConcurrentHashMap<>();
+
+ private ComponentNameResolver() {
+ // Utility class
+ }
+
+ /**
+ * Registers a ComponentRegistry for name resolution.
+ *
+ * @param indexName the index file name (e.g., "parsers", "detectors")
+ * @param registry the registry to register
+ */
+ public static void registerRegistry(String indexName, ComponentRegistry
registry) {
+ REGISTRIES.put(indexName, registry);
+ }
+
+ /**
+ * Resolves a friendly name or FQCN to a Class.
+ * Searches all registered component registries, falling back to
Class.forName without initializing the class.
+ *
+ * @param name friendly name or fully qualified class name
+ * @param classLoader the class loader to use for FQCN fallback
+ * @return the resolved class
+ * @throws ClassNotFoundException if not found in any registry and not a
valid FQCN
+ */
+ public static Class<?> resolveClass(String name, ClassLoader classLoader)
+ throws ClassNotFoundException {
+ for (ComponentRegistry registry : REGISTRIES.values()) {
+ if (registry.hasComponent(name)) {
+ try {
+ return registry.getComponentClass(name);
+ } catch (TikaConfigException e) {
+ // continue to next registry
+ }
+ }
+ }
+ return Class.forName(name, false, classLoader);
+ }
+
+ /**
+ * Gets the friendly name for a class, or null if not registered.
+ *
+ * @param clazz the class to look up
+ * @return the friendly name, or null if not found
+ */
+ public static String getFriendlyName(Class<?> clazz) {
+ for (ComponentRegistry registry : REGISTRIES.values()) {
+ String friendlyName = registry.getFriendlyName(clazz);
+ if (friendlyName != null) {
+ return friendlyName;
+ }
+ }
+ return null;
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java
index c8feb39919..507a182110 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java
@@ -23,7 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.loader.JsonMergeUtils;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.parser.ParseContext;
/**
@@ -52,7 +52,7 @@ import org.apache.tika.parser.ParseContext;
*/
public class ConfigDeserializer {
- private static final ObjectMapper MAPPER =
PolymorphicObjectMapperFactory.getMapper();
+ private static final ObjectMapper MAPPER =
TikaObjectMapperFactory.getMapper();
/**
* Retrieves and deserializes a parser configuration from the
ConfigContainer in ParseContext.
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
index 518bffd160..437bca9332 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
@@ -31,7 +31,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.config.ConfigContainer;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.SelfConfiguring;
+import org.apache.tika.config.loader.ComponentInfo;
+import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.parser.ParseContext;
/**
@@ -44,15 +48,33 @@ import org.apache.tika.parser.ParseContext;
* <pre>
* {
* "pdf-parser": {"extractActions": true},
- * "tika-task-timeout": {"timeoutMillis": 5000},
- * "org.apache.tika.metadata.filter.MetadataFilter": {"@class": "...", ...}
+ * "tika-task-timeout": {"timeoutMillis": 5000}
* }
* </pre>
*/
public class ParseContextDeserializer extends JsonDeserializer<ParseContext> {
private static final Logger LOG =
LoggerFactory.getLogger(ParseContextDeserializer.class);
- private static final ObjectMapper MAPPER =
PolymorphicObjectMapperFactory.getMapper();
+ private static final ObjectMapper MAPPER =
TikaObjectMapperFactory.getMapper();
+
+ // Lazily loaded registry for looking up friendly names
+ private static volatile ComponentRegistry registry;
+
+ private static ComponentRegistry getRegistry() {
+ if (registry == null) {
+ synchronized (ParseContextDeserializer.class) {
+ if (registry == null) {
+ try {
+ registry = new ComponentRegistry("other-configs",
+
ParseContextDeserializer.class.getClassLoader());
+ } catch (TikaConfigException e) {
+ LOG.warn("Failed to load component registry for
deserialization", e);
+ }
+ }
+ }
+ }
+ return registry;
+ }
@Override
public ParseContext deserialize(JsonParser jsonParser,
@@ -93,18 +115,77 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
String fieldName = it.next();
JsonNode fieldValue = contextNode.get(fieldName);
+ // Try to resolve fieldName - either as FQCN or friendly name from
registry
+ Class<?> keyClass = null;
+
// Check if fieldName is a full class name (for directly
serialized Tika types)
if (fieldName.startsWith("org.apache.tika.")) {
try {
- Class<?> keyClass = Class.forName(fieldName);
- // Deserialize using the key class as the target type
- Object value = MAPPER.treeToValue(fieldValue, keyClass);
- parseContext.set((Class) keyClass, value);
- continue;
+ keyClass = Class.forName(fieldName);
} catch (ClassNotFoundException e) {
- LOG.debug("Class not found for key '{}', storing in
ConfigContainer", fieldName);
- } catch (Exception e) {
- throw new IOException("Failed to deserialize '" +
fieldName + "': " + e.getMessage(), e);
+ LOG.debug("Class not found for key '{}', will check
registry", fieldName);
+ }
+ }
+
+ // If not found as FQCN, check registry for friendly name
+ boolean isSelfConfiguring = false;
+ Class<?> contextKey = null; // The key to use when adding to
ParseContext
+ if (keyClass == null) {
+ ComponentRegistry reg = getRegistry();
+ if (reg != null && reg.hasComponent(fieldName)) {
+ try {
+ ComponentInfo info = reg.getComponentInfo(fieldName);
+ keyClass = info.componentClass();
+ isSelfConfiguring = info.selfConfiguring();
+ contextKey = info.contextKey();
+ LOG.debug("Resolved friendly name '{}' to class {}
(selfConfiguring={}, contextKey={})",
+ fieldName, keyClass.getName(),
isSelfConfiguring,
+ contextKey != null ? contextKey.getName() :
"null");
+ } catch (TikaConfigException e) {
+ LOG.debug("Failed to get component info for '{}': {}",
fieldName, e.getMessage());
+ }
+ }
+ } else {
+ // For FQCN resolution, check SelfConfiguring directly
+ isSelfConfiguring =
SelfConfiguring.class.isAssignableFrom(keyClass);
+ }
+
+ // If we found a class, check if it's SelfConfiguring
+ if (keyClass != null) {
+ // SelfConfiguring components (Parsers, Detectors, etc.)
handle their own config
+ // at runtime - keep their config in ConfigContainer for later
access
+ if (isSelfConfiguring) {
+ LOG.debug("'{}' maps to SelfConfiguring class {}, keeping
in ConfigContainer",
+ fieldName, keyClass.getName());
+ // Fall through to ConfigContainer storage below
+ } else {
+ // Non-SelfConfiguring - deserialize directly into
ParseContext
+ try {
+ // A single-field object is treated as wrapper object format:
{"concrete-class": {props}}
+ Object value;
+ if (fieldValue.isObject() && fieldValue.size() == 1) {
+ String typeName = fieldValue.fieldNames().next();
+ JsonNode configNode = fieldValue.get(typeName);
+ // Try to resolve the concrete class
+ try {
+ Class<?> concreteClass =
ComponentNameResolver.resolveClass(typeName,
+
ParseContextDeserializer.class.getClassLoader());
+ value = MAPPER.treeToValue(configNode,
concreteClass);
+ } catch (ClassNotFoundException ex) {
+ // Fall back to key class
+ value = MAPPER.treeToValue(configNode,
keyClass);
+ }
+ } else {
+ // Not wrapper format, deserialize directly
+ value = MAPPER.treeToValue(fieldValue, keyClass);
+ }
+ // Use contextKey if specified, otherwise use the
component class
+ Class<?> parseContextKey = (contextKey != null) ?
contextKey : keyClass;
+ parseContext.set((Class) parseContextKey, value);
+ continue;
+ } catch (Exception e) {
+ throw new IOException("Failed to deserialize '" +
fieldName + "': " + e.getMessage(), e);
+ }
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
index a8df312dc6..bca2ef54a4 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
@@ -30,7 +30,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.loader.ComponentRegistry;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.parser.ParseContext;
@@ -57,7 +57,7 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
private static final Logger LOG =
LoggerFactory.getLogger(ParseContextSerializer.class);
public static final String PARSE_CONTEXT = "parseContext";
- private static final ObjectMapper MAPPER =
PolymorphicObjectMapperFactory.getMapper();
+ private static final ObjectMapper MAPPER =
TikaObjectMapperFactory.getMapper();
// Lazily loaded registry for looking up friendly names
private static volatile ComponentRegistry registry;
@@ -114,13 +114,13 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
// Try to get friendly name for this object's class
String friendlyName = (reg != null) ?
reg.getFriendlyName(value.getClass()) : null;
+ // Determine key: prefer friendly name, fall back to FQCN for Tika
types
String key;
if (friendlyName != null) {
- // Use friendly name if available
+ // Use friendly name if available (deserializer will resolve
via registry)
key = friendlyName;
} else if (entry.getKey().startsWith("org.apache.tika.")) {
- // For Tika types without friendly names (e.g., custom
MetadataFilter subclasses),
- // use the context key - polymorphic mapper will add @class
for the concrete type
+ // For Tika types without friendly names, use the context key
(FQCN)
key = entry.getKey();
} else {
// Skip non-Tika types without friendly names (e.g., String,
custom non-Tika classes)
@@ -129,19 +129,17 @@ public class ParseContextSerializer extends
JsonSerializer<ParseContext> {
if (!writtenKeys.contains(key)) {
jsonGenerator.writeFieldName(key);
- // If using the context key (not friendly name), we need to
serialize
- // with the base type to get polymorphic @class info for
custom subclasses
- if (friendlyName == null) {
- try {
- Class<?> contextKeyClass =
Class.forName(entry.getKey());
-
MAPPER.writerFor(contextKeyClass).writeValue(jsonGenerator, value);
- } catch (ClassNotFoundException e) {
- // Fallback to default serialization
- MAPPER.writeValue(jsonGenerator, value);
- }
- } else {
- MAPPER.writeValue(jsonGenerator, value);
+ // Write wrapper object format with type info for polymorphic
deserialization
+ // Format: {"concrete-class-name": {properties...}}
+ jsonGenerator.writeStartObject();
+ String typeName = (friendlyName != null) ? friendlyName :
+
ComponentNameResolver.getFriendlyName(value.getClass());
+ if (typeName == null) {
+ typeName = value.getClass().getName();
}
+ jsonGenerator.writeFieldName(typeName);
+ MAPPER.writeValue(jsonGenerator, value);
+ jsonGenerator.writeEndObject();
writtenKeys.add(key);
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index d06b63f58e..354e8d1781 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -28,7 +28,7 @@ import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.config.loader.ComponentRegistry;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
@@ -36,22 +36,14 @@ import org.apache.tika.parser.ParseContext;
/**
* Utility methods for working with ParseContext objects in JSON-based
configurations.
* <p>
- * Supports both legacy verbose format and new friendly-name format:
+ * Uses friendly-name format for configuration:
* <pre>
- * // Legacy format:
- * "parse-context": {
- * "objects": {
- * "org.apache.tika.config.TikaTaskTimeout": {
- * "@class": "org.apache.tika.config.TikaTaskTimeout",
- * "timeoutMillis": 30000
- * }
- * }
- * }
- *
- * // New friendly-name format:
* "parse-context": {
* "tika-task-timeout": {
* "timeoutMillis": 30000
+ * },
+ * "pdf-parser": {
+ * "extractInlineImages": true
* }
* }
* </pre>
@@ -62,7 +54,7 @@ import org.apache.tika.parser.ParseContext;
public class ParseContextUtils {
private static final Logger LOG =
LoggerFactory.getLogger(ParseContextUtils.class);
- private static final ObjectMapper MAPPER =
PolymorphicObjectMapperFactory.getMapper();
+ private static final ObjectMapper MAPPER =
TikaObjectMapperFactory.getMapper();
/**
* Known interfaces that should be used as ParseContext keys.
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java
new file mode 100644
index 0000000000..f6b1555dec
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.serialization;
+
+import java.io.IOException;
+import java.lang.reflect.Modifier;
+
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.databind.BeanDescription;
+import com.fasterxml.jackson.databind.DeserializationConfig;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonMappingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.deser.BeanDeserializerModifier;
+import com.fasterxml.jackson.databind.module.SimpleModule;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.loader.ComponentInstantiator;
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Jackson module that handles deserialization of abstract types using wrapper object format.
+ * <p>
+ * Applies automatically to any abstract type (interface or abstract class) outside the
+ * {@code java.} and {@code javax.} packages; no hardcoded type list is needed. Supports both:
+ * <ul>
+ *   <li>Wrapper format: {@code {"type-name": {"prop": "value"}}}</li>
+ *   <li>Legacy @class format: {@code {"@class": "fqcn", "prop": "value"}}</li>
+ * </ul>
+ * <p>
+ * Example:
+ * <pre>
+ * "digesterFactory": {
+ *   "commons-digester-factory": {
+ *     "markLimit": 100000
+ *   }
+ * }
+ * </pre>
+ */
+public final class TikaAbstractTypeMixins {
+
+    private static final Logger LOG = LoggerFactory.getLogger(TikaAbstractTypeMixins.class);
+
+    private TikaAbstractTypeMixins() {
+        // Utility class
+    }
+
+    /**
+     * Registers the abstract type handling module on the given ObjectMapper.
+     *
+     * @param mapper the ObjectMapper to configure
+     */
+    public static void registerDeserializers(ObjectMapper mapper) {
+        SimpleModule module = new SimpleModule("TikaAbstractTypes");
+        module.setDeserializerModifier(new AbstractTypeDeserializerModifier(mapper));
+        mapper.registerModule(module);
+    }
+
+    /**
+     * Modifier that intercepts deserialization of abstract types and applies
+     * wrapper object handling.
+     */
+    private static class AbstractTypeDeserializerModifier extends BeanDeserializerModifier {
+
+        private final ObjectMapper mapper;
+
+        AbstractTypeDeserializerModifier(ObjectMapper mapper) {
+            this.mapper = mapper;
+        }
+
+        @Override
+        public JsonDeserializer<?> modifyDeserializer(DeserializationConfig config,
+                                                      BeanDescription beanDesc,
+                                                      JsonDeserializer<?> deserializer) {
+            Class<?> beanClass = beanDesc.getBeanClass();
+
+            // Skip types that shouldn't use wrapper format
+            if (shouldSkip(beanClass)) {
+                return deserializer;
+            }
+
+            // Only handle abstract types (interfaces or abstract classes)
+            if (beanClass.isInterface() || Modifier.isAbstract(beanClass.getModifiers())) {
+                LOG.debug("Registering wrapper deserializer for abstract type: {}",
+                        beanClass.getName());
+                return new WrapperObjectDeserializer<>(beanClass, mapper);
+            }
+
+            return deserializer;
+        }
+
+        /**
+         * Returns true for types that must keep Jackson's default deserializer.
+         */
+        private boolean shouldSkip(Class<?> beanClass) {
+            // Primitives and arrays never use wrapper format; wrapper classes such as
+            // Integer are covered by the JDK package check.
+            if (beanClass.isPrimitive() || beanClass.isArray()) {
+                return true;
+            }
+            // Skip common JDK types
+            String name = beanClass.getName();
+            return name.startsWith("java.") || name.startsWith("javax.");
+        }
+    }
+
+    /**
+     * Deserializer that handles wrapper object format for abstract types.
+     */
+    private static class WrapperObjectDeserializer<T> extends JsonDeserializer<T> {
+
+        private final Class<?> abstractType;
+        private final ObjectMapper mapper;
+
+        WrapperObjectDeserializer(Class<?> abstractType, ObjectMapper mapper) {
+            this.abstractType = abstractType;
+            this.mapper = mapper;
+        }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public T deserialize(JsonParser p, DeserializationContext ctxt) throws IOException {
+            JsonNode node = p.readValueAsTree();
+
+            if (!node.isObject() || node.isEmpty()) {
+                // Let Jackson's default handling fail appropriately
+                return (T) ctxt.handleUnexpectedToken(abstractType, p);
+            }
+
+            // Check for legacy "@class" format
+            if (node.has("@class")) {
+                String typeName = node.get("@class").asText();
+                // Create config node without @class
+                com.fasterxml.jackson.databind.node.ObjectNode configObj =
+                        mapper.createObjectNode();
+                node.fields().forEachRemaining(entry -> {
+                    if (!"@class".equals(entry.getKey())) {
+                        configObj.set(entry.getKey(), entry.getValue());
+                    }
+                });
+                return instantiateType(typeName, configObj, ctxt);
+            }
+
+            // Check for wrapper format: single field whose value is an object
+            // e.g., {"commons-digester-factory": {"markLimit": 100000}}
+            if (node.size() == 1) {
+                String typeName = node.fieldNames().next();
+                JsonNode configNode = node.get(typeName);
+                // Only treat as wrapper if the value is an object (not primitive/array)
+                if (configNode.isObject()) {
+                    return instantiateType(typeName, configNode, ctxt);
+                }
+            }
+
+            // Not wrapper format - this is likely an error (can't instantiate abstract type)
+            // Throw JsonMappingException so ConfigLoader wraps it in TikaConfigException
+            throw JsonMappingException.from(p,
+                    "Cannot deserialize abstract type " + abstractType.getSimpleName() +
+                    ". Use wrapper format: {\"concrete-type-name\": {...}} or " +
+                    "legacy format: {\"@class\": \"fully.qualified.ClassName\", ...}");
+        }
+
+        /**
+         * Resolves {@code typeName} to a class and instantiates it from {@code configNode}.
+         *
+         * @throws JsonMappingException if the type cannot be resolved or instantiated; the
+         *         underlying exception is attached as the cause
+         */
+        private T instantiateType(String typeName, JsonNode configNode,
+                                  DeserializationContext ctxt) throws IOException {
+            try {
+                Class<?> concreteClass = ComponentNameResolver.resolveClass(typeName,
+                        TikaAbstractTypeMixins.class.getClassLoader());
+                return ComponentInstantiator.instantiate(concreteClass, configNode, mapper);
+            } catch (ClassNotFoundException e) {
+                throw JsonMappingException.from(ctxt.getParser(),
+                        "Unknown type '" + typeName + "' for " + abstractType.getSimpleName(), e);
+            } catch (TikaConfigException e) {
+                throw JsonMappingException.from(ctxt.getParser(),
+                        "Failed to instantiate " + typeName + ": " + e.getMessage(), e);
+            }
+        }
+    }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
index 96b58dac06..80063b151d 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
@@ -252,22 +252,22 @@ public class ConfigLoaderTest {
}
@Test
- public void testLoadInterfaceWithAtClassAndProperties() throws Exception {
- // JSON: "configured-handler": { "@class": "...", "maxSize": 100000,
... }
- TestHandler handler = configLoader.load("configured-handler",
TestHandler.class);
-
- assertNotNull(handler);
- assertTrue(handler instanceof ConfiguredHandlerImpl);
- assertEquals("configured", handler.getName());
-
- ConfiguredHandlerImpl impl = (ConfiguredHandlerImpl) handler;
+ public void testLoadConcreteClassWithProperties() throws Exception {
+ // JSON: "configured-handler-impl": { "maxSize": 100000, ... }
+ // Load directly as concrete class (kebab-case matches class name)
+ ConfiguredHandlerImpl impl =
configLoader.load("configured-handler-impl",
+ ConfiguredHandlerImpl.class);
+
+ assertNotNull(impl);
+ assertEquals("configured", impl.getName());
assertEquals(100000, impl.getMaxSize());
assertEquals("test-", impl.getPrefix());
}
@Test
- public void testLoadInterfaceWithoutTypeInfoFails() throws Exception {
- // Create a minimal config with just properties, no @class
+ public void testLoadInterfaceWithoutClassNameFails() throws Exception {
+ // Loading an interface with properties (not a class name string)
should fail
+ // because Jackson can't instantiate interfaces directly
Path configPath = Paths.get(
getClass().getResource("/configs/test-interface-no-type.json").toURI());
TikaLoader loader = TikaLoader.load(configPath);
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
index 2515b3b29a..bd530a6395 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
@@ -29,7 +29,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.junit.jupiter.api.Test;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
@@ -104,7 +104,7 @@ public class CustomClassSerializationTest {
}
private ObjectMapper createMapper() {
- ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper();
+ ObjectMapper mapper = TikaObjectMapperFactory.getMapper();
SimpleModule module = new SimpleModule();
module.addDeserializer(ParseContext.class, new
ParseContextDeserializer());
module.addSerializer(ParseContext.class, new ParseContextSerializer());
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index fdd1ecbff5..0c9c457b52 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -31,7 +31,10 @@ import com.fasterxml.jackson.databind.module.SimpleModule;
import org.junit.jupiter.api.Test;
import org.apache.tika.config.ConfigContainer;
-import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.TikaTaskTimeout;
+import org.apache.tika.config.loader.TikaObjectMapperFactory;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.extractor.SkipEmbeddedDocumentSelector;
import org.apache.tika.parser.ParseContext;
/**
@@ -44,7 +47,7 @@ public class TestParseContextSerialization {
private ObjectMapper createMapper() {
// Start with the properly configured mapper that has polymorphic type
handling
- ObjectMapper mapper = PolymorphicObjectMapperFactory.getMapper();
+ ObjectMapper mapper = TikaObjectMapperFactory.getMapper();
// Register our custom serializer/deserializer on top
SimpleModule module = new SimpleModule();
@@ -144,12 +147,12 @@ public class TestParseContextSerialization {
.get("timeoutMillis")
.asInt());
- // Verify round-trip
+ // Verify round-trip - TikaTaskTimeout is NOT SelfConfiguring,
+ // so it gets resolved directly into ParseContext (not ConfigContainer)
ParseContext deserialized = mapper.readValue(json, ParseContext.class);
- ConfigContainer deserializedConfig =
deserialized.get(ConfigContainer.class);
- assertTrue(deserializedConfig
- .get("tika-task-timeout")
- .isPresent());
+ TikaTaskTimeout timeout = deserialized.get(TikaTaskTimeout.class);
+ assertNotNull(timeout, "TikaTaskTimeout should be resolved directly
into ParseContext");
+ assertEquals(30000, timeout.getTimeoutMillis());
}
@Test
@@ -259,9 +262,22 @@ public class TestParseContextSerialization {
assertTrue(root.has("my-custom-config"));
// Verify round-trip
+ // After deserialization:
+ // - pdf-parser, html-parser → Parsers are SelfConfiguring → stay in
ConfigContainer
+ // - my-custom-config → not in registry → stays in ConfigContainer
+ // - tika-task-timeout → TikaTaskTimeout is NOT SelfConfiguring →
resolved directly
ParseContext deserialized = mapper.readValue(json, ParseContext.class);
ConfigContainer deserializedConfig =
deserialized.get(ConfigContainer.class);
- assertEquals(4, deserializedConfig.getKeys().size());
+ assertEquals(3, deserializedConfig.getKeys().size(),
+ "Should have 3 configs in ConfigContainer (SelfConfiguring +
unknown)");
+ assertTrue(deserializedConfig.get("pdf-parser").isPresent());
+ assertTrue(deserializedConfig.get("html-parser").isPresent());
+ assertTrue(deserializedConfig.get("my-custom-config").isPresent());
+
+ // TikaTaskTimeout should be resolved directly into ParseContext
+ TikaTaskTimeout timeout = deserialized.get(TikaTaskTimeout.class);
+ assertNotNull(timeout, "TikaTaskTimeout should be resolved directly");
+ assertEquals(5000, timeout.getTimeoutMillis());
}
@Test
@@ -279,4 +295,29 @@ public class TestParseContextSerialization {
JsonNode root = mapper.readTree(json);
assertEquals(0, root.size(), "Objects without friendly names should
not be serialized");
}
+
+ @Test
+ public void testContextKeyDeserialization() throws Exception {
+ // Verifies that a component declared with @TikaComponent(contextKey=...) is
+ // stored in ParseContext under its contextKey rather than its own class.
+ // SkipEmbeddedDocumentSelector has contextKey=DocumentSelector.class
+ String json = """
+ {
+ "skip-embedded-document-selector": {}
+ }
+ """;
+
+ ObjectMapper mapper = createMapper();
+ ParseContext deserialized = mapper.readValue(json, ParseContext.class);
+
+ // Look up by DocumentSelector.class (the contextKey), not the concrete class
+ DocumentSelector selector = deserialized.get(DocumentSelector.class);
+ assertNotNull(selector, "DocumentSelector should be found via
contextKey");
+ assertTrue(selector instanceof SkipEmbeddedDocumentSelector,
+ "Should be SkipEmbeddedDocumentSelector instance");
+
+ // select() is expected to return false, i.e. embedded documents are skipped
+ assertFalse(selector.select(new org.apache.tika.metadata.Metadata()),
+ "SkipEmbeddedDocumentSelector should return false for all
documents");
+ }
}
diff --git
a/tika-serialization/src/test/resources/configs/test-config-loader.json
b/tika-serialization/src/test/resources/configs/test-config-loader.json
index cb6264c919..5305f2a43a 100644
--- a/tika-serialization/src/test/resources/configs/test-config-loader.json
+++ b/tika-serialization/src/test/resources/configs/test-config-loader.json
@@ -12,8 +12,7 @@
"simple-handler":
"org.apache.tika.config.loader.ConfigLoaderTest$SimpleHandlerImpl",
- "configured-handler": {
- "@class":
"org.apache.tika.config.loader.ConfigLoaderTest$ConfiguredHandlerImpl",
+ "configured-handler-impl": {
"maxSize": 100000,
"prefix": "test-"
},
diff --git
a/tika-serialization/src/test/resources/configs/test-translator-config.json
b/tika-serialization/src/test/resources/configs/test-translator-config.json
index 4e4b88fcc8..73ad08c224 100644
--- a/tika-serialization/src/test/resources/configs/test-translator-config.json
+++ b/tika-serialization/src/test/resources/configs/test-translator-config.json
@@ -1,5 +1,5 @@
{
"translator": {
- "class": "empty-translator"
+ "empty-translator": {}
}
}