This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new db2dfe820 TIKA-4544 -- this adds a deserializer for common tika
components like parsers etc. (#2403)
db2dfe820 is described below
commit db2dfe8203b03b9d71fb3864d9e94f40cdb1087d
Author: Tim Allison <[email protected]>
AuthorDate: Mon Nov 24 15:36:09 2025 -0500
TIKA-4544 -- this adds a deserializer for common tika components like
parsers etc. (#2403)
* TIKA-4544 -- first steps to adding a json loader for TikaConfig
---
pom.xml | 1 +
tika-annotation-processor/pom.xml | 72 +++++
.../apache/tika/annotation/KebabCaseConverter.java | 67 +++++
.../tika/annotation/TikaComponentProcessor.java | 280 ++++++++++++++++++
.../services/javax.annotation.processing.Processor | 1 +
.../tika/annotation/KebabCaseConverterTest.java | 81 ++++++
.../java/org/apache/tika/config/TikaComponent.java | 81 ++++++
.../tika/config/loader/ComponentInstance.java | 67 +++++
.../tika/config/loader/ComponentRegistry.java | 163 +++++++++++
.../config/loader/CompositeComponentLoader.java | 234 +++++++++++++++
.../apache/tika/config/loader/FrameworkConfig.java | 157 ++++++++++
.../apache/tika/config/loader/ParserLoader.java | 297 +++++++++++++++++++
.../apache/tika/config/loader/TikaJsonConfig.java | 239 ++++++++++++++++
.../org/apache/tika/config/loader/TikaLoader.java | 251 ++++++++++++++++
.../tika/config/loader/ComponentRegistryTest.java | 108 +++++++
.../tika/config/loader/ConfigurableTestParser.java | 125 ++++++++
.../tika/config/loader/FallbackTestParser.java | 108 +++++++
.../tika/config/loader/FrameworkConfigTest.java | 125 ++++++++
.../tika/config/loader/MinimalTestParser.java | 59 ++++
.../apache/tika/config/loader/OptInTestParser.java | 53 ++++
.../apache/tika/config/loader/TikaLoaderTest.java | 318 +++++++++++++++++++++
.../tika/serialization/JsonMetadataListTest.java | 2 +-
.../tika/serialization/JsonMetadataTest.java | 2 +-
.../services/org.apache.tika.parser.Parser | 4 +
.../src/test/resources/META-INF/tika/parsers.idx | 5 +
.../resources/configs/example-tika-config.json | 56 ++++
.../resources/configs/test-decoration-config.json | 14 +
.../test-default-parser-with-exclusions.json | 16 ++
.../test/resources/configs/test-loader-config.json | 25 ++
.../configs/test-no-duplicate-parsers.json | 14 +
.../resources/configs/test-no-spi-fallback.json | 11 +
.../configs/test-opt-in-parser-explicit.json | 7 +
.../configs/test-opt-in-parser-with-default.json | 7 +
.../configs/test-with-default-parser.json | 14 +
.../{config => configs}/tika-config-json.xml | 0
35 files changed, 3062 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 53eabbc4d..b1d2a0ae7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,6 +38,7 @@
<module>tika-parent</module>
<module>tika-bom</module>
<module>tika-core</module>
+ <module>tika-annotation-processor</module>
<module>tika-serialization</module>
<module>tika-plugins-core</module>
<module>tika-detectors</module>
diff --git a/tika-annotation-processor/pom.xml
b/tika-annotation-processor/pom.xml
new file mode 100644
index 000000000..9c93eb9ac
--- /dev/null
+++ b/tika-annotation-processor/pom.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>4.0.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-annotation-processor</artifactId>
+ <name>Apache Tika Annotation Processor</name>
+ <description>
+ Compile-time annotation processor for @TikaComponent that generates
+ SPI files and component registries.
+ </description>
+ <url>https://tika.apache.org</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <!-- Disable annotation processing in this module to avoid infinite
loop -->
+ <compilerArgument>-proc:none</compilerArgument>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+
<Automatic-Module-Name>org.apache.tika.annotation.processor</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java
new file mode 100644
index 000000000..bc7fc0dc9
--- /dev/null
+++
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.annotation;
+
+import java.util.Locale;
+
/**
 * Converts simple Java class names into kebab-case component names,
 * which are used when a {@code @TikaComponent} does not declare an
 * explicit name.
 *
 * <p>Examples:
 * <ul>
 *     <li>PDFParser &rarr; pdf-parser</li>
 *     <li>OCRParser &rarr; ocr-parser</li>
 *     <li>HTMLParser &rarr; html-parser</li>
 *     <li>DefaultParser &rarr; default-parser</li>
 *     <li>TesseractOCRParser &rarr; tesseract-ocr-parser</li>
 * </ul>
 */
public class KebabCaseConverter {

    private KebabCaseConverter() {
        // static utility class -- never instantiated
    }

    /**
     * Converts a Java class name to kebab-case.
     *
     * @param className the simple class name (without package)
     * @return the kebab-case version of the name; {@code null} or the
     *         empty string are returned unchanged
     */
    public static String toKebabCase(String className) {
        if (className == null || className.isEmpty()) {
            return className;
        }

        // Apply the word-boundary rules one at a time, then lower-case
        // the whole thing with a locale-independent mapping.
        String kebab = className;
        // "aB" -> "a-B": split between a lowercase letter and the uppercase that follows
        kebab = kebab.replaceAll("([a-z])([A-Z])", "$1-$2");
        // "HTMLParser" -> "HTML-Parser": split a trailing acronym from the next word
        kebab = kebab.replaceAll("([A-Z]+)([A-Z][a-z])", "$1-$2");
        // "PDF2" -> "PDF-2": split between a letter and a following digit
        kebab = kebab.replaceAll("([a-zA-Z])(\\d)", "$1-$2");
        // "2Text" -> "2-Text": split between a digit and a following letter
        kebab = kebab.replaceAll("(\\d)([a-zA-Z])", "$1-$2");

        return kebab.toLowerCase(Locale.ROOT);
    }
}
diff --git
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
new file mode 100644
index 000000000..3a1800679
--- /dev/null
+++
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.annotation;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import javax.annotation.processing.AbstractProcessor;
+import javax.annotation.processing.Filer;
+import javax.annotation.processing.Messager;
+import javax.annotation.processing.ProcessingEnvironment;
+import javax.annotation.processing.RoundEnvironment;
+import javax.annotation.processing.SupportedAnnotationTypes;
+import javax.annotation.processing.SupportedSourceVersion;
+import javax.lang.model.SourceVersion;
+import javax.lang.model.element.Element;
+import javax.lang.model.element.TypeElement;
+import javax.lang.model.type.DeclaredType;
+import javax.lang.model.type.TypeMirror;
+import javax.tools.Diagnostic;
+import javax.tools.FileObject;
+import javax.tools.StandardLocation;
+
+import org.apache.tika.config.TikaComponent;
+
/**
 * Annotation processor for {@link TikaComponent} that generates:
 * <ul>
 *     <li>Standard Java SPI files (META-INF/services/*) for ServiceLoader</li>
 *     <li>Component index files (META-INF/tika/*.idx) for name-based lookup</li>
 * </ul>
 *
 * <p>The processor maintains an inclusion list of known Tika service interfaces
 * to avoid generating SPI files for utility interfaces like Serializable, Initializable, etc.
 *
 * <p>Components are accumulated in instance fields across processing rounds
 * and all output files are written exactly once, in the final round
 * (when {@code roundEnv.processingOver()} is true).
 */
@SupportedAnnotationTypes("org.apache.tika.config.TikaComponent")
// NOTE(review): declares Java 11 as the supported source level; javac will warn
// when compiling newer source -- confirm this matches the project's language floor.
@SupportedSourceVersion(SourceVersion.RELEASE_11)
public class TikaComponentProcessor extends AbstractProcessor {

    /**
     * Known Tika service interfaces for SPI generation.
     * Only classes implementing these interfaces will have SPI files generated.
     * Maps the interface's fully qualified name to the index file base name
     * (e.g. "parsers" becomes META-INF/tika/parsers.idx).
     */
    private static final Map<String, String> SERVICE_INTERFACES = new LinkedHashMap<>();

    static {
        // Map interface fully qualified name -> index file name
        SERVICE_INTERFACES.put("org.apache.tika.parser.Parser", "parsers");
        SERVICE_INTERFACES.put("org.apache.tika.detect.Detector", "detectors");
        SERVICE_INTERFACES.put("org.apache.tika.detect.EncodingDetector", "encoding-detectors");
        SERVICE_INTERFACES.put("org.apache.tika.language.translate.Translator", "translators");
        SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers");
        SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters");
    }

    // Set in init(); used for diagnostics and for creating output resources.
    private Messager messager;
    private Filer filer;

    // Accumulate components across rounds
    // Map: service interface name -> set of implementing class names
    private final Map<String, Set<String>> spiServices = new HashMap<>();

    // Map: index file name -> map of (component name -> class name)
    private final Map<String, Map<String, String>> indexFiles = new HashMap<>();

    /**
     * Caches the Messager and Filer from the processing environment.
     */
    @Override
    public synchronized void init(ProcessingEnvironment processingEnv) {
        super.init(processingEnv);
        this.messager = processingEnv.getMessager();
        this.filer = processingEnv.getFiler();
    }

    /**
     * Records every {@code @TikaComponent}-annotated type seen in this round;
     * on the final round, writes the accumulated SPI and index files.
     *
     * @return true -- this processor claims the annotation in all rounds
     */
    @Override
    public boolean process(Set<? extends TypeElement> annotations, RoundEnvironment roundEnv) {
        if (roundEnv.processingOver()) {
            // Final round - write accumulated data
            writeServiceFiles();
            writeIndexFiles();
            return true;
        }

        for (Element element : roundEnv.getElementsAnnotatedWith(TikaComponent.class)) {
            if (element instanceof TypeElement) {
                processComponent((TypeElement) element);
            }
        }

        return true;
    }

    /**
     * Registers a single annotated type: resolves its component name
     * (explicit or kebab-cased from the simple class name), finds which known
     * service interfaces it implements, and records it for SPI and/or index
     * file generation. Emits an ERROR diagnostic if two different classes
     * claim the same component name.
     */
    private void processComponent(TypeElement element) {
        String className = element.getQualifiedName().toString();
        TikaComponent annotation = element.getAnnotation(TikaComponent.class);

        // Determine component name
        String componentName = annotation.name();
        if (componentName == null || componentName.isEmpty()) {
            // Auto-generate from class name
            String simpleName = element.getSimpleName().toString();
            componentName = KebabCaseConverter.toKebabCase(simpleName);
        }

        // Check if component should be included in SPI
        boolean includeSpi = annotation.spi();

        messager.printMessage(Diagnostic.Kind.NOTE,
                "Processing @TikaComponent: " + className + " -> " + componentName +
                        " (SPI: " + includeSpi + ")");

        // Find all implemented service interfaces
        List<String> serviceInterfaces = findServiceInterfaces(element);

        if (serviceInterfaces.isEmpty()) {
            messager.printMessage(Diagnostic.Kind.WARNING,
                    "Class " + className + " annotated with @TikaComponent " +
                            "but does not implement any known Tika service interface", element);
            return;
        }

        // Process each service interface
        for (String serviceInterface : serviceInterfaces) {
            // Add to SPI services only if spi = true
            if (includeSpi) {
                spiServices.computeIfAbsent(serviceInterface, k -> new LinkedHashSet<>())
                        .add(className);
            }

            // Always add to index files (regardless of SPI setting)
            String indexFileName = SERVICE_INTERFACES.get(serviceInterface);
            if (indexFileName != null) {
                Map<String, String> index = indexFiles.computeIfAbsent(indexFileName,
                        k -> new LinkedHashMap<>());

                // Check for duplicate names
                if (index.containsKey(componentName)) {
                    String existingClass = index.get(componentName);
                    if (!existingClass.equals(className)) {
                        messager.printMessage(Diagnostic.Kind.ERROR,
                                "Duplicate component name '" + componentName + "' for classes: " +
                                        existingClass + " and " + className, element);
                    }
                } else {
                    index.put(componentName, className);
                }
            }
        }
    }

    /**
     * Finds all Tika service interfaces implemented by the given type element.
     */
    private List<String> findServiceInterfaces(TypeElement element) {
        List<String> result = new ArrayList<>();
        Set<String> visited = new LinkedHashSet<>();
        findServiceInterfacesRecursive(element.asType(), result, visited);
        return result;
    }

    /**
     * Recursively searches for service interfaces in the type hierarchy
     * (superclasses and all implemented interfaces), guarding against
     * revisiting the same type via the {@code visited} set.
     */
    private void findServiceInterfacesRecursive(TypeMirror type, List<String> result,
                                                Set<String> visited) {
        // Also filters out NoType/primitive mirrors (e.g. Object's superclass).
        if (type == null || !(type instanceof DeclaredType)) {
            return;
        }

        DeclaredType declaredType = (DeclaredType) type;
        TypeElement typeElement = (TypeElement) declaredType.asElement();
        String typeName = typeElement.getQualifiedName().toString();

        // Avoid infinite loops
        if (!visited.add(typeName)) {
            return;
        }

        // Check if this is a service interface
        if (SERVICE_INTERFACES.containsKey(typeName)) {
            if (!result.contains(typeName)) {
                result.add(typeName);
            }
        }

        // Check superclass
        TypeMirror superclass = typeElement.getSuperclass();
        findServiceInterfacesRecursive(superclass, result, visited);

        // Check interfaces
        for (TypeMirror interfaceType : typeElement.getInterfaces()) {
            findServiceInterfacesRecursive(interfaceType, result, visited);
        }
    }

    /**
     * Writes META-INF/services files for Java SPI.
     * Failures are reported as ERROR diagnostics rather than thrown,
     * per annotation-processing convention.
     */
    private void writeServiceFiles() {
        for (Map.Entry<String, Set<String>> entry : spiServices.entrySet()) {
            String serviceInterface = entry.getKey();
            Set<String> implementations = entry.getValue();

            try {
                FileObject file = filer.createResource(StandardLocation.CLASS_OUTPUT, "",
                        "META-INF/services/" + serviceInterface);

                try (Writer writer = file.openWriter()) {
                    writer.write("# Generated by TikaComponentProcessor\n");
                    writer.write("# Do not edit manually\n");
                    for (String impl : implementations) {
                        writer.write(impl);
                        writer.write("\n");
                    }
                }

                messager.printMessage(Diagnostic.Kind.NOTE,
                        "Generated SPI file: META-INF/services/" + serviceInterface +
                                " with " + implementations.size() + " implementations");

            } catch (IOException e) {
                messager.printMessage(Diagnostic.Kind.ERROR,
                        "Failed to write SPI file for " + serviceInterface + ": " + e.getMessage());
            }
        }
    }

    /**
     * Writes META-INF/tika/*.idx files for name-based component lookup.
     * Each entry is one "component-name=fully.qualified.ClassName" line.
     */
    private void writeIndexFiles() {
        for (Map.Entry<String, Map<String, String>> entry : indexFiles.entrySet()) {
            String fileName = entry.getKey();
            Map<String, String> components = entry.getValue();

            try {
                FileObject file = filer.createResource(StandardLocation.CLASS_OUTPUT, "",
                        "META-INF/tika/" + fileName + ".idx");

                try (Writer writer = file.openWriter()) {
                    writer.write("# Generated by TikaComponentProcessor\n");
                    writer.write("# Do not edit manually\n");
                    writer.write("# Format: component-name=fully.qualified.ClassName\n");
                    for (Map.Entry<String, String> component : components.entrySet()) {
                        writer.write(component.getKey());
                        writer.write("=");
                        writer.write(component.getValue());
                        writer.write("\n");
                    }
                }

                messager.printMessage(Diagnostic.Kind.NOTE,
                        "Generated index file: META-INF/tika/" + fileName + ".idx" +
                                " with " + components.size() + " components");

            } catch (IOException e) {
                messager.printMessage(Diagnostic.Kind.ERROR,
                        "Failed to write index file " + fileName + ": " + e.getMessage());
            }
        }
    }
}
diff --git
a/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor
b/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor
new file mode 100644
index 000000000..462f29223
--- /dev/null
+++
b/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor
@@ -0,0 +1 @@
+org.apache.tika.annotation.TikaComponentProcessor
diff --git
a/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java
b/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java
new file mode 100644
index 000000000..c742e020f
--- /dev/null
+++
b/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.annotation;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for KebabCaseConverter.
+ */
+public class KebabCaseConverterTest {
+
+ @Test
+ public void testSimpleClassName() {
+ assertEquals("parser", KebabCaseConverter.toKebabCase("Parser"));
+ assertEquals("detector", KebabCaseConverter.toKebabCase("Detector"));
+ }
+
+ @Test
+ public void testTwoWordClassName() {
+ assertEquals("pdf-parser",
KebabCaseConverter.toKebabCase("PDFParser"));
+ assertEquals("html-parser",
KebabCaseConverter.toKebabCase("HTMLParser"));
+ assertEquals("ocr-parser",
KebabCaseConverter.toKebabCase("OCRParser"));
+ }
+
+ @Test
+ public void testMixedCase() {
+ assertEquals("default-parser",
KebabCaseConverter.toKebabCase("DefaultParser"));
+ assertEquals("composite-detector",
KebabCaseConverter.toKebabCase("CompositeDetector"));
+ }
+
+ @Test
+ public void testAcronymsFollowedByWord() {
+ assertEquals("html-parser",
KebabCaseConverter.toKebabCase("HTMLParser"));
+ assertEquals("xml-parser",
KebabCaseConverter.toKebabCase("XMLParser"));
+ assertEquals("tesseract-ocr-parser",
KebabCaseConverter.toKebabCase("TesseractOCRParser"));
+ }
+
+ @Test
+ public void testNumbersInName() {
+ assertEquals("pdf-2-text-parser",
KebabCaseConverter.toKebabCase("PDF2TextParser"));
+ assertEquals("mp-3-parser",
KebabCaseConverter.toKebabCase("MP3Parser"));
+ }
+
+ @Test
+ public void testEdgeCases() {
+ assertNull(KebabCaseConverter.toKebabCase(null));
+ assertEquals("", KebabCaseConverter.toKebabCase(""));
+ assertEquals("a", KebabCaseConverter.toKebabCase("A"));
+ assertEquals("ab", KebabCaseConverter.toKebabCase("AB"));
+ }
+
+ @Test
+ public void testAlreadyLowerCase() {
+ assertEquals("parser", KebabCaseConverter.toKebabCase("parser"));
+ }
+
+ @Test
+ public void testComplexNames() {
+ assertEquals("microsoft-office-parser",
+ KebabCaseConverter.toKebabCase("MicrosoftOfficeParser"));
+ assertEquals("zip-container-detector",
+ KebabCaseConverter.toKebabCase("ZipContainerDetector"));
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaComponent.java
b/tika-core/src/main/java/org/apache/tika/config/TikaComponent.java
new file mode 100644
index 000000000..8696ab2db
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaComponent.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
/**
 * Annotation for Tika components (parsers, detectors, etc.) that enables:
 * <ul>
 *     <li>Automatic SPI file generation (META-INF/services/...)</li>
 *     <li>Name-based component registry for JSON configuration</li>
 * </ul>
 *
 * <p>The annotation processor generates:
 * <ul>
 *     <li>Standard Java SPI files for ServiceLoader</li>
 *     <li>Component index files (META-INF/tika/{type}.idx) for name-based lookup</li>
 * </ul>
 *
 * <p>Retained at runtime so that tooling can also inspect components
 * reflectively, not only at compile time.
 *
 * <p>Example usage:
 * <pre>
 * {@code @TikaComponent}
 * public class PDFParser extends AbstractParser {
 *     // auto-generates name "pdf-parser", included in SPI
 * }
 *
 * {@code @TikaComponent(name = "tesseract-ocr")}
 * public class TesseractOCRParser extends AbstractParser {
 *     // explicit name override, included in SPI
 * }
 *
 * {@code @TikaComponent(spi = false)}
 * public class DWGReadParser extends AbstractParser {
 *     // available by name, but NOT auto-loaded by default-parser
 * }
 * </pre>
 *
 * <p>NOTE(review): the parent pom is at 4.0.0-SNAPSHOT -- confirm that the
 * {@code @since} version below is the intended first release of this API.
 *
 * @since 3.1.0
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface TikaComponent {

    /**
     * The component name used in JSON configuration. If empty, the name is
     * automatically generated from the class name using kebab-case conversion
     * (e.g., PDFParser becomes "pdf-parser").
     *
     * @return the component name, or empty string for auto-generation
     */
    String name() default "";

    /**
     * Whether this component should be included in SPI files for automatic
     * discovery via ServiceLoader. When false, the component is only available
     * via explicit configuration (not loaded by "default-parser").
     *
     * <p>Use {@code spi = false} for opt-in components that users must explicitly
     * enable in their configuration.
     *
     * @return true to include in SPI (default), false to require explicit config
     */
    boolean spi() default true;
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java
new file mode 100644
index 000000000..85594c4ad
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+/**
+ * Holds a component instance along with its metadata (priority, etc.).
+ *
+ * @param <T> the component type
+ */
+public class ComponentInstance<T> implements Comparable<ComponentInstance<T>> {
+
+ private final String name;
+ private final T instance;
+ private final int priority;
+
+ public ComponentInstance(String name, T instance, int priority) {
+ this.name = name;
+ this.instance = instance;
+ this.priority = priority;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public T getInstance() {
+ return instance;
+ }
+
+ public int getPriority() {
+ return priority;
+ }
+
+ @Override
+ public int compareTo(ComponentInstance<T> other) {
+ // Lower priority value = higher priority (processed first)
+ int priorityCompare = Integer.compare(this.priority, other.priority);
+ if (priorityCompare != 0) {
+ return priorityCompare;
+ }
+ // Secondary sort by name for stability
+ return this.name.compareTo(other.name);
+ }
+
+ @Override
+ public String toString() {
+ return "ComponentInstance{" +
+ "name='" + name + '\'' +
+ ", instance=" + instance.getClass().getSimpleName() +
+ ", priority=" + priority +
+ '}';
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
new file mode 100644
index 000000000..23c343edf
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Registry for looking up Tika component classes by name.
+ * Loads component name-to-class mappings from META-INF/tika/*.idx files
+ * generated by the {@code @TikaComponent} annotation processor.
+ */
+public class ComponentRegistry {
+
+ private final Map<String, Class<?>> components;
+ private final ClassLoader classLoader;
+
+ /**
+ * Creates a component registry by loading the specified index file.
+ *
+ * @param indexFileName the index file name (e.g., "parsers", "detectors")
+ * without the .idx extension
+ * @param classLoader the class loader to use for loading classes
+ * @throws TikaConfigException if the index file cannot be loaded
+ */
+ public ComponentRegistry(String indexFileName, ClassLoader classLoader)
+ throws TikaConfigException {
+ this.classLoader = classLoader;
+ this.components = loadComponents(indexFileName);
+ }
+
+ /**
+ * Looks up a component class by name.
+ *
+ * @param name the component name (e.g., "pdf-parser")
+ * @return the component class
+ * @throws TikaConfigException if the component name is not found
+ */
+ public Class<?> getComponentClass(String name) throws TikaConfigException {
+ Class<?> clazz = components.get(name);
+ if (clazz == null) {
+ throw new TikaConfigException("Unknown component name: '" + name +
"'. " +
+ "Available components: " + components.keySet());
+ }
+ return clazz;
+ }
+
+ /**
+ * Returns all registered component names.
+ *
+ * @return unmodifiable map of component names to classes
+ */
+ public Map<String, Class<?>> getAllComponents() {
+ return Collections.unmodifiableMap(components);
+ }
+
+ /**
+ * Checks if a component with the given name is registered.
+ *
+ * @param name the component name
+ * @return true if the component is registered
+ */
+ public boolean hasComponent(String name) {
+ return components.containsKey(name);
+ }
+
+ private Map<String, Class<?>> loadComponents(String indexFileName)
+ throws TikaConfigException {
+ Map<String, Class<?>> result = new LinkedHashMap<>();
+ String resourcePath = "META-INF/tika/" + indexFileName + ".idx";
+
+ try {
+ Enumeration<URL> resources =
classLoader.getResources(resourcePath);
+
+ if (!resources.hasMoreElements()) {
+ throw new TikaConfigException("Component index file not found:
" + resourcePath);
+ }
+
+ while (resources.hasMoreElements()) {
+ URL url = resources.nextElement();
+ loadFromUrl(url, result);
+ }
+
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to load component index: " +
resourcePath, e);
+ }
+
+ return result;
+ }
+
+ private void loadFromUrl(URL url, Map<String, Class<?>> result) throws
TikaConfigException {
+ try (InputStream in = url.openStream();
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(in, StandardCharsets.UTF_8))) {
+
+ String line;
+ int lineNumber = 0;
+
+ while ((line = reader.readLine()) != null) {
+ lineNumber++;
+ line = line.trim();
+
+ // Skip comments and empty lines
+ if (line.isEmpty() || line.startsWith("#")) {
+ continue;
+ }
+
+ // Parse: component-name=fully.qualified.ClassName
+ int equalsIndex = line.indexOf('=');
+ if (equalsIndex == -1) {
+ throw new TikaConfigException(
+ "Invalid index file format at " + url + " line " +
lineNumber +
+ ": expected 'name=class', got: " + line);
+ }
+
+ String name = line.substring(0, equalsIndex).trim();
+ String className = line.substring(equalsIndex + 1).trim();
+
+ if (name.isEmpty() || className.isEmpty()) {
+ throw new TikaConfigException(
+ "Invalid index file format at " + url + " line " +
lineNumber +
+ ": name or class is empty");
+ }
+
+ // Load the class
+ try {
+ Class<?> clazz = classLoader.loadClass(className);
+ result.put(name, clazz);
+ } catch (ClassNotFoundException e) {
+ throw new TikaConfigException(
+ "Component class not found: " + className + "
(from " + url + ")", e);
+ }
+ }
+
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to read component index
from: " + url, e);
+ }
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
new file mode 100644
index 000000000..69174ed66
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.ServiceConfigurationError;
import java.util.ServiceLoader;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.utils.ServiceLoaderUtils;
+
+/**
+ * Generic loader for Tika components (detectors, encoding detectors, filters,
etc.).
+ * Supports two loading modes:
+ * <ul>
+ * <li>Array-based: explicit ordering, no SPI fallback (default for
non-parsers)</li>
+ * <li>SPI-only: all components from ServiceLoader (when config section
missing)</li>
+ * </ul>
+ *
+ * @param <T> the component type
+ */
+public class CompositeComponentLoader<T> {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CompositeComponentLoader.class);
+
+ private final Class<T> componentInterface;
+ private final String componentTypeName;
+ private final String indexFileName;
+ private final ClassLoader classLoader;
+ private final ObjectMapper objectMapper;
+
+ /**
+ * Creates a component loader.
+ *
+ * @param componentInterface the component interface (e.g., Detector.class)
+ * @param componentTypeName the JSON config key (e.g., "detectors")
+ * @param indexFileName the index file name (e.g., "detectors")
+ * @param classLoader the class loader
+ * @param objectMapper the Jackson ObjectMapper
+ */
+ public CompositeComponentLoader(Class<T> componentInterface, String
componentTypeName,
+ String indexFileName, ClassLoader
classLoader,
+ ObjectMapper objectMapper) {
+ this.componentInterface = componentInterface;
+ this.componentTypeName = componentTypeName;
+ this.indexFileName = indexFileName;
+ this.classLoader = classLoader;
+ this.objectMapper = objectMapper;
+ }
+
+ /**
+ * Loads components from array-based JSON config.
+ * If config section exists, uses only explicitly listed components (no
SPI fallback).
+ * If config section missing, uses SPI to discover all components.
+ *
+ * @param config the Tika JSON configuration
+ * @return ordered list of component instances
+ * @throws TikaConfigException if loading fails
+ */
+ public List<T> loadFromArray(TikaJsonConfig config) throws
TikaConfigException {
+ // Check if section exists in config
+ if (!config.hasComponentSection(componentTypeName)) {
+ // Section doesn't exist - use SPI fallback
+ return loadAllFromSpi();
+ }
+
+ // Section exists - load only explicitly configured components (no SPI)
+ List<Map.Entry<String, JsonNode>> arrayComponents =
config.getArrayComponents(componentTypeName);
+
+ if (arrayComponents.isEmpty()) {
+ // Explicit empty array means no components
+ return Collections.emptyList();
+ }
+
+ ComponentRegistry registry = new ComponentRegistry(indexFileName,
classLoader);
+ List<T> instances = new ArrayList<>();
+
+ for (Map.Entry<String, JsonNode> entry : arrayComponents) {
+ String name = entry.getKey();
+ JsonNode configNode = entry.getValue();
+
+ T instance = loadComponent(name, configNode, registry);
+ instances.add(instance);
+ }
+
+ return instances;
+ }
+
+ /**
+ * Loads components from JSON config with SPI fallback (used by parsers).
+ *
+ * @param config the Tika JSON configuration
+ * @return list of component instances
+ * @throws TikaConfigException if loading fails
+ */
+ public List<T> load(TikaJsonConfig config) throws TikaConfigException {
+ List<T> instances = new ArrayList<>();
+
+ // Load configured components
+ if (config.hasComponents(componentTypeName)) {
+ ComponentRegistry registry = new ComponentRegistry(indexFileName,
classLoader);
+ Map<String, JsonNode> components =
config.getComponents(componentTypeName);
+
+ for (Map.Entry<String, JsonNode> entry : components.entrySet()) {
+ String name = entry.getKey();
+ JsonNode configNode = entry.getValue();
+
+ T instance = loadConfiguredComponent(name, configNode,
registry);
+ instances.add(instance);
+ }
+ }
+
+ // Add SPI-discovered components
+ List<T> spiComponents = loadSpiComponents();
+ instances.addAll(spiComponents);
+
+ return instances;
+ }
+
+ private T loadConfiguredComponent(String name, JsonNode configNode,
+ ComponentRegistry registry)
+ throws TikaConfigException {
+ try {
+ // Get component class
+ Class<?> componentClass = registry.getComponentClass(name);
+
+ // Extract framework config
+ FrameworkConfig frameworkConfig =
FrameworkConfig.extract(configNode, objectMapper);
+
+ // Instantiate component
+ T instance = instantiateComponent(componentClass,
frameworkConfig.getComponentConfigJson());
+
+ return instance;
+
+ } catch (Exception e) {
+ throw new TikaConfigException("Failed to load component '" + name
+ "' of type " +
+ componentTypeName, e);
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ private T instantiateComponent(Class<?> componentClass, String configJson)
+ throws TikaConfigException {
+ try {
+ // Try constructor with String parameter (JSON config)
+ try {
+ Constructor<?> constructor =
componentClass.getConstructor(String.class);
+ return (T) constructor.newInstance(configJson);
+ } catch (NoSuchMethodException e) {
+ // Fall back to zero-arg constructor
+ return (T) ServiceLoaderUtils.newInstance(componentClass,
+ new org.apache.tika.config.ServiceLoader(classLoader));
+ }
+ } catch (InstantiationException | IllegalAccessException |
InvocationTargetException e) {
+ throw new TikaConfigException("Failed to instantiate component: " +
+ componentClass.getName(), e);
+ }
+ }
+
+ private List<T> loadSpiComponents() {
+ List<T> result = new ArrayList<>();
+ ServiceLoader<T> serviceLoader =
ServiceLoader.load(componentInterface, classLoader);
+
+ Iterator<T> iterator = serviceLoader.iterator();
+ while (iterator.hasNext()) {
+ try {
+ T instance = iterator.next();
+ result.add(instance);
+ } catch (Exception e) {
+ // Log and skip problematic SPI providers
+ LOG.warn("Failed to load SPI component of type {}: {}",
componentTypeName, e.getMessage(), e);
+ }
+ }
+
+ return result;
+ }
+
+ private T loadComponent(String name, JsonNode configNode,
ComponentRegistry registry)
+ throws TikaConfigException {
+ try {
+ // Get component class
+ Class<?> componentClass = registry.getComponentClass(name);
+
+ // Instantiate component
+ return instantiateComponent(componentClass,
objectMapper.writeValueAsString(configNode));
+
+ } catch (Exception e) {
+ throw new TikaConfigException("Failed to load component '" + name
+ "' of type " +
+ componentTypeName, e);
+ }
+ }
+
+ private List<T> loadAllFromSpi() {
+ List<T> result = new ArrayList<>();
+ ServiceLoader<T> serviceLoader =
ServiceLoader.load(componentInterface, classLoader);
+
+ Iterator<T> iterator = serviceLoader.iterator();
+ while (iterator.hasNext()) {
+ try {
+ T instance = iterator.next();
+ result.add(instance);
+ } catch (Exception e) {
+ // Log and skip problematic SPI providers
+ LOG.warn("Failed to load SPI component of type {}: {}",
componentTypeName, e.getMessage(), e);
+ }
+ }
+
+ return result;
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
new file mode 100644
index 000000000..2cba10db1
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+/**
+ * Extracts framework-level configuration from component JSON,
+ * separating fields prefixed with underscore from component-specific config.
+ *
+ * <p>Framework fields:
+ * <ul>
+ * <li>{@code _decorate} - Parser decoration config (mime filtering,
fallbacks)</li>
+ * </ul>
+ */
+public class FrameworkConfig {
+
+ private static final String DECORATE_KEY = "_decorate";
+
+ private final ParserDecoration decoration;
+ private final String componentConfigJson;
+
+ private FrameworkConfig(ParserDecoration decoration, String
componentConfigJson) {
+ this.decoration = decoration;
+ this.componentConfigJson = componentConfigJson;
+ }
+
+ /**
+ * Extracts framework config from JSON node, returning the cleaned
component config.
+ *
+ * @param configNode the configuration JSON node
+ * @param objectMapper the Jackson ObjectMapper for serialization
+ * @return the framework config
+ * @throws IOException if JSON processing fails
+ */
+ public static FrameworkConfig extract(JsonNode configNode,
+ ObjectMapper objectMapper) throws
IOException {
+ if (configNode == null || !configNode.isObject()) {
+ return new FrameworkConfig(null,
+ objectMapper.writeValueAsString(configNode));
+ }
+
+ ObjectNode objNode = (ObjectNode) configNode.deepCopy();
+
+ // Extract decoration (parser-specific)
+ ParserDecoration decoration = null;
+ if (objNode.has(DECORATE_KEY)) {
+ JsonNode decorateNode = objNode.remove(DECORATE_KEY);
+ decoration = parseDecoration(decorateNode);
+ }
+
+ // Remaining fields are component-specific config
+ String componentConfigJson = objectMapper.writeValueAsString(objNode);
+
+ return new FrameworkConfig(decoration, componentConfigJson);
+ }
+
+ private static ParserDecoration parseDecoration(JsonNode decorateNode) {
+ if (decorateNode == null || !decorateNode.isObject()) {
+ return null;
+ }
+
+ List<String> mimeInclude =
parseStringList(decorateNode.get("mimeInclude"));
+ List<String> mimeExclude =
parseStringList(decorateNode.get("mimeExclude"));
+ List<String> fallbacks =
parseStringList(decorateNode.get("fallbacks"));
+
+ if (mimeInclude.isEmpty() && mimeExclude.isEmpty() &&
fallbacks.isEmpty()) {
+ return null;
+ }
+
+ return new ParserDecoration(mimeInclude, mimeExclude, fallbacks);
+ }
+
+ private static List<String> parseStringList(JsonNode node) {
+ if (node == null) {
+ return Collections.emptyList();
+ }
+
+ List<String> result = new ArrayList<>();
+ if (node.isArray()) {
+ for (JsonNode item : node) {
+ if (item.isTextual()) {
+ result.add(item.asText());
+ }
+ }
+ } else if (node.isTextual()) {
+ result.add(node.asText());
+ }
+
+ return result;
+ }
+
+ public ParserDecoration getDecoration() {
+ return decoration;
+ }
+
+ public String getComponentConfigJson() {
+ return componentConfigJson;
+ }
+
+ /**
+ * Parser decoration configuration for mime type filtering and fallbacks.
+ */
+ public static class ParserDecoration {
+ private final List<String> mimeInclude;
+ private final List<String> mimeExclude;
+ private final List<String> fallbacks;
+
+ public ParserDecoration(List<String> mimeInclude, List<String>
mimeExclude,
+ List<String> fallbacks) {
+ this.mimeInclude = Collections.unmodifiableList(mimeInclude);
+ this.mimeExclude = Collections.unmodifiableList(mimeExclude);
+ this.fallbacks = Collections.unmodifiableList(fallbacks);
+ }
+
+ public List<String> getMimeInclude() {
+ return mimeInclude;
+ }
+
+ public List<String> getMimeExclude() {
+ return mimeExclude;
+ }
+
+ public List<String> getFallbacks() {
+ return fallbacks;
+ }
+
+ public boolean hasFiltering() {
+ return !mimeInclude.isEmpty() || !mimeExclude.isEmpty();
+ }
+
+ public boolean hasFallbacks() {
+ return !fallbacks.isEmpty();
+ }
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
new file mode 100644
index 000000000..96f668af8
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ServiceLoader;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.parser.multiple.FallbackParser;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
/**
 * Loader for parsers with support for decoration (mime type filtering, fallbacks).
 *
 * <p>Parsers are configured as an ordered array. The special entry name
 * {@code "default-parser"} is not a real parser: its presence enables SPI discovery
 * for parsers not explicitly listed, and its optional {@code "exclude"} array names
 * parsers to keep out of that SPI fallback.
 */
public class ParserLoader {

    private static final Logger LOG = LoggerFactory.getLogger(ParserLoader.class);

    private final ClassLoader classLoader;
    private final ObjectMapper objectMapper;
    private final MediaTypeRegistry mediaTypeRegistry;

    /**
     * Holds parsed config data before decoration is applied.
     */
    private static class ParsedParserConfig {
        final String name;
        final Parser parser;
        final FrameworkConfig.ParserDecoration decoration;

        ParsedParserConfig(String name, Parser parser,
                           FrameworkConfig.ParserDecoration decoration) {
            this.name = name;
            this.parser = parser;
            this.decoration = decoration;
        }
    }

    /**
     * @param classLoader class loader used for registry lookups and SPI discovery
     * @param objectMapper Jackson mapper used to (de)serialize component config
     * @param mediaTypeRegistry registry used by the resulting composite/fallback parsers
     */
    public ParserLoader(ClassLoader classLoader, ObjectMapper objectMapper,
                        MediaTypeRegistry mediaTypeRegistry) {
        this.classLoader = classLoader;
        this.objectMapper = objectMapper;
        this.mediaTypeRegistry = mediaTypeRegistry;
    }

    /**
     * Loads parsers from JSON config and builds a CompositeParser.
     *
     * @param config the Tika JSON configuration
     * @return the composite parser
     * @throws TikaConfigException if loading fails
     */
    public CompositeParser load(TikaJsonConfig config) throws TikaConfigException {
        List<Parser> parserList = new ArrayList<>();

        // Load configured parsers
        if (config.hasComponentSection("parsers")) {
            ComponentRegistry registry = new ComponentRegistry("parsers", classLoader);
            List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");

            // Check if "default-parser" is in the list and extract exclusions
            boolean hasDefaultParser = false;
            Set<Class<?>> excludedParserClasses = new HashSet<>();

            for (Map.Entry<String, JsonNode> entry : parsers) {
                if ("default-parser".equals(entry.getKey())) {
                    hasDefaultParser = true;

                    // Parse exclusions from default-parser config
                    JsonNode configNode = entry.getValue();
                    if (configNode != null && configNode.has("exclude")) {
                        JsonNode excludeNode = configNode.get("exclude");
                        if (excludeNode.isArray()) {
                            for (JsonNode excludeName : excludeNode) {
                                if (excludeName.isTextual()) {
                                    String parserName = excludeName.asText();
                                    try {
                                        // Resolve the logical name to a class so SPI hits
                                        // can be filtered by class identity below
                                        Class<?> parserClass = registry.getComponentClass(parserName);
                                        excludedParserClasses.add(parserClass);
                                        LOG.debug("Excluding parser from SPI: {}", parserName);
                                    } catch (TikaConfigException e) {
                                        // Unknown exclude names are warned about, not fatal
                                        LOG.warn("Unknown parser in default-parser exclude list: {}",
                                                parserName);
                                    }
                                }
                            }
                        }
                    }
                    // Only the first "default-parser" entry is considered
                    break;
                }
            }

            // First pass: parse configs and instantiate parsers
            // Skip "default-parser" - it's a special marker for SPI fallback, not a real parser
            Map<String, ParsedParserConfig> parsedConfigs = new LinkedHashMap<>();
            for (Map.Entry<String, JsonNode> entry : parsers) {
                String name = entry.getKey();

                // Skip the special "default-parser" marker
                if ("default-parser".equals(name)) {
                    continue;
                }

                JsonNode configNode = entry.getValue();
                ParsedParserConfig parsed = loadConfiguredParser(name, configNode, registry);
                parsedConfigs.put(name, parsed);
            }

            // Track configured parser classes (before decoration) to avoid SPI duplicates
            Set<Class<?>> configuredParserClasses = new HashSet<>();
            for (ParsedParserConfig parsed : parsedConfigs.values()) {
                configuredParserClasses.add(parsed.parser.getClass());
            }

            // Add excluded parsers to the configured set so they won't be loaded from SPI
            configuredParserClasses.addAll(excludedParserClasses);

            // Second pass: apply decorations that may reference other parsers
            // (fallbacks refer to sibling entries by name, so all must be instantiated first)
            for (ParsedParserConfig parsed : parsedConfigs.values()) {
                Parser parser = parsed.parser;

                // Apply decorations if present
                if (parsed.decoration != null) {
                    // Apply mime type filtering
                    if (parsed.decoration.hasFiltering()) {
                        parser = applyMimeFiltering(parser, parsed.decoration);
                    }

                    // Apply fallbacks
                    if (parsed.decoration.hasFallbacks()) {
                        parser = applyFallbacks(parser, parsed.decoration, parsedConfigs);
                    }
                }

                parserList.add(parser);
            }

            // Add SPI-discovered parsers only if "default-parser" is in config
            // If "default-parser" is present, use SPI fallback for unlisted parsers
            // If "default-parser" is NOT present, only load explicitly configured parsers
            if (hasDefaultParser) {
                List<Parser> spiParsers = loadSpiParsers(configuredParserClasses);
                parserList.addAll(spiParsers);
                LOG.debug("Loading SPI parsers because 'default-parser' is in config");
            } else {
                LOG.debug("Skipping SPI parsers - 'default-parser' not in config");
            }
        } else {
            // No configured parsers - load all from SPI
            List<Parser> spiParsers = loadSpiParsers(Collections.emptySet());
            parserList.addAll(spiParsers);
        }

        return new CompositeParser(mediaTypeRegistry, parserList);
    }

    /**
     * Resolves, configures, and instantiates one named parser, separating framework-level
     * fields (e.g. {@code _decorate}) from parser-specific config.
     */
    private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode,
                                                    ComponentRegistry registry)
            throws TikaConfigException {
        try {
            // Get parser class
            Class<?> parserClass = registry.getComponentClass(name);

            // Extract framework config
            FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper);

            // Instantiate parser
            Parser parser = instantiateParser(parserClass, frameworkConfig.getComponentConfigJson());

            return new ParsedParserConfig(name, parser, frameworkConfig.getDecoration());

        } catch (Exception e) {
            throw new TikaConfigException("Failed to load parser '" + name + "'", e);
        }
    }

    /**
     * Instantiates a parser, preferring a single-String (JSON config) constructor and
     * falling back to zero-arg construction via Tika's service loader utilities.
     */
    @SuppressWarnings("unchecked")
    private Parser instantiateParser(Class<?> parserClass, String configJson)
            throws TikaConfigException {
        try {
            // Try constructor with String parameter (JSON config)
            try {
                Constructor<?> constructor = parserClass.getConstructor(String.class);
                return (Parser) constructor.newInstance(configJson);
            } catch (NoSuchMethodException e) {
                // Fall back to zero-arg constructor
                return (Parser) ServiceLoaderUtils.newInstance(parserClass,
                        new org.apache.tika.config.ServiceLoader(classLoader));
            }
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
            throw new TikaConfigException("Failed to instantiate parser: " +
                    parserClass.getName(), e);
        }
    }

    /**
     * Wraps a parser with mime-type include/exclude decorators. Includes are applied
     * before excludes, so an exclude wins when a type appears in both lists.
     */
    private Parser applyMimeFiltering(Parser parser, FrameworkConfig.ParserDecoration decoration) {
        List<String> includes = decoration.getMimeInclude();
        List<String> excludes = decoration.getMimeExclude();

        if (!includes.isEmpty()) {
            Set<MediaType> includeTypes = new HashSet<>();
            for (String mimeStr : includes) {
                includeTypes.add(MediaType.parse(mimeStr));
            }
            parser = ParserDecorator.withTypes(parser, includeTypes);
        }

        if (!excludes.isEmpty()) {
            Set<MediaType> excludeTypes = new HashSet<>();
            for (String mimeStr : excludes) {
                excludeTypes.add(MediaType.parse(mimeStr));
            }
            parser = ParserDecorator.withoutTypes(parser, excludeTypes);
        }

        return parser;
    }

    /**
     * Builds a FallbackParser with the given parser first, followed by the named
     * fallback parsers (which must be sibling entries in the same config).
     *
     * <p>NOTE(review): the undecorated fallback parsers are also added to the top-level
     * parser list by the second pass in {@link #load}; confirm that double registration
     * is intended.
     *
     * @throws TikaConfigException if a fallback name does not match a configured parser
     */
    private Parser applyFallbacks(Parser parser, FrameworkConfig.ParserDecoration decoration,
                                  Map<String, ParsedParserConfig> parsedConfigs)
            throws TikaConfigException {

        List<String> fallbackNames = decoration.getFallbacks();
        List<Parser> fallbackParsers = new ArrayList<>();
        fallbackParsers.add(parser); // Primary parser first

        for (String fallbackName : fallbackNames) {
            ParsedParserConfig fallbackConfig = parsedConfigs.get(fallbackName);
            if (fallbackConfig == null) {
                throw new TikaConfigException("Unknown fallback parser: " + fallbackName);
            }
            fallbackParsers.add(fallbackConfig.parser);
        }

        return new FallbackParser(mediaTypeRegistry, MetadataPolicy.KEEP_ALL, fallbackParsers);
    }

    /**
     * Discovers parsers via {@link ServiceLoader}, skipping any whose class is in
     * {@code excludeClasses} (already configured or explicitly excluded).
     *
     * <p>NOTE(review): ServiceLoader reports provider failures as
     * {@code ServiceConfigurationError} (an Error), which this {@code catch (Exception)}
     * will not intercept — confirm whether such failures should also be skipped.
     */
    private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) {
        List<Parser> result = new ArrayList<>();
        ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, classLoader);

        Iterator<Parser> iterator = serviceLoader.iterator();
        while (iterator.hasNext()) {
            try {
                Parser parser = iterator.next();

                // Skip if this parser class was already loaded from config
                if (excludeClasses.contains(parser.getClass())) {
                    LOG.debug("Skipping SPI parser {} - already configured",
                            parser.getClass().getName());
                    continue;
                }

                result.add(parser);
            } catch (Exception e) {
                // Log and skip problematic SPI providers
                LOG.warn("Failed to load SPI parser: {}", e.getMessage(), e);
            }
        }

        return result;
    }
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
new file mode 100644
index 000000000..7b7bf1c0b
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Parsed representation of a Tika JSON configuration file.
+ * Provides access to component configurations by type (parsers, detectors,
etc.).
+ *
+ * <p>JSON structure:
+ * <pre>
+ * {
+ * "parsers": [
+ * { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } },
+ * { "html-parser": { ... } },
+ * { "default-parser": {} }
+ * ],
+ * "detectors": [
+ * { "mime-magic-detector": {} },
+ * { "zip-container-detector": { "maxDepth": 10 } }
+ * ],
+ * ...
+ * }
+ * </pre>
+ *
+ * <p>All components use array format for explicit ordering.
+ * Parsers support decoration via "_decorate" field.
+ * Special "default-parser" entry enables SPI fallback for unlisted parsers.
+ */
+public class TikaJsonConfig {
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private final JsonNode rootNode;
+ private final Map<String, Map<String, JsonNode>> componentsByType;
+ private final Map<String, List<Map.Entry<String, JsonNode>>>
arrayComponentsByType;
+
+ private TikaJsonConfig(JsonNode rootNode) {
+ this.rootNode = rootNode;
+ this.componentsByType = parseObjectComponents(rootNode);
+ this.arrayComponentsByType = parseArrayComponents(rootNode);
+ }
+
+ /**
+ * Loads configuration from a file.
+ *
+ * @param configPath the path to the JSON configuration file
+ * @return the parsed configuration
+ * @throws TikaConfigException if loading or parsing fails
+ */
+ public static TikaJsonConfig load(Path configPath) throws
TikaConfigException {
+ try (InputStream in = Files.newInputStream(configPath)) {
+ return load(in);
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to load config from: " +
configPath, e);
+ }
+ }
+
+ /**
+ * Loads configuration from an input stream.
+ *
+ * @param inputStream the input stream containing JSON configuration
+ * @return the parsed configuration
+ * @throws TikaConfigException if loading or parsing fails
+ */
+ public static TikaJsonConfig load(InputStream inputStream) throws
TikaConfigException {
+ try {
+ JsonNode rootNode = OBJECT_MAPPER.readTree(inputStream);
+ return new TikaJsonConfig(rootNode);
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to parse JSON
configuration", e);
+ }
+ }
+
+ /**
+ * Gets component configurations for a specific type (object format - used
for parsers).
+ *
+ * @param componentType the component type (e.g., "parsers")
+ * @return map of component name to configuration JSON, or empty map if
type not found
+ */
+ public Map<String, JsonNode> getComponents(String componentType) {
+ return componentsByType.getOrDefault(componentType,
Collections.emptyMap());
+ }
+
+ /**
+ * Gets component configurations for a specific type (array format - used
for detectors, etc.).
+ *
+ * @param componentType the component type (e.g., "detectors")
+ * @return ordered list of (name, config) entries, or empty list if type
not found
+ */
+ public List<Map.Entry<String, JsonNode>> getArrayComponents(String
componentType) {
+ return arrayComponentsByType.getOrDefault(componentType,
Collections.emptyList());
+ }
+
+ /**
+ * Checks if a component type has any configured components (object
format).
+ *
+ * @param componentType the component type
+ * @return true if the type has configurations
+ */
+ public boolean hasComponents(String componentType) {
+ Map<String, JsonNode> components = componentsByType.get(componentType);
+ return components != null && !components.isEmpty();
+ }
+
+ /**
+ * Checks if a component type has any configured components (array format).
+ *
+ * @param componentType the component type
+ * @return true if the type has configurations
+ */
+ public boolean hasArrayComponents(String componentType) {
+ List<Map.Entry<String, JsonNode>> components =
arrayComponentsByType.get(componentType);
+ return components != null && !components.isEmpty();
+ }
+
+ /**
+ * Checks if a component type section exists in the config (even if empty).
+ *
+ * @param componentType the component type
+ * @return true if the section exists
+ */
+ public boolean hasComponentSection(String componentType) {
+ return rootNode.has(componentType);
+ }
+
+ /**
+ * Gets the raw root JSON node.
+ *
+ * @return the root node
+ */
+ public JsonNode getRootNode() {
+ return rootNode;
+ }
+
+ private Map<String, Map<String, JsonNode>> parseObjectComponents(JsonNode
root) {
+ Map<String, Map<String, JsonNode>> result = new LinkedHashMap<>();
+
+ if (root == null || !root.isObject()) {
+ return result;
+ }
+
+ for (Map.Entry<String, JsonNode> entry : root.properties()) {
+ String componentType = entry.getKey();
+ JsonNode typeNode = entry.getValue();
+
+ // Only process object nodes (used for parsers)
+ if (!typeNode.isObject()) {
+ continue;
+ }
+
+ Map<String, JsonNode> components = new LinkedHashMap<>();
+ for (Map.Entry<String, JsonNode> componentEntry :
typeNode.properties()) {
+ components.put(componentEntry.getKey(),
componentEntry.getValue());
+ }
+
+ if (!components.isEmpty()) {
+ result.put(componentType, components);
+ }
+ }
+
+ return result;
+ }
+
+ private Map<String, List<Map.Entry<String, JsonNode>>>
parseArrayComponents(JsonNode root) {
+ Map<String, List<Map.Entry<String, JsonNode>>> result = new
LinkedHashMap<>();
+
+ if (root == null || !root.isObject()) {
+ return result;
+ }
+
+ for (Map.Entry<String, JsonNode> entry : root.properties()) {
+ String componentType = entry.getKey();
+ JsonNode typeNode = entry.getValue();
+
+ // Only process array nodes (used for detectors, filters, etc.)
+ if (!typeNode.isArray()) {
+ continue;
+ }
+
+ List<Map.Entry<String, JsonNode>> components = new ArrayList<>();
+
+ for (JsonNode arrayItem : typeNode) {
+ if (!arrayItem.isObject()) {
+ continue;
+ }
+
+ // Each array item should have exactly one field: {
"component-name": {...config...} }
+ for (Map.Entry<String, JsonNode> componentEntry :
arrayItem.properties()) {
+ components.add(Map.entry(componentEntry.getKey(),
componentEntry.getValue()));
+ break; // Only take the first field
+ }
+ }
+
+ if (!components.isEmpty()) {
+ result.put(componentType, components);
+ }
+ }
+
+ return result;
+ }
+
+ /**
+ * Gets the ObjectMapper used for JSON processing.
+ *
+ * @return the object mapper
+ */
+ public static ObjectMapper getObjectMapper() {
+ return OBJECT_MAPPER;
+ }
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
new file mode 100644
index 000000000..483596199
--- /dev/null
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.nio.file.Path;
+import java.util.List;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.detect.CompositeEncodingDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.Renderer;
+
+/**
+ * Main entry point for loading Tika components from JSON configuration.
+ * Provides lazy loading of component types - only loads classes when
requested.
+ *
+ * <p>Usage:
+ * <pre>
+ * TikaLoader loader = TikaLoader.load(Path.of("tika-config.json"));
+ * Parser parser = loader.loadParsers();
+ * Detector detector = loader.loadDetectors();
+ * </pre>
+ *
+ * <p>JSON configuration format:
+ * <pre>
+ * {
+ * "parsers": {
+ * "pdf-parser": {
+ * "_priority": 10,
+ * "_decorate": {
+ * "mimeInclude": ["application/pdf"],
+ * "mimeExclude": ["application/pdf+fdf"],
+ * "fallbacks": ["empty-parser"]
+ * },
+ * "ocrStrategy": "AUTO",
+ * "extractInlineImages": true
+ * }
+ * },
+ * "detectors": {
+ * "mime-magic-detector": { ... }
+ * }
+ * }
+ * </pre>
+ */
+public class TikaLoader {
+
+ private final TikaJsonConfig config;
+ private final ClassLoader classLoader;
+ private final ObjectMapper objectMapper;
+ private final MediaTypeRegistry mediaTypeRegistry;
+
+ // Cached instances (lazy loaded)
+ private Parser parsers;
+ private Detector detectors;
+ private EncodingDetector encodingDetectors;
+ private MetadataFilter metadataFilters;
+ private Renderer renderers;
+
+ private TikaLoader(TikaJsonConfig config, ClassLoader classLoader,
+ MediaTypeRegistry mediaTypeRegistry) {
+ this.config = config;
+ this.classLoader = classLoader;
+ this.objectMapper = TikaJsonConfig.getObjectMapper();
+ this.mediaTypeRegistry = mediaTypeRegistry;
+ }
+
+ /**
+ * Loads a Tika configuration from a file.
+ *
+ * @param configPath the path to the JSON configuration file
+ * @return the Tika loader
+ * @throws TikaConfigException if loading or parsing fails
+ */
+ public static TikaLoader load(Path configPath) throws TikaConfigException {
+ return load(configPath,
Thread.currentThread().getContextClassLoader());
+ }
+
+ /**
+ * Loads a Tika configuration from a file with a specific class loader.
+ *
+ * @param configPath the path to the JSON configuration file
+ * @param classLoader the class loader to use for loading components
+ * @return the Tika loader
+ * @throws TikaConfigException if loading or parsing fails
+ */
+ public static TikaLoader load(Path configPath, ClassLoader classLoader)
+ throws TikaConfigException {
+ TikaJsonConfig config = TikaJsonConfig.load(configPath);
+ MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
+ return new TikaLoader(config, classLoader, registry);
+ }
+
+ /**
+ * Loads a Tika configuration with custom media type registry.
+ *
+ * @param configPath the path to the JSON configuration file
+ * @param classLoader the class loader to use for loading components
+ * @param mediaTypeRegistry the media type registry to use
+ * @return the Tika loader
+ * @throws TikaConfigException if loading or parsing fails
+ */
+ public static TikaLoader load(Path configPath, ClassLoader classLoader,
+ MediaTypeRegistry mediaTypeRegistry)
+ throws TikaConfigException {
+ TikaJsonConfig config = TikaJsonConfig.load(configPath);
+ return new TikaLoader(config, classLoader, mediaTypeRegistry);
+ }
+
+ /**
+ * Loads and returns all parsers.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * @return the parser (typically a CompositeParser internally)
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized Parser loadParsers() throws TikaConfigException {
+ if (parsers == null) {
+ ParserLoader loader = new ParserLoader(classLoader, objectMapper,
mediaTypeRegistry);
+ parsers = loader.load(config);
+ }
+ return parsers;
+ }
+
+ /**
+ * Loads and returns all detectors.
+ * If "detectors" section exists in config, uses only those listed (no SPI
fallback).
+ * If section missing, uses SPI to discover detectors.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * @return the detector (typically a CompositeDetector internally)
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized Detector loadDetectors() throws TikaConfigException {
+ if (detectors == null) {
+ CompositeComponentLoader<Detector> loader = new
CompositeComponentLoader<>(
+ Detector.class, "detectors", "detectors", classLoader,
objectMapper);
+ List<Detector> detectorList = loader.loadFromArray(config);
+ detectors = new CompositeDetector(mediaTypeRegistry, detectorList);
+ }
+ return detectors;
+ }
+
+ /**
+ * Loads and returns all encoding detectors.
+ * If "encodingDetectors" section exists in config, uses only those listed
(no SPI fallback).
+ * If section missing, uses SPI to discover encoding detectors.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * @return the encoding detector (typically a CompositeEncodingDetector
internally)
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized EncodingDetector loadEncodingDetectors() throws
TikaConfigException {
+ if (encodingDetectors == null) {
+ CompositeComponentLoader<EncodingDetector> loader = new
CompositeComponentLoader<>(
+ EncodingDetector.class, "encodingDetectors",
"encoding-detectors",
+ classLoader, objectMapper);
+ List<EncodingDetector> detectorList = loader.loadFromArray(config);
+ encodingDetectors = new CompositeEncodingDetector(detectorList);
+ }
+ return encodingDetectors;
+ }
+
+ /**
+ * Loads and returns all metadata filters.
+ * If "metadataFilters" section exists in config, uses only those listed
(no SPI fallback).
+ * If section missing, uses SPI to discover metadata filters.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * @return the metadata filter (typically a CompositeMetadataFilter
internally)
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized MetadataFilter loadMetadataFilters() throws
TikaConfigException {
+ if (metadataFilters == null) {
+ CompositeComponentLoader<MetadataFilter> loader = new
CompositeComponentLoader<>(
+ MetadataFilter.class, "metadataFilters",
"metadata-filters",
+ classLoader, objectMapper);
+ List<MetadataFilter> filterList = loader.loadFromArray(config);
+ metadataFilters = new CompositeMetadataFilter(filterList);
+ }
+ return metadataFilters;
+ }
+
+ /**
+ * Loads and returns all renderers.
+ * If "renderers" section exists in config, uses only those listed (no SPI
fallback).
+ * If section missing, uses SPI to discover renderers.
+ * Results are cached - subsequent calls return the same instance.
+ *
+ * @return the renderer (typically a CompositeRenderer internally)
+ * @throws TikaConfigException if loading fails
+ */
+ public synchronized Renderer loadRenderers() throws TikaConfigException {
+ if (renderers == null) {
+ CompositeComponentLoader<Renderer> loader = new
CompositeComponentLoader<>(
+ Renderer.class, "renderers", "renderers", classLoader,
objectMapper);
+ List<Renderer> rendererList = loader.loadFromArray(config);
+ renderers = new CompositeRenderer(rendererList);
+ }
+ return renderers;
+ }
+
+ /**
+ * Gets the underlying JSON configuration.
+ *
+ * @return the JSON configuration
+ */
+ public TikaJsonConfig getConfig() {
+ return config;
+ }
+
+ /**
+ * Gets the class loader used for loading components.
+ *
+ * @return the class loader
+ */
+ public ClassLoader getClassLoader() {
+ return classLoader;
+ }
+
+ /**
+ * Gets the media type registry.
+ *
+ * @return the media type registry
+ */
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return mediaTypeRegistry;
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java
new file mode 100644
index 000000000..377fa81b6
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Unit tests for ComponentRegistry.
+ */
+public class ComponentRegistryTest {
+
+ @Test
+ public void testLoadParsersIndex() throws Exception {
+ ComponentRegistry registry = new ComponentRegistry("parsers",
+ getClass().getClassLoader());
+
+ assertNotNull(registry, "Registry should not be null");
+
+ // Verify test parsers are registered
+ assertTrue(registry.hasComponent("configurable-test-parser"),
+ "Should have configurable-test-parser");
+ assertTrue(registry.hasComponent("fallback-test-parser"),
+ "Should have fallback-test-parser");
+ assertTrue(registry.hasComponent("minimal-test-parser"),
+ "Should have minimal-test-parser");
+ assertTrue(registry.hasComponent("opt-in-test-parser"),
+ "Should have opt-in-test-parser");
+ }
+
+ @Test
+ public void testGetComponentClass() throws Exception {
+ ComponentRegistry registry = new ComponentRegistry("parsers",
+ getClass().getClassLoader());
+
+ Class<?> clazz =
registry.getComponentClass("configurable-test-parser");
+ assertNotNull(clazz, "Component class should not be null");
+ assertEquals("org.apache.tika.config.loader.ConfigurableTestParser",
+ clazz.getName());
+ }
+
+ @Test
+ public void testGetAllComponents() throws Exception {
+ ComponentRegistry registry = new ComponentRegistry("parsers",
+ getClass().getClassLoader());
+
+ Map<String, Class<?>> all = registry.getAllComponents();
+ assertNotNull(all, "All components map should not be null");
+ assertTrue(all.size() >= 4, "Should have at least 4 test parsers");
+ }
+
+ @Test
+ public void testUnknownComponent() throws Exception {
+ ComponentRegistry registry = new ComponentRegistry("parsers",
+ getClass().getClassLoader());
+
+ // Should throw exception
+ assertThrows(TikaConfigException.class, () -> {
+ registry.getComponentClass("non-existent-parser");
+ });
+ }
+
+ @Test
+ public void testNonExistentIndexFile() throws Exception {
+ // Should throw exception when index file doesn't exist
+ assertThrows(TikaConfigException.class, () -> {
+ new ComponentRegistry("non-existent-type",
getClass().getClassLoader());
+ });
+ }
+
+ @Test
+ public void testOptInParserInRegistry() throws Exception {
+ ComponentRegistry registry = new ComponentRegistry("parsers",
+ getClass().getClassLoader());
+
+ // Verify opt-in parser (spi=false) is in the registry
+ assertTrue(registry.hasComponent("opt-in-test-parser"),
+ "Should have opt-in-test-parser in registry");
+
+ Class<?> clazz = registry.getComponentClass("opt-in-test-parser");
+ assertNotNull(clazz, "Component class should not be null");
+ assertEquals("org.apache.tika.config.loader.OptInTestParser",
+ clazz.getName(),
+ "OptInTestParser should be in registry even with spi=false");
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigurableTestParser.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigurableTestParser.java
new file mode 100644
index 000000000..5323e9de8
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigurableTestParser.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Test parser with configurable properties for testing JSON configuration
loading.
+ */
+@TikaComponent(name = "configurable-test-parser")
+public class ConfigurableTestParser implements Parser {
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static final long serialVersionUID = 1L;
+
+ private final TestParserConfig config;
+
+ /**
+ * Constructor for JSON-based configuration.
+ */
+ public ConfigurableTestParser(String jsonConfig) throws
TikaConfigException {
+ try {
+ this.config = OBJECT_MAPPER.readValue(jsonConfig,
TestParserConfig.class);
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to parse JSON config", e);
+ }
+ }
+
+ /**
+ * Zero-arg constructor for SPI fallback.
+ */
+ public ConfigurableTestParser() {
+ this.config = new TestParserConfig();
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return
Collections.singleton(MediaType.parse("application/test+configurable"));
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
+ ParseContext context) throws IOException, SAXException,
TikaException {
+ // Simple implementation that writes config to metadata
+ metadata.set("parser-name", config.getName());
+ metadata.set("buffer-size", String.valueOf(config.getBufferSize()));
+ metadata.set("enabled", String.valueOf(config.isEnabled()));
+ metadata.set("mode", config.getMode());
+ }
+
+ public TestParserConfig getConfig() {
+ return config;
+ }
+
+ /**
+ * Configuration POJO for ConfigurableTestParser.
+ */
+ public static class TestParserConfig {
+ private String name = "default";
+ private int bufferSize = 1024;
+ private boolean enabled = true;
+ private String mode = "normal";
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public int getBufferSize() {
+ return bufferSize;
+ }
+
+ public void setBufferSize(int bufferSize) {
+ this.bufferSize = bufferSize;
+ }
+
+ public boolean isEnabled() {
+ return enabled;
+ }
+
+ public void setEnabled(boolean enabled) {
+ this.enabled = enabled;
+ }
+
+ public String getMode() {
+ return mode;
+ }
+
+ public void setMode(String mode) {
+ this.mode = mode;
+ }
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java
new file mode 100644
index 000000000..06470ceb2
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Simple test parser for fallback chain testing.
+ */
+@TikaComponent(name = "fallback-test-parser")
+public class FallbackTestParser implements Parser {
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static final long serialVersionUID = 1L;
+
+ private final FallbackConfig config;
+
+ /**
+ * Constructor for JSON-based configuration.
+ */
+ public FallbackTestParser(String jsonConfig) throws TikaConfigException {
+ try {
+ this.config = OBJECT_MAPPER.readValue(jsonConfig,
FallbackConfig.class);
+ } catch (IOException e) {
+ throw new TikaConfigException("Failed to parse JSON config", e);
+ }
+ }
+
+ /**
+ * Zero-arg constructor for SPI fallback.
+ */
+ public FallbackTestParser() {
+ this.config = new FallbackConfig();
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return
Collections.singleton(MediaType.parse("application/test+fallback"));
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
+ ParseContext context) throws IOException, SAXException,
TikaException {
+ if (config.isFailOnPurpose()) {
+ throw new TikaException("Intentional failure for testing fallback:
" + config.getMessage());
+ }
+ // Success case
+ metadata.set("fallback-parser", "success");
+ metadata.set("message", config.getMessage());
+ }
+
+ public FallbackConfig getConfig() {
+ return config;
+ }
+
+ /**
+ * Configuration POJO for FallbackTestParser.
+ */
+ public static class FallbackConfig {
+ private String message = "default message";
+ private boolean failOnPurpose = false;
+
+ public String getMessage() {
+ return message;
+ }
+
+ public void setMessage(String message) {
+ this.message = message;
+ }
+
+ public boolean isFailOnPurpose() {
+ return failOnPurpose;
+ }
+
+ public void setFailOnPurpose(boolean failOnPurpose) {
+ this.failOnPurpose = failOnPurpose;
+ }
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
new file mode 100644
index 000000000..beba6e055
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for FrameworkConfig.
+ */
+public class FrameworkConfigTest {
+
+ private static final ObjectMapper MAPPER = new ObjectMapper();
+
+ @Test
+ public void testExtractDecoration() throws Exception {
+ String json = """
+ {
+ "_decorate": {
+ "mimeInclude": ["application/pdf"],
+ "mimeExclude": ["application/pdf+fdf"],
+ "fallbacks": ["backup-parser"]
+ },
+ "name": "test"
+ }
+ """;
+ JsonNode node = MAPPER.readTree(json);
+
+ FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
+
+ assertNotNull(config.getDecoration(), "Decoration should be present");
+
+ FrameworkConfig.ParserDecoration decoration = config.getDecoration();
+ assertTrue(decoration.hasFiltering(), "Should have filtering");
+ assertTrue(decoration.hasFallbacks(), "Should have fallbacks");
+
+ assertEquals(1, decoration.getMimeInclude().size());
+ assertEquals("application/pdf", decoration.getMimeInclude().get(0));
+
+ assertEquals(1, decoration.getMimeExclude().size());
+ assertEquals("application/pdf+fdf",
decoration.getMimeExclude().get(0));
+
+ assertEquals(1, decoration.getFallbacks().size());
+ assertEquals("backup-parser", decoration.getFallbacks().get(0));
+
+ assertFalse(config.getComponentConfigJson().contains("_decorate"),
+ "Component config should not contain _decorate");
+ }
+
+ @Test
+ public void testNoDecoration() throws Exception {
+ String json = """
+ {
+ "name": "test"
+ }
+ """;
+ JsonNode node = MAPPER.readTree(json);
+
+ FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
+
+ assertNull(config.getDecoration(), "Decoration should be null");
+ }
+
+ @Test
+ public void testEmptyDecoration() throws Exception {
+ String json = """
+ {
+ "_decorate": {},
+ "name": "test"
+ }
+ """;
+ JsonNode node = MAPPER.readTree(json);
+
+ FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
+
+ // Empty decoration should return null
+ assertNull(config.getDecoration(), "Empty decoration should be null");
+ }
+
+ @Test
+ public void testComponentConfigJsonClean() throws Exception {
+ String json = """
+ {
+ "_decorate": {
+ "mimeInclude": ["text/plain"]
+ },
+ "bufferSize": 1024,
+ "enabled": true
+ }
+ """;
+ JsonNode node = MAPPER.readTree(json);
+
+ FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
+
+ String componentJson = config.getComponentConfigJson();
+
+ // Verify framework fields are removed
+ assertFalse(componentJson.contains("_decorate"), "Should not contain
_decorate");
+
+ // Verify component fields remain
+ assertTrue(componentJson.contains("bufferSize"), "Should contain
bufferSize");
+ assertTrue(componentJson.contains("enabled"), "Should contain
enabled");
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/MinimalTestParser.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/MinimalTestParser.java
new file mode 100644
index 000000000..9b47995fb
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/MinimalTestParser.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Minimal test parser with only zero-arg constructor for testing SPI fallback.
+ */
+@TikaComponent(name = "minimal-test-parser")
+public class MinimalTestParser implements Parser {
+
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Zero-arg constructor only - no JSON config support.
+ */
+ public MinimalTestParser() {
+ // No config
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return
Collections.singleton(MediaType.parse("application/test+minimal"));
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
+ ParseContext context) throws IOException, SAXException,
TikaException {
+ metadata.set("parser-type", "minimal");
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/OptInTestParser.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/OptInTestParser.java
new file mode 100644
index 000000000..78e887979
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/OptInTestParser.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Test parser with spi=false - should be available by name but NOT
auto-loaded by SPI.
+ */
+@TikaComponent(name = "opt-in-test-parser", spi = false)
+public class OptInTestParser implements Parser {
+
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return
Collections.singleton(MediaType.parse("application/test+optin"));
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
+ ParseContext context) throws IOException, SAXException,
TikaException {
+ metadata.set("parser-type", "opt-in");
+ metadata.set("opt-in-parser", "success");
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
new file mode 100644
index 000000000..168e673dd
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;

import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+
+/**
+ * Unit tests for TikaLoader JSON configuration loading.
+ */
+public class TikaLoaderTest {
+
+ @Test
+ public void testBasicParserLoading() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-loader-config.json");
+ assertNotNull(configUrl, "Test config not found");
+
+ Path configPath = Path.of(configUrl.toURI());
+ TikaLoader loader = TikaLoader.load(configPath);
+
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser, "Parser should not be null");
+ }
+
+ @Test
+ public void testConfigurableParserConfiguration() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-loader-config.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Parse with the composite parser to verify config was applied
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+ }
+
+ // Verify the configured values were used
+ assertEquals("configured-parser", metadata.get("parser-name"));
+ assertEquals("2048", metadata.get("buffer-size"));
+ assertEquals("true", metadata.get("enabled"));
+ assertEquals("advanced", metadata.get("mode"));
+ }
+
+ @Test
+ public void testMimeTypeDecoration() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-decoration-config.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser parser = loader.loadParsers();
+
+ ParseContext context = new ParseContext();
+
+ // Test that included types are supported
+
assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")),
+ "Should support application/pdf");
+
assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("text/plain")),
+ "Should support text/plain");
+ }
+
+ @Test
+ public void testLazyLoading() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-loader-config.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+
+ // Verify loader created but parsers not yet loaded
+ assertNotNull(loader, "Loader should be created");
+
+ // Load parsers
+ Parser parser1 = loader.loadParsers();
+ assertNotNull(parser1, "First load should return parser");
+
+ // Load again - should return cached instance
+ Parser parser2 = loader.loadParsers();
+ assertTrue(parser1 == parser2, "Should return same cached instance");
+ }
+
+ @Test
+ public void testMinimalParser() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-loader-config.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Parse with minimal parser type
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "application/test+minimal");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+ }
+
+ // Verify minimal parser was invoked
+ assertEquals("minimal", metadata.get("parser-type"));
+ }
+
+ @Test
+ public void testFallbackConfiguration() throws Exception {
+ URL configUrl =
getClass().getResource("/configs/test-loader-config.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Parse with fallback parser type
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "application/test+fallback");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+ }
+
+ // Verify fallback parser was invoked with correct config
+ assertEquals("success", metadata.get("fallback-parser"));
+ assertEquals("primary parser", metadata.get("message"));
+ }
+
+ @Test
+ public void testNoDuplicateParsersFromSpi() throws Exception {
+ // Config explicitly configures ConfigurableTestParser but not the
others
+ URL configUrl =
getClass().getResource("/configs/test-no-duplicate-parsers.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Parse with ConfigurableTestParser - should use the explicitly
configured instance
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+ }
+
+ // Verify it used the configured instance (with
"explicitly-configured" name)
+ // NOT the SPI instance (which would have "default" name from zero-arg
constructor)
+ assertEquals("explicitly-configured", metadata.get("parser-name"));
+ assertEquals("4096", metadata.get("buffer-size"));
+
+ // Verify other parsers (FallbackTestParser, MinimalTestParser) are
still available via SPI
+ Metadata fallbackMetadata = new Metadata();
+ fallbackMetadata.set(Metadata.CONTENT_TYPE,
"application/test+fallback");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(),
fallbackMetadata, new ParseContext());
+ }
+
+ // FallbackTestParser should be loaded from SPI with default config
+ assertEquals("success", fallbackMetadata.get("fallback-parser"));
+ assertEquals("default message", fallbackMetadata.get("message"));
+ }
+
+ @Test
+ public void testWithDefaultParserLoadsSpiParsers() throws Exception {
+ // Config has "default-parser" so should load SPI parsers
+ URL configUrl =
getClass().getResource("/configs/test-with-default-parser.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Verify ConfigurableTestParser uses the configured instance
+ Metadata configurableMetadata = new Metadata();
+ configurableMetadata.set(Metadata.CONTENT_TYPE,
"application/test+configurable");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(),
configurableMetadata, new ParseContext());
+ }
+
+ assertEquals("with-default-config",
configurableMetadata.get("parser-name"));
+ assertEquals("1024", configurableMetadata.get("buffer-size"));
+
+ // Verify FallbackTestParser was loaded from SPI
+ Metadata fallbackMetadata = new Metadata();
+ fallbackMetadata.set(Metadata.CONTENT_TYPE,
"application/test+fallback");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(),
fallbackMetadata, new ParseContext());
+ }
+
+ // FallbackTestParser should be loaded from SPI with default config
+ assertEquals("success", fallbackMetadata.get("fallback-parser"));
+ }
+
+ @Test
+ public void testWithoutDefaultParserSkipsSpiParsers() throws Exception {
+ // Config does NOT have "default-parser" so should only load
configured parsers
+ URL configUrl =
getClass().getResource("/configs/test-no-spi-fallback.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ ParseContext context = new ParseContext();
+
+ // Verify ConfigurableTestParser is supported (explicitly configured)
+ assertTrue(compositeParser.getSupportedTypes(context)
+
.contains(MediaType.parse("application/test+configurable")),
+ "Should support application/test+configurable");
+
+ // Verify FallbackTestParser is NOT supported (not configured, SPI
skipped)
+ assertTrue(!compositeParser.getSupportedTypes(context)
+
.contains(MediaType.parse("application/test+fallback")),
+ "Should NOT support application/test+fallback");
+
+ // Verify MinimalTestParser is NOT supported (not configured, SPI
skipped)
+ assertTrue(!compositeParser.getSupportedTypes(context)
+ .contains(MediaType.parse("application/test+minimal")),
+ "Should NOT support application/test+minimal");
+ }
+
+ @Test
+ public void testDefaultParserWithExclusions() throws Exception {
+ // Config has "default-parser" with exclude list
+ URL configUrl =
getClass().getResource("/configs/test-default-parser-with-exclusions.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ ParseContext context = new ParseContext();
+
+ // Verify ConfigurableTestParser is supported (explicitly configured)
+ assertTrue(compositeParser.getSupportedTypes(context)
+
.contains(MediaType.parse("application/test+configurable")),
+ "Should support application/test+configurable");
+
+ // Verify MinimalTestParser is NOT supported (excluded via
default-parser config)
+ assertTrue(!compositeParser.getSupportedTypes(context)
+ .contains(MediaType.parse("application/test+minimal")),
+ "Should NOT support application/test+minimal (excluded)");
+
+ // Verify FallbackTestParser is NOT supported (excluded via
default-parser config)
+ assertTrue(!compositeParser.getSupportedTypes(context)
+
.contains(MediaType.parse("application/test+fallback")),
+ "Should NOT support application/test+fallback (excluded)");
+ }
+
+ @Test
+ public void testOptInParserExplicitLoad() throws Exception {
+ // Config explicitly loads opt-in parser (spi=false)
+ URL configUrl =
getClass().getResource("/configs/test-opt-in-parser-explicit.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ // Parse with the opt-in parser
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "application/test+optin");
+
+ try (InputStream stream = new
ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) {
+ compositeParser.parse(stream, new DefaultHandler(), metadata, new
ParseContext());
+ }
+
+ // Verify opt-in parser was loaded
+ assertEquals("opt-in", metadata.get("parser-type"));
+ assertEquals("success", metadata.get("opt-in-parser"));
+ }
+
+ @Test
+ public void testOptInParserNotLoadedBySpi() throws Exception {
+ // Config uses default-parser - should NOT load opt-in parser
(spi=false)
+ URL configUrl =
getClass().getResource("/configs/test-opt-in-parser-with-default.json");
+ Path configPath = Path.of(configUrl.toURI());
+
+ TikaLoader loader = TikaLoader.load(configPath);
+ Parser compositeParser = loader.loadParsers();
+
+ ParseContext context = new ParseContext();
+
+ // Verify regular SPI parsers are supported
+ assertTrue(compositeParser.getSupportedTypes(context)
+
.contains(MediaType.parse("application/test+configurable")),
+ "Should support application/test+configurable (SPI)");
+
+ // Verify opt-in parser is NOT supported (spi=false, not explicitly
configured)
+ assertTrue(!compositeParser.getSupportedTypes(context)
+ .contains(MediaType.parse("application/test+optin")),
+ "Should NOT support application/test+optin (opt-in only, not
in SPI)");
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
index 8c1c45a27..b94305d9b 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
@@ -137,7 +137,7 @@ public class JsonMetadataListTest {
public void testLargeValues() throws Exception {
//TIKA-4154
TikaConfig tikaConfig = null;
- try (InputStream is =
JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) {
+ try (InputStream is =
JsonMetadata.class.getResourceAsStream("/configs/tika-config-json.xml")) {
tikaConfig = new TikaConfig(is);
}
StringBuilder sb = new StringBuilder();
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
index 80d32bdc4..7b658fa46 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
@@ -119,7 +119,7 @@ public class JsonMetadataTest {
public void testLargeValues() throws Exception {
//TIKA-4154
TikaConfig tikaConfig = null;
- try (InputStream is =
JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) {
+ try (InputStream is =
JsonMetadata.class.getResourceAsStream("/configs/tika-config-json.xml")) {
tikaConfig = new TikaConfig(is);
}
StringBuilder sb = new StringBuilder();
diff --git
a/tika-serialization/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-serialization/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 000000000..fbf6d840b
--- /dev/null
+++
b/tika-serialization/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,4 @@
+# Test parsers for tika-serialization module
+org.apache.tika.config.loader.ConfigurableTestParser
+org.apache.tika.config.loader.FallbackTestParser
+org.apache.tika.config.loader.MinimalTestParser
diff --git a/tika-serialization/src/test/resources/META-INF/tika/parsers.idx
b/tika-serialization/src/test/resources/META-INF/tika/parsers.idx
new file mode 100644
index 000000000..99a58d8ed
--- /dev/null
+++ b/tika-serialization/src/test/resources/META-INF/tika/parsers.idx
@@ -0,0 +1,5 @@
+# Test parsers index for tika-serialization module tests
+configurable-test-parser=org.apache.tika.config.loader.ConfigurableTestParser
+fallback-test-parser=org.apache.tika.config.loader.FallbackTestParser
+minimal-test-parser=org.apache.tika.config.loader.MinimalTestParser
+opt-in-test-parser=org.apache.tika.config.loader.OptInTestParser
diff --git
a/tika-serialization/src/test/resources/configs/example-tika-config.json
b/tika-serialization/src/test/resources/configs/example-tika-config.json
new file mode 100644
index 000000000..e6810d34b
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/example-tika-config.json
@@ -0,0 +1,56 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "_decorate": {
+ "mimeInclude": ["application/pdf"],
+ "mimeExclude": ["application/pdf+fdf"],
+ "fallbacks": ["empty-parser"]
+ },
+ "ocrStrategy": "AUTO",
+ "extractInlineImages": true
+ }
+ },
+ {
+ "html-parser": {
+ "_decorate": {
+ "mimeExclude": ["application/xhtml+xml"]
+ },
+ "encoding": "UTF-8"
+ }
+ },
+ {
+ "empty-parser": {}
+ }
+ ],
+ "detectors": [
+ {
+ "mime-magic-detector": {}
+ },
+ {
+ "zip-container-detector": {
+ "maxDepth": 10
+ }
+ },
+ {
+ "type-detector": {}
+ }
+ ],
+ "encodingDetectors": [
+ {
+ "icu4j-detector": {}
+ },
+ {
+ "html-encoding-detector": {}
+ }
+ ],
+ "metadataFilters": [
+ {
+ "field-name-mapping-filter": {
+ "mappings": {
+ "dc:title": "title"
+ }
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-decoration-config.json
b/tika-serialization/src/test/resources/configs/test-decoration-config.json
new file mode 100644
index 000000000..63e5b169e
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/test-decoration-config.json
@@ -0,0 +1,14 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "_decorate": {
+ "mimeInclude": ["application/pdf", "text/plain"],
+ "mimeExclude": ["application/pdf+fdf"]
+ },
+ "name": "filtered-parser",
+ "bufferSize": 4096
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
b/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
new file mode 100644
index 000000000..1d6c1dab9
--- /dev/null
+++
b/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
@@ -0,0 +1,16 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "name": "explicitly-configured",
+ "bufferSize": 4096,
+ "enabled": true
+ }
+ },
+ {
+ "default-parser": {
+ "exclude": ["minimal-test-parser", "fallback-test-parser"]
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-loader-config.json
b/tika-serialization/src/test/resources/configs/test-loader-config.json
new file mode 100644
index 000000000..1c1db9688
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/test-loader-config.json
@@ -0,0 +1,25 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "name": "configured-parser",
+ "bufferSize": 2048,
+ "enabled": true,
+ "mode": "advanced"
+ }
+ },
+ {
+ "fallback-test-parser": {
+ "_decorate": {
+ "mimeInclude": ["application/test+fallback"],
+ "fallbacks": ["minimal-test-parser"]
+ },
+ "message": "primary parser",
+ "failOnPurpose": false
+ }
+ },
+ {
+ "minimal-test-parser": {}
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json
b/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json
new file mode 100644
index 000000000..ec8ee1464
--- /dev/null
+++
b/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json
@@ -0,0 +1,14 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "name": "explicitly-configured",
+ "bufferSize": 4096,
+ "enabled": true
+ }
+ },
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json
b/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json
new file mode 100644
index 000000000..a11d1849d
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json
@@ -0,0 +1,11 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "name": "no-spi-fallback",
+ "bufferSize": 512,
+ "enabled": true
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-opt-in-parser-explicit.json
b/tika-serialization/src/test/resources/configs/test-opt-in-parser-explicit.json
new file mode 100644
index 000000000..d6f3da866
--- /dev/null
+++
b/tika-serialization/src/test/resources/configs/test-opt-in-parser-explicit.json
@@ -0,0 +1,7 @@
+{
+ "parsers": [
+ {
+ "opt-in-test-parser": {}
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-opt-in-parser-with-default.json
b/tika-serialization/src/test/resources/configs/test-opt-in-parser-with-default.json
new file mode 100644
index 000000000..92795240e
--- /dev/null
+++
b/tika-serialization/src/test/resources/configs/test-opt-in-parser-with-default.json
@@ -0,0 +1,7 @@
+{
+ "parsers": [
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-with-default-parser.json
b/tika-serialization/src/test/resources/configs/test-with-default-parser.json
new file mode 100644
index 000000000..3303c4dba
--- /dev/null
+++
b/tika-serialization/src/test/resources/configs/test-with-default-parser.json
@@ -0,0 +1,14 @@
+{
+ "parsers": [
+ {
+ "configurable-test-parser": {
+ "name": "with-default-config",
+ "bufferSize": 1024,
+ "enabled": true
+ }
+ },
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git a/tika-serialization/src/test/resources/config/tika-config-json.xml
b/tika-serialization/src/test/resources/configs/tika-config-json.xml
similarity index 100%
rename from tika-serialization/src/test/resources/config/tika-config-json.xml
rename to tika-serialization/src/test/resources/configs/tika-config-json.xml