This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4544 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d1b61e37bcab038bb7fed646a0b7868e42838edd Author: tallison <[email protected]> AuthorDate: Mon Nov 24 13:25:39 2025 -0500 TIKA-4544 -- first steps to adding a json loader for TikaConfig --- pom.xml | 1 + tika-annotation-processor/pom.xml | 81 ++++++ .../apache/tika/annotation/KebabCaseConverter.java | 65 +++++ .../tika/annotation/TikaComponentProcessor.java | 273 +++++++++++++++++++++ .../services/javax.annotation.processing.Processor | 1 + .../tika/annotation/KebabCaseConverterTest.java | 80 ++++++ .../tika/config/loader/ComponentInstance.java | 67 +++++ .../tika/config/loader/ComponentRegistry.java | 163 ++++++++++++ .../config/loader/CompositeComponentLoader.java | 234 ++++++++++++++++++ .../apache/tika/config/loader/FrameworkConfig.java | 157 ++++++++++++ .../apache/tika/config/loader/ParserLoader.java | 271 ++++++++++++++++++++ .../apache/tika/config/loader/TikaJsonConfig.java | 248 +++++++++++++++++++ .../org/apache/tika/config/loader/TikaLoader.java | 251 +++++++++++++++++++ .../tika/config/loader/ComponentRegistryTest.java | 90 +++++++ .../tika/config/loader/ConfigurableTestParser.java | 125 ++++++++++ .../tika/config/loader/FallbackTestParser.java | 108 ++++++++ .../tika/config/loader/FrameworkConfigTest.java | 125 ++++++++++ .../tika/config/loader/MinimalTestParser.java | 59 +++++ .../apache/tika/config/loader/TikaLoaderTest.java | 247 +++++++++++++++++++ .../tika/serialization/JsonMetadataListTest.java | 2 +- .../tika/serialization/JsonMetadataTest.java | 2 +- .../resources/configs/example-tika-config.json | 56 +++++ .../resources/configs/test-decoration-config.json | 14 ++ .../test/resources/configs/test-loader-config.json | 25 ++ .../configs/test-no-duplicate-parsers.json | 14 ++ .../resources/configs/test-no-spi-fallback.json | 11 + .../configs/test-with-default-parser.json | 14 ++ .../{config => configs}/tika-config-json.xml | 0 28 files changed, 2782 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 53eabbc4d..b1d2a0ae7 100644 
--- a/pom.xml +++ b/pom.xml @@ -38,6 +38,7 @@ <module>tika-parent</module> <module>tika-bom</module> <module>tika-core</module> + <module>tika-annotation-processor</module> <module>tika-serialization</module> <module>tika-plugins-core</module> <module>tika-detectors</module> diff --git a/tika-annotation-processor/pom.xml b/tika-annotation-processor/pom.xml new file mode 100644 index 000000000..65f67e79f --- /dev/null +++ b/tika-annotation-processor/pom.xml @@ -0,0 +1,81 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>4.0.0-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> + </parent> + + <artifactId>tika-annotation-processor</artifactId> + <name>Apache Tika Annotation Processor</name> + <description> + Compile-time annotation processor for @TikaComponent that generates + SPI files and component registries. 
+ </description> + <url>https://tika.apache.org</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + + <!-- Test dependencies --> + + <dependency> + <groupId>com.google.testing.compile</groupId> + <artifactId>compile-testing</artifactId> + <version>0.21.0</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <!-- Disable annotation processing in this module to avoid infinite loop --> + <compilerArgument>-proc:none</compilerArgument> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <configuration> + <archive> + <manifestEntries> + <Automatic-Module-Name>org.apache.tika.annotation.processor</Automatic-Module-Name> + </manifestEntries> + </archive> + </configuration> + </plugin> + </plugins> + </build> +</project> diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java new file mode 100644 index 000000000..ba71f5f0f --- /dev/null +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/KebabCaseConverter.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.annotation; + +/** + * Utility for converting Java class names to kebab-case. + * Used for automatic component name generation from class names. + * + * <p>Examples: + * <ul> + * <li>PDFParser → pdf-parser</li> + * <li>OCRParser → ocr-parser</li> + * <li>HTMLParser → html-parser</li> + * <li>DefaultParser → default-parser</li> + * <li>TesseractOCRParser → tesseract-ocr-parser</li> + * </ul> + */ +public class KebabCaseConverter { + + private KebabCaseConverter() { + // Utility class + } + + /** + * Converts a Java class name to kebab-case. + * + * @param className the simple class name (without package) + * @return the kebab-case version of the name + */ + public static String toKebabCase(String className) { + if (className == null || className.isEmpty()) { + return className; + } + + // Insert hyphen before uppercase letters that follow lowercase letters + // or before uppercase letters that are followed by lowercase letters + String result = className + // Insert hyphen between lowercase and uppercase: "aB" -> "a-B" + .replaceAll("([a-z])([A-Z])", "$1-$2") + // Insert hyphen before uppercase letter followed by lowercase + // in a sequence of uppercase letters: "HTMLParser" -> "HTML-Parser" + .replaceAll("([A-Z]+)([A-Z][a-z])", "$1-$2") + // Insert hyphen between letter and digit: "PDF2Text" -> "PDF2-Text" + .replaceAll("([a-zA-Z])(\\d)", "$1-$2") + // Insert hyphen between digit and letter: "2Text" -> "2-Text" + .replaceAll("(\\d)([a-zA-Z])", "$1-$2") + .toLowerCase(); + + return result; + } +} diff --git 
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java new file mode 100644 index 000000000..d3f3d680b --- /dev/null +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.annotation; + +import javax.annotation.processing.AbstractProcessor; +import javax.annotation.processing.Filer; +import javax.annotation.processing.Messager; +import javax.annotation.processing.ProcessingEnvironment; +import javax.annotation.processing.RoundEnvironment; +import javax.annotation.processing.SupportedAnnotationTypes; +import javax.annotation.processing.SupportedSourceVersion; +import javax.lang.model.SourceVersion; +import javax.lang.model.element.Element; +import javax.lang.model.element.TypeElement; +import javax.lang.model.type.DeclaredType; +import javax.lang.model.type.TypeMirror; +import javax.tools.Diagnostic; +import javax.tools.FileObject; +import javax.tools.StandardLocation; +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.tika.config.TikaComponent; + +/** + * Annotation processor for {@link TikaComponent} that generates: + * <ul> + * <li>Standard Java SPI files (META-INF/services/*) for ServiceLoader</li> + * <li>Component index files (META-INF/tika/*.idx) for name-based lookup</li> + * </ul> + * + * <p>The processor maintains an inclusion list of known Tika service interfaces + * to avoid generating SPI files for utility interfaces like Serializable, Initializable, etc. + */ +@SupportedAnnotationTypes("org.apache.tika.config.TikaComponent") +@SupportedSourceVersion(SourceVersion.RELEASE_11) +public class TikaComponentProcessor extends AbstractProcessor { + + /** + * Known Tika service interfaces for SPI generation. + * Only classes implementing these interfaces will have SPI files generated. 
+ */ + private static final Map<String, String> SERVICE_INTERFACES = new LinkedHashMap<>(); + + static { + // Map interface fully qualified name -> index file name + SERVICE_INTERFACES.put("org.apache.tika.parser.Parser", "parsers"); + SERVICE_INTERFACES.put("org.apache.tika.detect.Detector", "detectors"); + SERVICE_INTERFACES.put("org.apache.tika.detect.EncodingDetector", "encoding-detectors"); + SERVICE_INTERFACES.put("org.apache.tika.language.translate.Translator", "translators"); + SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers"); + SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters"); + } + + private Messager messager; + private Filer filer; + + // Accumulate components across rounds + // Map: service interface name -> set of implementing class names + private final Map<String, Set<String>> spiServices = new HashMap<>(); + + // Map: index file name -> map of (component name -> class name) + private final Map<String, Map<String, String>> indexFiles = new HashMap<>(); + + @Override + public synchronized void init(ProcessingEnvironment processingEnv) { + super.init(processingEnv); + this.messager = processingEnv.getMessager(); + this.filer = processingEnv.getFiler(); + } + + @Override + public boolean process(Set<? 
extends TypeElement> annotations, RoundEnvironment roundEnv) { + if (roundEnv.processingOver()) { + // Final round - write accumulated data + writeServiceFiles(); + writeIndexFiles(); + return true; + } + + for (Element element : roundEnv.getElementsAnnotatedWith(TikaComponent.class)) { + if (element instanceof TypeElement) { + processComponent((TypeElement) element); + } + } + + return true; + } + + private void processComponent(TypeElement element) { + String className = element.getQualifiedName().toString(); + TikaComponent annotation = element.getAnnotation(TikaComponent.class); + + // Determine component name + String componentName = annotation.name(); + if (componentName == null || componentName.isEmpty()) { + // Auto-generate from class name + String simpleName = element.getSimpleName().toString(); + componentName = KebabCaseConverter.toKebabCase(simpleName); + } + + messager.printMessage(Diagnostic.Kind.NOTE, + "Processing @TikaComponent: " + className + " -> " + componentName); + + // Find all implemented service interfaces + List<String> serviceInterfaces = findServiceInterfaces(element); + + if (serviceInterfaces.isEmpty()) { + messager.printMessage(Diagnostic.Kind.WARNING, + "Class " + className + " annotated with @TikaComponent " + + "but does not implement any known Tika service interface", element); + return; + } + + // Add to SPI services + for (String serviceInterface : serviceInterfaces) { + spiServices.computeIfAbsent(serviceInterface, k -> new LinkedHashSet<>()) + .add(className); + + // Add to index files + String indexFileName = SERVICE_INTERFACES.get(serviceInterface); + if (indexFileName != null) { + Map<String, String> index = indexFiles.computeIfAbsent(indexFileName, + k -> new LinkedHashMap<>()); + + // Check for duplicate names + if (index.containsKey(componentName)) { + String existingClass = index.get(componentName); + if (!existingClass.equals(className)) { + messager.printMessage(Diagnostic.Kind.ERROR, + "Duplicate component name '" 
+ componentName + "' for classes: " + + existingClass + " and " + className, element); + } + } else { + index.put(componentName, className); + } + } + } + } + + /** + * Finds all Tika service interfaces implemented by the given type element. + */ + private List<String> findServiceInterfaces(TypeElement element) { + List<String> result = new ArrayList<>(); + Set<String> visited = new LinkedHashSet<>(); + findServiceInterfacesRecursive(element.asType(), result, visited); + return result; + } + + /** + * Recursively searches for service interfaces in the type hierarchy. + */ + private void findServiceInterfacesRecursive(TypeMirror type, List<String> result, + Set<String> visited) { + if (type == null || !(type instanceof DeclaredType)) { + return; + } + + DeclaredType declaredType = (DeclaredType) type; + TypeElement typeElement = (TypeElement) declaredType.asElement(); + String typeName = typeElement.getQualifiedName().toString(); + + // Avoid infinite loops + if (!visited.add(typeName)) { + return; + } + + // Check if this is a service interface + if (SERVICE_INTERFACES.containsKey(typeName)) { + if (!result.contains(typeName)) { + result.add(typeName); + } + } + + // Check superclass + TypeMirror superclass = typeElement.getSuperclass(); + findServiceInterfacesRecursive(superclass, result, visited); + + // Check interfaces + for (TypeMirror interfaceType : typeElement.getInterfaces()) { + findServiceInterfacesRecursive(interfaceType, result, visited); + } + } + + /** + * Writes META-INF/services files for Java SPI. 
+ */ + private void writeServiceFiles() { + for (Map.Entry<String, Set<String>> entry : spiServices.entrySet()) { + String serviceInterface = entry.getKey(); + Set<String> implementations = entry.getValue(); + + try { + FileObject file = filer.createResource(StandardLocation.CLASS_OUTPUT, "", + "META-INF/services/" + serviceInterface); + + try (Writer writer = file.openWriter()) { + writer.write("# Generated by TikaComponentProcessor\n"); + writer.write("# Do not edit manually\n"); + for (String impl : implementations) { + writer.write(impl); + writer.write("\n"); + } + } + + messager.printMessage(Diagnostic.Kind.NOTE, + "Generated SPI file: META-INF/services/" + serviceInterface + + " with " + implementations.size() + " implementations"); + + } catch (IOException e) { + messager.printMessage(Diagnostic.Kind.ERROR, + "Failed to write SPI file for " + serviceInterface + ": " + e.getMessage()); + } + } + } + + /** + * Writes META-INF/tika/*.idx files for name-based component lookup. + */ + private void writeIndexFiles() { + for (Map.Entry<String, Map<String, String>> entry : indexFiles.entrySet()) { + String fileName = entry.getKey(); + Map<String, String> components = entry.getValue(); + + try { + FileObject file = filer.createResource(StandardLocation.CLASS_OUTPUT, "", + "META-INF/tika/" + fileName + ".idx"); + + try (Writer writer = file.openWriter()) { + writer.write("# Generated by TikaComponentProcessor\n"); + writer.write("# Do not edit manually\n"); + writer.write("# Format: component-name=fully.qualified.ClassName\n"); + for (Map.Entry<String, String> component : components.entrySet()) { + writer.write(component.getKey()); + writer.write("="); + writer.write(component.getValue()); + writer.write("\n"); + } + } + + messager.printMessage(Diagnostic.Kind.NOTE, + "Generated index file: META-INF/tika/" + fileName + ".idx" + + " with " + components.size() + " components"); + + } catch (IOException e) { + messager.printMessage(Diagnostic.Kind.ERROR, + "Failed to 
write index file " + fileName + ": " + e.getMessage()); + } + } + } +} diff --git a/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor b/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor new file mode 100644 index 000000000..462f29223 --- /dev/null +++ b/tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor @@ -0,0 +1 @@ +org.apache.tika.annotation.TikaComponentProcessor diff --git a/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java b/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java new file mode 100644 index 000000000..d0ebacdd2 --- /dev/null +++ b/tika-annotation-processor/src/test/java/org/apache/tika/annotation/KebabCaseConverterTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.annotation; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for KebabCaseConverter. 
+ */ +public class KebabCaseConverterTest { + + @Test + public void testSimpleClassName() { + assertEquals("parser", KebabCaseConverter.toKebabCase("Parser")); + assertEquals("detector", KebabCaseConverter.toKebabCase("Detector")); + } + + @Test + public void testTwoWordClassName() { + assertEquals("pdf-parser", KebabCaseConverter.toKebabCase("PDFParser")); + assertEquals("html-parser", KebabCaseConverter.toKebabCase("HTMLParser")); + assertEquals("ocr-parser", KebabCaseConverter.toKebabCase("OCRParser")); + } + + @Test + public void testMixedCase() { + assertEquals("default-parser", KebabCaseConverter.toKebabCase("DefaultParser")); + assertEquals("composite-detector", KebabCaseConverter.toKebabCase("CompositeDetector")); + } + + @Test + public void testAcronymsFollowedByWord() { + assertEquals("html-parser", KebabCaseConverter.toKebabCase("HTMLParser")); + assertEquals("xml-parser", KebabCaseConverter.toKebabCase("XMLParser")); + assertEquals("tesseract-ocr-parser", KebabCaseConverter.toKebabCase("TesseractOCRParser")); + } + + @Test + public void testNumbersInName() { + assertEquals("pdf-2-text-parser", KebabCaseConverter.toKebabCase("PDF2TextParser")); + assertEquals("mp-3-parser", KebabCaseConverter.toKebabCase("MP3Parser")); + } + + @Test + public void testEdgeCases() { + assertEquals("", KebabCaseConverter.toKebabCase(null)); + assertEquals("", KebabCaseConverter.toKebabCase("")); + assertEquals("a", KebabCaseConverter.toKebabCase("A")); + assertEquals("ab", KebabCaseConverter.toKebabCase("AB")); + } + + @Test + public void testAlreadyLowerCase() { + assertEquals("parser", KebabCaseConverter.toKebabCase("parser")); + } + + @Test + public void testComplexNames() { + assertEquals("microsoft-office-parser", + KebabCaseConverter.toKebabCase("MicrosoftOfficeParser")); + assertEquals("zip-container-detector", + KebabCaseConverter.toKebabCase("ZipContainerDetector")); + } +} diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java new file mode 100644 index 000000000..85594c4ad --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInstance.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +/** + * Holds a component instance along with its metadata (priority, etc.). 
+ * + * @param <T> the component type + */ +public class ComponentInstance<T> implements Comparable<ComponentInstance<T>> { + + private final String name; + private final T instance; + private final int priority; + + public ComponentInstance(String name, T instance, int priority) { + this.name = name; + this.instance = instance; + this.priority = priority; + } + + public String getName() { + return name; + } + + public T getInstance() { + return instance; + } + + public int getPriority() { + return priority; + } + + @Override + public int compareTo(ComponentInstance<T> other) { + // Lower priority value = higher priority (processed first) + int priorityCompare = Integer.compare(this.priority, other.priority); + if (priorityCompare != 0) { + return priorityCompare; + } + // Secondary sort by name for stability + return this.name.compareTo(other.name); + } + + @Override + public String toString() { + return "ComponentInstance{" + + "name='" + name + '\'' + + ", instance=" + instance.getClass().getSimpleName() + + ", priority=" + priority + + '}'; + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java new file mode 100644 index 000000000..23c343edf --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.Enumeration; +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Registry for looking up Tika component classes by name. + * Loads component name-to-class mappings from META-INF/tika/*.idx files + * generated by the {@code @TikaComponent} annotation processor. + */ +public class ComponentRegistry { + + private final Map<String, Class<?>> components; + private final ClassLoader classLoader; + + /** + * Creates a component registry by loading the specified index file. + * + * @param indexFileName the index file name (e.g., "parsers", "detectors") + * without the .idx extension + * @param classLoader the class loader to use for loading classes + * @throws TikaConfigException if the index file cannot be loaded + */ + public ComponentRegistry(String indexFileName, ClassLoader classLoader) + throws TikaConfigException { + this.classLoader = classLoader; + this.components = loadComponents(indexFileName); + } + + /** + * Looks up a component class by name. 
+ * + * @param name the component name (e.g., "pdf-parser") + * @return the component class + * @throws TikaConfigException if the component name is not found + */ + public Class<?> getComponentClass(String name) throws TikaConfigException { + Class<?> clazz = components.get(name); + if (clazz == null) { + throw new TikaConfigException("Unknown component name: '" + name + "'. " + + "Available components: " + components.keySet()); + } + return clazz; + } + + /** + * Returns all registered component names. + * + * @return unmodifiable map of component names to classes + */ + public Map<String, Class<?>> getAllComponents() { + return Collections.unmodifiableMap(components); + } + + /** + * Checks if a component with the given name is registered. + * + * @param name the component name + * @return true if the component is registered + */ + public boolean hasComponent(String name) { + return components.containsKey(name); + } + + private Map<String, Class<?>> loadComponents(String indexFileName) + throws TikaConfigException { + Map<String, Class<?>> result = new LinkedHashMap<>(); + String resourcePath = "META-INF/tika/" + indexFileName + ".idx"; + + try { + Enumeration<URL> resources = classLoader.getResources(resourcePath); + + if (!resources.hasMoreElements()) { + throw new TikaConfigException("Component index file not found: " + resourcePath); + } + + while (resources.hasMoreElements()) { + URL url = resources.nextElement(); + loadFromUrl(url, result); + } + + } catch (IOException e) { + throw new TikaConfigException("Failed to load component index: " + resourcePath, e); + } + + return result; + } + + private void loadFromUrl(URL url, Map<String, Class<?>> result) throws TikaConfigException { + try (InputStream in = url.openStream(); + BufferedReader reader = new BufferedReader( + new InputStreamReader(in, StandardCharsets.UTF_8))) { + + String line; + int lineNumber = 0; + + while ((line = reader.readLine()) != null) { + lineNumber++; + line = line.trim(); + + // 
Skip comments and empty lines + if (line.isEmpty() || line.startsWith("#")) { + continue; + } + + // Parse: component-name=fully.qualified.ClassName + int equalsIndex = line.indexOf('='); + if (equalsIndex == -1) { + throw new TikaConfigException( + "Invalid index file format at " + url + " line " + lineNumber + + ": expected 'name=class', got: " + line); + } + + String name = line.substring(0, equalsIndex).trim(); + String className = line.substring(equalsIndex + 1).trim(); + + if (name.isEmpty() || className.isEmpty()) { + throw new TikaConfigException( + "Invalid index file format at " + url + " line " + lineNumber + + ": name or class is empty"); + } + + // Load the class + try { + Class<?> clazz = classLoader.loadClass(className); + result.put(name, clazz); + } catch (ClassNotFoundException e) { + throw new TikaConfigException( + "Component class not found: " + className + " (from " + url + ")", e); + } + } + + } catch (IOException e) { + throw new TikaConfigException("Failed to read component index from: " + url, e); + } + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java new file mode 100644 index 000000000..69174ed66 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.utils.ServiceLoaderUtils;

/**
 * Generic loader for Tika components (detectors, encoding detectors, filters, etc.).
 * Supports two loading modes:
 * <ul>
 *   <li>Array-based: explicit ordering, no SPI fallback (default for non-parsers)</li>
 *   <li>SPI-only: all components from ServiceLoader (when config section missing)</li>
 * </ul>
 *
 * @param <T> the component type
 */
public class CompositeComponentLoader<T> {

    private static final Logger LOG = LoggerFactory.getLogger(CompositeComponentLoader.class);

    // The component interface used for java.util.ServiceLoader discovery
    private final Class<T> componentInterface;
    // JSON config key (e.g. "detectors"); also used in error/log messages
    private final String componentTypeName;
    // Name of the classpath index file mapping component names to classes
    private final String indexFileName;
    private final ClassLoader classLoader;
    private final ObjectMapper objectMapper;

    /**
     * Creates a component loader.
     *
     * @param componentInterface the component interface (e.g., Detector.class)
     * @param componentTypeName  the JSON config key (e.g., "detectors")
     * @param indexFileName      the index file name (e.g., "detectors")
     * @param classLoader        the class loader
     * @param objectMapper       the Jackson ObjectMapper
     */
    public CompositeComponentLoader(Class<T> componentInterface, String componentTypeName,
                                    String indexFileName, ClassLoader classLoader,
                                    ObjectMapper objectMapper) {
        this.componentInterface = componentInterface;
        this.componentTypeName = componentTypeName;
        this.indexFileName = indexFileName;
        this.classLoader = classLoader;
        this.objectMapper = objectMapper;
    }

    /**
     * Loads components from array-based JSON config.
     * If the config section exists, uses only explicitly listed components (no SPI fallback);
     * an explicitly empty array yields no components.
     * If the config section is missing, uses SPI to discover all components.
     *
     * @param config the Tika JSON configuration
     * @return ordered list of component instances
     * @throws TikaConfigException if loading fails
     */
    public List<T> loadFromArray(TikaJsonConfig config) throws TikaConfigException {
        if (!config.hasComponentSection(componentTypeName)) {
            // Section doesn't exist - use SPI fallback
            return discoverViaSpi();
        }

        // Section exists - load only explicitly configured components (no SPI)
        List<Map.Entry<String, JsonNode>> arrayComponents =
                config.getArrayComponents(componentTypeName);

        if (arrayComponents.isEmpty()) {
            // Explicit empty array means no components
            return Collections.emptyList();
        }

        ComponentRegistry registry = new ComponentRegistry(indexFileName, classLoader);
        List<T> instances = new ArrayList<>();

        for (Map.Entry<String, JsonNode> entry : arrayComponents) {
            instances.add(loadComponent(entry.getKey(), entry.getValue(), registry));
        }

        return instances;
    }

    /**
     * Loads components from JSON config with SPI fallback (used by parsers):
     * configured components first, then every SPI-discovered component is appended.
     *
     * @param config the Tika JSON configuration
     * @return list of component instances
     * @throws TikaConfigException if loading fails
     */
    public List<T> load(TikaJsonConfig config) throws TikaConfigException {
        List<T> instances = new ArrayList<>();

        // Load configured components (object-format section)
        if (config.hasComponents(componentTypeName)) {
            ComponentRegistry registry = new ComponentRegistry(indexFileName, classLoader);
            Map<String, JsonNode> components = config.getComponents(componentTypeName);

            for (Map.Entry<String, JsonNode> entry : components.entrySet()) {
                instances.add(loadConfiguredComponent(entry.getKey(), entry.getValue(), registry));
            }
        }

        // Add SPI-discovered components after the configured ones
        instances.addAll(discoverViaSpi());

        return instances;
    }

    /**
     * Loads a configured component, first stripping framework-level fields
     * (e.g. "_decorate") from its JSON config before instantiation.
     */
    private T loadConfiguredComponent(String name, JsonNode configNode,
                                      ComponentRegistry registry)
            throws TikaConfigException {
        try {
            Class<?> componentClass = registry.getComponentClass(name);

            // Separate framework fields from the component-specific config
            FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper);

            return instantiateComponent(componentClass, frameworkConfig.getComponentConfigJson());

        } catch (Exception e) {
            throw new TikaConfigException("Failed to load component '" + name + "' of type "
                    + componentTypeName, e);
        }
    }

    /**
     * Loads an array-format component; the full JSON node is passed through
     * verbatim as the component's config (no framework-field extraction).
     */
    private T loadComponent(String name, JsonNode configNode, ComponentRegistry registry)
            throws TikaConfigException {
        try {
            Class<?> componentClass = registry.getComponentClass(name);

            return instantiateComponent(componentClass,
                    objectMapper.writeValueAsString(configNode));

        } catch (Exception e) {
            throw new TikaConfigException("Failed to load component '" + name + "' of type "
                    + componentTypeName, e);
        }
    }

    /**
     * Instantiates a component, preferring a single-String (JSON config) constructor
     * and falling back to the zero-arg constructor via ServiceLoaderUtils.
     */
    @SuppressWarnings("unchecked")
    private T instantiateComponent(Class<?> componentClass, String configJson)
            throws TikaConfigException {
        try {
            try {
                // Preferred: constructor taking the component's JSON config as a String
                Constructor<?> constructor = componentClass.getConstructor(String.class);
                return (T) constructor.newInstance(configJson);
            } catch (NoSuchMethodException e) {
                // Fall back to zero-arg constructor
                return (T) ServiceLoaderUtils.newInstance(componentClass,
                        new org.apache.tika.config.ServiceLoader(classLoader));
            }
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
            throw new TikaConfigException("Failed to instantiate component: "
                    + componentClass.getName(), e);
        }
    }

    /**
     * Discovers all components of this type via {@link ServiceLoader}.
     * Providers that fail to load are logged and skipped rather than aborting discovery.
     *
     * <p>NOTE: this consolidates two previously duplicated, byte-identical private
     * methods ({@code loadSpiComponents()} and {@code loadAllFromSpi()}) into one.
     */
    private List<T> discoverViaSpi() {
        List<T> result = new ArrayList<>();
        ServiceLoader<T> serviceLoader = ServiceLoader.load(componentInterface, classLoader);

        Iterator<T> iterator = serviceLoader.iterator();
        while (iterator.hasNext()) {
            try {
                result.add(iterator.next());
            } catch (Exception e) {
                // Log and skip problematic SPI providers
                LOG.warn("Failed to load SPI component of type {}: {}",
                        componentTypeName, e.getMessage(), e);
            }
        }

        return result;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

/**
 * Extracts framework-level configuration from component JSON,
 * separating fields prefixed with underscore from component-specific config.
 *
 * <p>Framework fields:
 * <ul>
 *   <li>{@code _decorate} - Parser decoration config (mime filtering, fallbacks)</li>
 * </ul>
 */
public class FrameworkConfig {

    private static final String DECORATE_KEY = "_decorate";

    // null when the component config carried no (non-empty) "_decorate" section
    private final ParserDecoration decoration;
    // the component-specific config re-serialized as JSON, framework fields removed
    private final String componentConfigJson;

    private FrameworkConfig(ParserDecoration decoration, String componentConfigJson) {
        this.decoration = decoration;
        this.componentConfigJson = componentConfigJson;
    }

    /**
     * Extracts framework config from a JSON node, returning the cleaned component config.
     * Non-object nodes (including null) are passed through unchanged with no decoration.
     *
     * @param configNode   the configuration JSON node
     * @param objectMapper the Jackson ObjectMapper for serialization
     * @return the framework config
     * @throws IOException if JSON processing fails
     */
    public static FrameworkConfig extract(JsonNode configNode,
                                          ObjectMapper objectMapper) throws IOException {
        if (configNode == null || !configNode.isObject()) {
            return new FrameworkConfig(null,
                    objectMapper.writeValueAsString(configNode));
        }

        // Deep-copy so the caller's node is never mutated by the remove() below
        ObjectNode objNode = (ObjectNode) configNode.deepCopy();

        // Extract decoration (parser-specific)
        ParserDecoration decoration = null;
        if (objNode.has(DECORATE_KEY)) {
            JsonNode decorateNode = objNode.remove(DECORATE_KEY);
            decoration = parseDecoration(decorateNode);
        }

        // Remaining fields are component-specific config
        String componentConfigJson = objectMapper.writeValueAsString(objNode);

        return new FrameworkConfig(decoration, componentConfigJson);
    }

    /**
     * Parses a "_decorate" node; returns null for non-objects or when all three
     * lists are empty (treated as "no decoration requested").
     */
    private static ParserDecoration parseDecoration(JsonNode decorateNode) {
        if (decorateNode == null || !decorateNode.isObject()) {
            return null;
        }

        List<String> mimeInclude = parseStringList(decorateNode.get("mimeInclude"));
        List<String> mimeExclude = parseStringList(decorateNode.get("mimeExclude"));
        List<String> fallbacks = parseStringList(decorateNode.get("fallbacks"));

        if (mimeInclude.isEmpty() && mimeExclude.isEmpty() && fallbacks.isEmpty()) {
            return null;
        }

        return new ParserDecoration(mimeInclude, mimeExclude, fallbacks);
    }

    /**
     * Accepts either a JSON array of strings (non-string items are skipped)
     * or a single string; anything else yields an empty list.
     */
    private static List<String> parseStringList(JsonNode node) {
        if (node == null) {
            return Collections.emptyList();
        }

        List<String> result = new ArrayList<>();
        if (node.isArray()) {
            for (JsonNode item : node) {
                if (item.isTextual()) {
                    result.add(item.asText());
                }
            }
        } else if (node.isTextual()) {
            result.add(node.asText());
        }

        return result;
    }

    public ParserDecoration getDecoration() {
        return decoration;
    }

    public String getComponentConfigJson() {
        return componentConfigJson;
    }

    /**
     * Parser decoration configuration for mime type filtering and fallbacks.
     * Immutable; all accessors return unmodifiable lists.
     */
    public static class ParserDecoration {
        private final List<String> mimeInclude;
        private final List<String> mimeExclude;
        private final List<String> fallbacks;

        /**
         * Creates a decoration config. Null lists are treated as empty, and the
         * supplied lists are defensively copied so later caller-side mutation
         * cannot leak into this immutable object.
         */
        public ParserDecoration(List<String> mimeInclude, List<String> mimeExclude,
                                List<String> fallbacks) {
            this.mimeInclude = copyOrEmpty(mimeInclude);
            this.mimeExclude = copyOrEmpty(mimeExclude);
            this.fallbacks = copyOrEmpty(fallbacks);
        }

        // Null-safe defensive copy; the original wrapped the caller's list directly
        // and threw NPE on null input
        private static List<String> copyOrEmpty(List<String> list) {
            if (list == null || list.isEmpty()) {
                return Collections.emptyList();
            }
            return Collections.unmodifiableList(new ArrayList<>(list));
        }

        public List<String> getMimeInclude() {
            return mimeInclude;
        }

        public List<String> getMimeExclude() {
            return mimeExclude;
        }

        public List<String> getFallbacks() {
            return fallbacks;
        }

        /** True if any mime include/exclude filtering was configured. */
        public boolean hasFiltering() {
            return !mimeInclude.isEmpty() || !mimeExclude.isEmpty();
        }

        /** True if any fallback parsers were configured. */
        public boolean hasFallbacks() {
            return !fallbacks.isEmpty();
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
import java.util.Set;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
import org.apache.tika.parser.multiple.FallbackParser;
import org.apache.tika.utils.ServiceLoaderUtils;

/**
 * Loader for parsers with support for decoration (mime type filtering, fallbacks).
 */
public class ParserLoader {

    private static final Logger LOG = LoggerFactory.getLogger(ParserLoader.class);

    /**
     * Special marker entry in the "parsers" array: it is not a real parser;
     * its presence enables SPI discovery for parsers not listed in the config.
     */
    private static final String DEFAULT_PARSER_NAME = "default-parser";

    private final ClassLoader classLoader;
    private final ObjectMapper objectMapper;
    private final MediaTypeRegistry mediaTypeRegistry;

    /**
     * Holds parsed config data before decoration is applied.
     */
    private static class ParsedParserConfig {
        // configured component name (kept for diagnostics)
        final String name;
        // the undecorated parser instance
        final Parser parser;
        // decoration directives, or null if none configured
        final FrameworkConfig.ParserDecoration decoration;

        ParsedParserConfig(String name, Parser parser,
                           FrameworkConfig.ParserDecoration decoration) {
            this.name = name;
            this.parser = parser;
            this.decoration = decoration;
        }
    }

    public ParserLoader(ClassLoader classLoader, ObjectMapper objectMapper,
                        MediaTypeRegistry mediaTypeRegistry) {
        this.classLoader = classLoader;
        this.objectMapper = objectMapper;
        this.mediaTypeRegistry = mediaTypeRegistry;
    }

    /**
     * Loads parsers from JSON config and builds a CompositeParser.
     *
     * <p>If the "parsers" section is absent, all parsers come from SPI.
     * If present, only configured parsers are loaded; SPI parsers are appended
     * only when the special "default-parser" marker is listed (excluding classes
     * already configured explicitly).
     *
     * @param config the Tika JSON configuration
     * @return the composite parser
     * @throws TikaConfigException if loading fails
     */
    public CompositeParser load(TikaJsonConfig config) throws TikaConfigException {
        List<Parser> parserList = new ArrayList<>();

        if (config.hasComponentSection("parsers")) {
            ComponentRegistry registry = new ComponentRegistry("parsers", classLoader);
            List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");

            boolean hasDefaultParser = containsDefaultParser(parsers);

            // First pass: parse configs and instantiate parsers.
            // Skip "default-parser" - it's a special marker for SPI fallback, not a real parser
            Map<String, ParsedParserConfig> parsedConfigs = new LinkedHashMap<>();
            for (Map.Entry<String, JsonNode> entry : parsers) {
                String name = entry.getKey();
                if (DEFAULT_PARSER_NAME.equals(name)) {
                    continue;
                }
                parsedConfigs.put(name,
                        loadConfiguredParser(name, entry.getValue(), registry));
            }

            // Track configured parser classes (before decoration) to avoid SPI duplicates
            Set<Class<?>> configuredParserClasses = new HashSet<>();
            for (ParsedParserConfig parsed : parsedConfigs.values()) {
                configuredParserClasses.add(parsed.parser.getClass());
            }

            // Second pass: apply decorations; fallbacks may reference other configured
            // parsers by name, so all parsers must already be instantiated.
            // NOTE(review): fallbacks reference the UNDECORATED instance of the named
            // parser (decoration results are not written back into parsedConfigs) -
            // confirm this is intended.
            for (ParsedParserConfig parsed : parsedConfigs.values()) {
                Parser parser = parsed.parser;

                if (parsed.decoration != null) {
                    if (parsed.decoration.hasFiltering()) {
                        parser = applyMimeFiltering(parser, parsed.decoration);
                    }
                    if (parsed.decoration.hasFallbacks()) {
                        parser = applyFallbacks(parser, parsed.decoration, parsedConfigs);
                    }
                }

                parserList.add(parser);
            }

            // SPI parsers are appended only when "default-parser" was listed
            if (hasDefaultParser) {
                parserList.addAll(loadSpiParsers(configuredParserClasses));
                LOG.debug("Loading SPI parsers because 'default-parser' is in config");
            } else {
                LOG.debug("Skipping SPI parsers - 'default-parser' not in config");
            }
        } else {
            // No configured parsers - load all from SPI
            parserList.addAll(loadSpiParsers(Collections.emptySet()));
        }

        return new CompositeParser(mediaTypeRegistry, parserList);
    }

    /** Returns true if the special "default-parser" marker appears in the parser list. */
    private static boolean containsDefaultParser(List<Map.Entry<String, JsonNode>> parsers) {
        for (Map.Entry<String, JsonNode> entry : parsers) {
            if (DEFAULT_PARSER_NAME.equals(entry.getKey())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resolves, configures, and instantiates one configured parser, capturing
     * its decoration directives for the second pass.
     */
    private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode,
                                                    ComponentRegistry registry)
            throws TikaConfigException {
        try {
            Class<?> parserClass = registry.getComponentClass(name);

            // Separate framework fields ("_decorate") from the parser's own config
            FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper);

            Parser parser = instantiateParser(parserClass,
                    frameworkConfig.getComponentConfigJson());

            return new ParsedParserConfig(name, parser, frameworkConfig.getDecoration());

        } catch (Exception e) {
            throw new TikaConfigException("Failed to load parser '" + name + "'", e);
        }
    }

    /**
     * Instantiates a parser, preferring a single-String (JSON config) constructor
     * and falling back to the zero-arg constructor via ServiceLoaderUtils.
     * (The original carried a dead {@code @SuppressWarnings("unchecked")} - the
     * casts here are checked casts to the concrete Parser type.)
     */
    private Parser instantiateParser(Class<?> parserClass, String configJson)
            throws TikaConfigException {
        try {
            try {
                Constructor<?> constructor = parserClass.getConstructor(String.class);
                return (Parser) constructor.newInstance(configJson);
            } catch (NoSuchMethodException e) {
                // Fall back to zero-arg constructor
                return (Parser) ServiceLoaderUtils.newInstance(parserClass,
                        new org.apache.tika.config.ServiceLoader(classLoader));
            }
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
            throw new TikaConfigException("Failed to instantiate parser: "
                    + parserClass.getName(), e);
        }
    }

    /**
     * Wraps the parser with include- and/or exclude-type decorators.
     * Includes are applied first, then excludes on top.
     */
    private Parser applyMimeFiltering(Parser parser, FrameworkConfig.ParserDecoration decoration) {
        List<String> includes = decoration.getMimeInclude();
        List<String> excludes = decoration.getMimeExclude();

        if (!includes.isEmpty()) {
            Set<MediaType> includeTypes = new HashSet<>();
            for (String mimeStr : includes) {
                includeTypes.add(MediaType.parse(mimeStr));
            }
            parser = ParserDecorator.withTypes(parser, includeTypes);
        }

        if (!excludes.isEmpty()) {
            Set<MediaType> excludeTypes = new HashSet<>();
            for (String mimeStr : excludes) {
                excludeTypes.add(MediaType.parse(mimeStr));
            }
            parser = ParserDecorator.withoutTypes(parser, excludeTypes);
        }

        return parser;
    }

    /**
     * Builds a FallbackParser chain: the primary parser first, then each named
     * fallback (which must be another configured parser).
     *
     * @throws TikaConfigException if a fallback name is not a configured parser
     */
    private Parser applyFallbacks(Parser parser, FrameworkConfig.ParserDecoration decoration,
                                  Map<String, ParsedParserConfig> parsedConfigs)
            throws TikaConfigException {

        List<Parser> fallbackParsers = new ArrayList<>();
        fallbackParsers.add(parser); // Primary parser first

        for (String fallbackName : decoration.getFallbacks()) {
            ParsedParserConfig fallbackConfig = parsedConfigs.get(fallbackName);
            if (fallbackConfig == null) {
                throw new TikaConfigException("Unknown fallback parser: " + fallbackName);
            }
            fallbackParsers.add(fallbackConfig.parser);
        }

        return new FallbackParser(mediaTypeRegistry, MetadataPolicy.KEEP_ALL, fallbackParsers);
    }

    /**
     * Discovers parsers via {@link ServiceLoader}, skipping any whose class was
     * already loaded from the config and logging (not propagating) provider failures.
     */
    private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) {
        List<Parser> result = new ArrayList<>();
        ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, classLoader);

        Iterator<Parser> iterator = serviceLoader.iterator();
        while (iterator.hasNext()) {
            try {
                Parser parser = iterator.next();

                if (excludeClasses.contains(parser.getClass())) {
                    LOG.debug("Skipping SPI parser {} - already configured",
                            parser.getClass().getName());
                    continue;
                }

                result.add(parser);
            } catch (Exception e) {
                // Log and skip problematic SPI providers
                LOG.warn("Failed to load SPI parser: {}", e.getMessage(), e);
            }
        }

        return result;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.tika.exception.TikaConfigException;

/**
 * Parsed representation of a Tika JSON configuration file.
 * Provides access to component configurations by type (parsers, detectors, etc.).
 *
 * <p>JSON structure:
 * <pre>
 * {
 *   "parsers": [
 *     { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } },
 *     { "html-parser": { ... } },
 *     { "default-parser": {} }
 *   ],
 *   "detectors": [
 *     { "mime-magic-detector": {} },
 *     { "zip-container-detector": { "maxDepth": 10 } }
 *   ],
 *   ...
 * }
 * </pre>
 *
 * <p>All components use array format for explicit ordering.
 * Parsers support decoration via "_decorate" field.
 * Special "default-parser" entry enables SPI fallback for unlisted parsers.
 */
public class TikaJsonConfig {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final JsonNode rootNode;
    // top-level sections whose value is a JSON object: {name -> config}
    private final Map<String, Map<String, JsonNode>> componentsByType;
    // top-level sections whose value is a JSON array: ordered (name, config) entries
    private final Map<String, List<Map.Entry<String, JsonNode>>> arrayComponentsByType;

    private TikaJsonConfig(JsonNode rootNode) {
        this.rootNode = rootNode;
        this.componentsByType = parseObjectComponents(rootNode);
        this.arrayComponentsByType = parseArrayComponents(rootNode);
    }

    /**
     * Loads configuration from a file.
     *
     * @param configPath the path to the JSON configuration file
     * @return the parsed configuration
     * @throws TikaConfigException if loading or parsing fails
     */
    public static TikaJsonConfig load(Path configPath) throws TikaConfigException {
        try (InputStream in = Files.newInputStream(configPath)) {
            return load(in);
        } catch (IOException e) {
            throw new TikaConfigException("Failed to load config from: " + configPath, e);
        }
    }

    /**
     * Loads configuration from an input stream.
     *
     * @param inputStream the input stream containing JSON configuration
     * @return the parsed configuration
     * @throws TikaConfigException if loading or parsing fails
     */
    public static TikaJsonConfig load(InputStream inputStream) throws TikaConfigException {
        try {
            JsonNode rootNode = OBJECT_MAPPER.readTree(inputStream);
            return new TikaJsonConfig(rootNode);
        } catch (IOException e) {
            throw new TikaConfigException("Failed to parse JSON configuration", e);
        }
    }

    /**
     * Gets component configurations for a specific type (object format - consumed
     * by the SPI-fallback loading path in CompositeComponentLoader).
     *
     * @param componentType the component type key
     * @return map of component name to configuration JSON, or empty map if type not found
     */
    public Map<String, JsonNode> getComponents(String componentType) {
        return componentsByType.getOrDefault(componentType, Collections.emptyMap());
    }

    /**
     * Gets component configurations for a specific type (array format - used for
     * parsers, detectors, and other explicitly ordered component lists).
     *
     * @param componentType the component type (e.g., "detectors")
     * @return ordered list of (name, config) entries, or empty list if type not found
     */
    public List<Map.Entry<String, JsonNode>> getArrayComponents(String componentType) {
        return arrayComponentsByType.getOrDefault(componentType, Collections.emptyList());
    }

    /**
     * Checks if a component type has any configured components (object format).
     *
     * @param componentType the component type
     * @return true if the type has configurations
     */
    public boolean hasComponents(String componentType) {
        Map<String, JsonNode> components = componentsByType.get(componentType);
        return components != null && !components.isEmpty();
    }

    /**
     * Checks if a component type has any configured components (array format).
     *
     * @param componentType the component type
     * @return true if the type has configurations
     */
    public boolean hasArrayComponents(String componentType) {
        List<Map.Entry<String, JsonNode>> components = arrayComponentsByType.get(componentType);
        return components != null && !components.isEmpty();
    }

    /**
     * Checks if a component type section exists in the config (even if empty).
     * Null-safe: a missing/null root node simply reports no sections (the parse
     * helpers already guard a null root; this method previously did not).
     *
     * @param componentType the component type
     * @return true if the section exists
     */
    public boolean hasComponentSection(String componentType) {
        return rootNode != null && rootNode.has(componentType);
    }

    /**
     * Gets the raw root JSON node.
     *
     * @return the root node
     */
    public JsonNode getRootNode() {
        return rootNode;
    }

    /**
     * Collects every top-level section whose value is a JSON object,
     * preserving field order.
     */
    private Map<String, Map<String, JsonNode>> parseObjectComponents(JsonNode root) {
        Map<String, Map<String, JsonNode>> result = new LinkedHashMap<>();

        if (root == null || !root.isObject()) {
            return result;
        }

        Iterator<Map.Entry<String, JsonNode>> fields = root.fields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> entry = fields.next();
            String componentType = entry.getKey();
            JsonNode typeNode = entry.getValue();

            // Only process object-valued sections here
            if (!typeNode.isObject()) {
                continue;
            }

            Map<String, JsonNode> components = new LinkedHashMap<>();
            Iterator<Map.Entry<String, JsonNode>> componentFields = typeNode.fields();
            while (componentFields.hasNext()) {
                Map.Entry<String, JsonNode> componentEntry = componentFields.next();
                components.put(componentEntry.getKey(), componentEntry.getValue());
            }

            if (!components.isEmpty()) {
                result.put(componentType, components);
            }
        }

        return result;
    }

    /**
     * Collects every top-level section whose value is a JSON array,
     * preserving array order. Each array item is expected to be an object of
     * the form {@code { "component-name": {...config...} }}; non-object items
     * are skipped. Items carrying more than one field contribute one entry per
     * field (the original implementation silently dropped all but the first
     * field, losing configuration).
     */
    private Map<String, List<Map.Entry<String, JsonNode>>> parseArrayComponents(JsonNode root) {
        Map<String, List<Map.Entry<String, JsonNode>>> result = new LinkedHashMap<>();

        if (root == null || !root.isObject()) {
            return result;
        }

        Iterator<Map.Entry<String, JsonNode>> fields = root.fields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> entry = fields.next();
            String componentType = entry.getKey();
            JsonNode typeNode = entry.getValue();

            // Only process array-valued sections here
            if (!typeNode.isArray()) {
                continue;
            }

            List<Map.Entry<String, JsonNode>> components = new ArrayList<>();

            for (JsonNode arrayItem : typeNode) {
                if (!arrayItem.isObject()) {
                    continue;
                }

                // Process every (name, config) field of the item - no silent drops
                Iterator<Map.Entry<String, JsonNode>> itemFields = arrayItem.fields();
                while (itemFields.hasNext()) {
                    Map.Entry<String, JsonNode> componentEntry = itemFields.next();
                    components.add(Map.entry(componentEntry.getKey(),
                            componentEntry.getValue()));
                }
            }

            if (!components.isEmpty()) {
                result.put(componentType, components);
            }
        }

        return result;
    }

    /**
     * Gets the ObjectMapper used for JSON processing.
     *
     * @return the object mapper
     */
    public static ObjectMapper getObjectMapper() {
        return OBJECT_MAPPER;
    }
}
+ */ +package org.apache.tika.config.loader; + +import java.nio.file.Path; +import java.util.List; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.tika.detect.CompositeDetector; +import org.apache.tika.detect.CompositeEncodingDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.metadata.filter.CompositeMetadataFilter; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.mime.MediaTypeRegistry; +import org.apache.tika.parser.Parser; +import org.apache.tika.renderer.CompositeRenderer; +import org.apache.tika.renderer.Renderer; + +/** + * Main entry point for loading Tika components from JSON configuration. + * Provides lazy loading of component types - only loads classes when requested. + * + * <p>Usage: + * <pre> + * TikaLoader loader = TikaLoader.load(Path.of("tika-config.json")); + * Parser parser = loader.loadParsers(); + * Detector detector = loader.loadDetectors(); + * </pre> + * + * <p>JSON configuration format: + * <pre> + * { + * "parsers": { + * "pdf-parser": { + * "_priority": 10, + * "_decorate": { + * "mimeInclude": ["application/pdf"], + * "mimeExclude": ["application/pdf+fdf"], + * "fallbacks": ["empty-parser"] + * }, + * "ocrStrategy": "AUTO", + * "extractInlineImages": true + * } + * }, + * "detectors": { + * "mime-magic-detector": { ... 
} + * } + * } + * </pre> + */ +public class TikaLoader { + + private final TikaJsonConfig config; + private final ClassLoader classLoader; + private final ObjectMapper objectMapper; + private final MediaTypeRegistry mediaTypeRegistry; + + // Cached instances (lazy loaded) + private Parser parsers; + private Detector detectors; + private EncodingDetector encodingDetectors; + private MetadataFilter metadataFilters; + private Renderer renderers; + + private TikaLoader(TikaJsonConfig config, ClassLoader classLoader, + MediaTypeRegistry mediaTypeRegistry) { + this.config = config; + this.classLoader = classLoader; + this.objectMapper = TikaJsonConfig.getObjectMapper(); + this.mediaTypeRegistry = mediaTypeRegistry; + } + + /** + * Loads a Tika configuration from a file. + * + * @param configPath the path to the JSON configuration file + * @return the Tika loader + * @throws TikaConfigException if loading or parsing fails + */ + public static TikaLoader load(Path configPath) throws TikaConfigException { + return load(configPath, Thread.currentThread().getContextClassLoader()); + } + + /** + * Loads a Tika configuration from a file with a specific class loader. + * + * @param configPath the path to the JSON configuration file + * @param classLoader the class loader to use for loading components + * @return the Tika loader + * @throws TikaConfigException if loading or parsing fails + */ + public static TikaLoader load(Path configPath, ClassLoader classLoader) + throws TikaConfigException { + TikaJsonConfig config = TikaJsonConfig.load(configPath); + MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry(); + return new TikaLoader(config, classLoader, registry); + } + + /** + * Loads a Tika configuration with custom media type registry. 
+ * + * @param configPath the path to the JSON configuration file + * @param classLoader the class loader to use for loading components + * @param mediaTypeRegistry the media type registry to use + * @return the Tika loader + * @throws TikaConfigException if loading or parsing fails + */ + public static TikaLoader load(Path configPath, ClassLoader classLoader, + MediaTypeRegistry mediaTypeRegistry) + throws TikaConfigException { + TikaJsonConfig config = TikaJsonConfig.load(configPath); + return new TikaLoader(config, classLoader, mediaTypeRegistry); + } + + /** + * Loads and returns all parsers. + * Results are cached - subsequent calls return the same instance. + * + * @return the parser (typically a CompositeParser internally) + * @throws TikaConfigException if loading fails + */ + public synchronized Parser loadParsers() throws TikaConfigException { + if (parsers == null) { + ParserLoader loader = new ParserLoader(classLoader, objectMapper, mediaTypeRegistry); + parsers = loader.load(config); + } + return parsers; + } + + /** + * Loads and returns all detectors. + * If "detectors" section exists in config, uses only those listed (no SPI fallback). + * If section missing, uses SPI to discover detectors. + * Results are cached - subsequent calls return the same instance. + * + * @return the detector (typically a CompositeDetector internally) + * @throws TikaConfigException if loading fails + */ + public synchronized Detector loadDetectors() throws TikaConfigException { + if (detectors == null) { + CompositeComponentLoader<Detector> loader = new CompositeComponentLoader<>( + Detector.class, "detectors", "detectors", classLoader, objectMapper); + List<Detector> detectorList = loader.loadFromArray(config); + detectors = new CompositeDetector(mediaTypeRegistry, detectorList); + } + return detectors; + } + + /** + * Loads and returns all encoding detectors. + * If "encodingDetectors" section exists in config, uses only those listed (no SPI fallback). 
+ * If section missing, uses SPI to discover encoding detectors. + * Results are cached - subsequent calls return the same instance. + * + * @return the encoding detector (typically a CompositeEncodingDetector internally) + * @throws TikaConfigException if loading fails + */ + public synchronized EncodingDetector loadEncodingDetectors() throws TikaConfigException { + if (encodingDetectors == null) { + CompositeComponentLoader<EncodingDetector> loader = new CompositeComponentLoader<>( + EncodingDetector.class, "encodingDetectors", "encoding-detectors", + classLoader, objectMapper); + List<EncodingDetector> detectorList = loader.loadFromArray(config); + encodingDetectors = new CompositeEncodingDetector(detectorList); + } + return encodingDetectors; + } + + /** + * Loads and returns all metadata filters. + * If "metadataFilters" section exists in config, uses only those listed (no SPI fallback). + * If section missing, uses SPI to discover metadata filters. + * Results are cached - subsequent calls return the same instance. + * + * @return the metadata filter (typically a CompositeMetadataFilter internally) + * @throws TikaConfigException if loading fails + */ + public synchronized MetadataFilter loadMetadataFilters() throws TikaConfigException { + if (metadataFilters == null) { + CompositeComponentLoader<MetadataFilter> loader = new CompositeComponentLoader<>( + MetadataFilter.class, "metadataFilters", "metadata-filters", + classLoader, objectMapper); + List<MetadataFilter> filterList = loader.loadFromArray(config); + metadataFilters = new CompositeMetadataFilter(filterList); + } + return metadataFilters; + } + + /** + * Loads and returns all renderers. + * If "renderers" section exists in config, uses only those listed (no SPI fallback). + * If section missing, uses SPI to discover renderers. + * Results are cached - subsequent calls return the same instance. 
+ * + * @return the renderer (typically a CompositeRenderer internally) + * @throws TikaConfigException if loading fails + */ + public synchronized Renderer loadRenderers() throws TikaConfigException { + if (renderers == null) { + CompositeComponentLoader<Renderer> loader = new CompositeComponentLoader<>( + Renderer.class, "renderers", "renderers", classLoader, objectMapper); + List<Renderer> rendererList = loader.loadFromArray(config); + renderers = new CompositeRenderer(rendererList); + } + return renderers; + } + + /** + * Gets the underlying JSON configuration. + * + * @return the JSON configuration + */ + public TikaJsonConfig getConfig() { + return config; + } + + /** + * Gets the class loader used for loading components. + * + * @return the class loader + */ + public ClassLoader getClassLoader() { + return classLoader; + } + + /** + * Gets the media type registry. + * + * @return the media type registry + */ + public MediaTypeRegistry getMediaTypeRegistry() { + return mediaTypeRegistry; + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java new file mode 100644 index 000000000..e1c0b31c1 --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ComponentRegistryTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Map;

import org.junit.jupiter.api.Test;

import org.apache.tika.exception.TikaConfigException;

/**
 * Unit tests for {@link ComponentRegistry}.
 */
public class ComponentRegistryTest {

    /** Builds a registry over the "parsers" index using the test class loader. */
    private ComponentRegistry newParserRegistry() throws TikaConfigException {
        return new ComponentRegistry("parsers", getClass().getClassLoader());
    }

    @Test
    public void testLoadParsersIndex() throws Exception {
        ComponentRegistry reg = newParserRegistry();

        assertNotNull(reg, "Registry should not be null");

        // The three test parsers in this package must be present in the index.
        assertTrue(reg.hasComponent("configurable-test-parser"),
                "Should have configurable-test-parser");
        assertTrue(reg.hasComponent("fallback-test-parser"),
                "Should have fallback-test-parser");
        assertTrue(reg.hasComponent("minimal-test-parser"),
                "Should have minimal-test-parser");
    }

    @Test
    public void testGetComponentClass() throws Exception {
        ComponentRegistry reg = newParserRegistry();

        Class<?> resolved = reg.getComponentClass("configurable-test-parser");
        assertNotNull(resolved, "Component class should not be null");
        assertEquals("org.apache.tika.config.loader.ConfigurableTestParser",
                resolved.getName());
    }

    @Test
    public void testGetAllComponents() throws Exception {
        ComponentRegistry reg = newParserRegistry();

        Map<String, Class<?>> components = reg.getAllComponents();
        assertNotNull(components, "All components map should not be null");
        assertTrue(components.size() >= 3, "Should have at least 3 test parsers");
    }

    @Test
    public void testUnknownComponent() throws Exception {
        ComponentRegistry reg = newParserRegistry();

        // Looking up a name that is not in the index must fail loudly.
        assertThrows(TikaConfigException.class,
                () -> reg.getComponentClass("non-existent-parser"));
    }

    @Test
    public void testNonExistentIndexFile() throws Exception {
        // Constructing a registry for a missing index file must fail loudly.
        assertThrows(TikaConfigException.class,
                () -> new ComponentRegistry("non-existent-type", getClass().getClassLoader()));
    }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; + +/** + * Test parser with configurable properties for testing JSON configuration loading. + */ +@TikaComponent(name = "configurable-test-parser") +public class ConfigurableTestParser implements Parser { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final long serialVersionUID = 1L; + + private final TestParserConfig config; + + /** + * Constructor for JSON-based configuration. + */ + public ConfigurableTestParser(String jsonConfig) throws TikaConfigException { + try { + this.config = OBJECT_MAPPER.readValue(jsonConfig, TestParserConfig.class); + } catch (IOException e) { + throw new TikaConfigException("Failed to parse JSON config", e); + } + } + + /** + * Zero-arg constructor for SPI fallback. 
+ */ + public ConfigurableTestParser() { + this.config = new TestParserConfig(); + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return Collections.singleton(MediaType.parse("application/test+configurable")); + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + // Simple implementation that writes config to metadata + metadata.set("parser-name", config.getName()); + metadata.set("buffer-size", String.valueOf(config.getBufferSize())); + metadata.set("enabled", String.valueOf(config.isEnabled())); + metadata.set("mode", config.getMode()); + } + + public TestParserConfig getConfig() { + return config; + } + + /** + * Configuration POJO for ConfigurableTestParser. + */ + public static class TestParserConfig { + private String name = "default"; + private int bufferSize = 1024; + private boolean enabled = true; + private String mode = "normal"; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public int getBufferSize() { + return bufferSize; + } + + public void setBufferSize(int bufferSize) { + this.bufferSize = bufferSize; + } + + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + + public String getMode() { + return mode; + } + + public void setMode(String mode) { + this.mode = mode; + } + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java new file mode 100644 index 000000000..06470ceb2 --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/FallbackTestParser.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Simple test parser for fallback-chain testing: it can be configured to
 * fail on purpose so that a fallback parser is exercised.
 */
@TikaComponent(name = "fallback-test-parser")
public class FallbackTestParser implements Parser {

    private static final long serialVersionUID = 1L;
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.parse("application/test+fallback"));

    private final FallbackConfig config;

    /**
     * Creates a parser configured from a JSON snippet.
     *
     * @param jsonConfig JSON-serialized {@link FallbackConfig}
     * @throws TikaConfigException if the JSON cannot be deserialized
     */
    public FallbackTestParser(String jsonConfig) throws TikaConfigException {
        try {
            config = OBJECT_MAPPER.readValue(jsonConfig, FallbackConfig.class);
        } catch (IOException e) {
            throw new TikaConfigException("Failed to parse JSON config", e);
        }
    }

    /**
     * Creates a parser with default settings; used by SPI fallback loading.
     */
    public FallbackTestParser() {
        config = new FallbackConfig();
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        // When configured to fail, throw so that a fallback chain can take over.
        if (config.isFailOnPurpose()) {
            throw new TikaException("Intentional failure for testing fallback: " + config.getMessage());
        }
        // Success case: record the configured message for assertions.
        metadata.set("fallback-parser", "success");
        metadata.set("message", config.getMessage());
    }

    public FallbackConfig getConfig() {
        return config;
    }

    /**
     * Configuration POJO deserialized from the parser's JSON config block.
     */
    public static class FallbackConfig {
        private String message = "default message";
        private boolean failOnPurpose = false;

        public String getMessage() {
            return message;
        }

        public void setMessage(String message) {
            this.message = message;
        }

        public boolean isFailOnPurpose() {
            return failOnPurpose;
        }

        public void setFailOnPurpose(boolean failOnPurpose) {
            this.failOnPurpose = failOnPurpose;
        }
    }
}
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link FrameworkConfig}.
 */
public class FrameworkConfigTest {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Parses the JSON and runs FrameworkConfig extraction on it. */
    private FrameworkConfig extractFrom(String json) throws Exception {
        JsonNode root = MAPPER.readTree(json);
        return FrameworkConfig.extract(root, MAPPER);
    }

    @Test
    public void testExtractDecoration() throws Exception {
        String json = """
            {
              "_decorate": {
                "mimeInclude": ["application/pdf"],
                "mimeExclude": ["application/pdf+fdf"],
                "fallbacks": ["backup-parser"]
              },
              "name": "test"
            }
            """;

        FrameworkConfig extracted = extractFrom(json);

        FrameworkConfig.ParserDecoration deco = extracted.getDecoration();
        assertNotNull(deco, "Decoration should be present");
        assertTrue(deco.hasFiltering(), "Should have filtering");
        assertTrue(deco.hasFallbacks(), "Should have fallbacks");

        assertEquals(1, deco.getMimeInclude().size());
        assertEquals("application/pdf", deco.getMimeInclude().get(0));

        assertEquals(1, deco.getMimeExclude().size());
        assertEquals("application/pdf+fdf", deco.getMimeExclude().get(0));

        assertEquals(1, deco.getFallbacks().size());
        assertEquals("backup-parser", deco.getFallbacks().get(0));

        // Framework-only fields must be stripped from the component config.
        assertFalse(extracted.getComponentConfigJson().contains("_decorate"),
                "Component config should not contain _decorate");
    }

    @Test
    public void testNoDecoration() throws Exception {
        String json = """
            {
              "name": "test"
            }
            """;

        FrameworkConfig extracted = extractFrom(json);

        assertNull(extracted.getDecoration(), "Decoration should be null");
    }

    @Test
    public void testEmptyDecoration() throws Exception {
        String json = """
            {
              "_decorate": {},
              "name": "test"
            }
            """;

        FrameworkConfig extracted = extractFrom(json);

        // An empty _decorate object is treated the same as no decoration.
        assertNull(extracted.getDecoration(), "Empty decoration should be null");
    }

    @Test
    public void testComponentConfigJsonClean() throws Exception {
        String json = """
            {
              "_decorate": {
                "mimeInclude": ["text/plain"]
              },
              "bufferSize": 1024,
              "enabled": true
            }
            """;

        FrameworkConfig extracted = extractFrom(json);
        String componentJson = extracted.getComponentConfigJson();

        // Framework fields are removed ...
        assertFalse(componentJson.contains("_decorate"), "Should not contain _decorate");

        // ... while component fields survive.
        assertTrue(componentJson.contains("bufferSize"), "Should contain bufferSize");
        assertTrue(componentJson.contains("enabled"), "Should contain enabled");
    }
}
(ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.config.loader;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Minimal test parser that offers only a zero-arg constructor, used to
 * exercise the SPI fallback path (no JSON configuration support).
 */
@TikaComponent(name = "minimal-test-parser")
public class MinimalTestParser implements Parser {

    private static final long serialVersionUID = 1L;
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.parse("application/test+minimal"));

    /**
     * Zero-arg constructor only - there is deliberately no config to apply.
     */
    public MinimalTestParser() {
        // No config
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        // Record a marker so tests can confirm this parser handled the stream.
        metadata.set("parser-type", "minimal");
    }
}
+ */ +package org.apache.tika.config.loader; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; + +/** + * Unit tests for TikaLoader JSON configuration loading. + */ +public class TikaLoaderTest { + + @Test + public void testBasicParserLoading() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + assertNotNull(configUrl, "Test config not found"); + + Path configPath = Path.of(configUrl.toURI()); + TikaLoader loader = TikaLoader.load(configPath); + + Parser parser = loader.loadParsers(); + assertNotNull(parser, "Parser should not be null"); + } + + @Test + public void testConfigurableParserConfiguration() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + // Parse with the composite parser to verify config was applied + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } + + // Verify the configured values were used + assertEquals("configured-parser", metadata.get("parser-name")); + assertEquals("2048", metadata.get("buffer-size")); + 
assertEquals("true", metadata.get("enabled")); + assertEquals("advanced", metadata.get("mode")); + } + + @Test + public void testMimeTypeDecoration() throws Exception { + URL configUrl = getClass().getResource("/configs/test-decoration-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser parser = loader.loadParsers(); + + ParseContext context = new ParseContext(); + + // Test that included types are supported + assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("application/pdf")), + "Should support application/pdf"); + assertTrue(parser.getSupportedTypes(context).contains(MediaType.parse("text/plain")), + "Should support text/plain"); + } + + @Test + public void testLazyLoading() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + + // Verify loader created but parsers not yet loaded + assertNotNull(loader, "Loader should be created"); + + // Load parsers + Parser parser1 = loader.loadParsers(); + assertNotNull(parser1, "First load should return parser"); + + // Load again - should return cached instance + Parser parser2 = loader.loadParsers(); + assertTrue(parser1 == parser2, "Should return same cached instance"); + } + + @Test + public void testMinimalParser() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + // Parse with minimal parser type + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/test+minimal"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } 
+ + // Verify minimal parser was invoked + assertEquals("minimal", metadata.get("parser-type")); + } + + @Test + public void testFallbackConfiguration() throws Exception { + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + // Parse with fallback parser type + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/test+fallback"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } + + // Verify fallback parser was invoked with correct config + assertEquals("success", metadata.get("fallback-parser")); + assertEquals("primary parser", metadata.get("message")); + } + + @Test + public void testNoDuplicateParsersFromSpi() throws Exception { + // Config explicitly configures ConfigurableTestParser but not the others + URL configUrl = getClass().getResource("/configs/test-no-duplicate-parsers.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + // Parse with ConfigurableTestParser - should use the explicitly configured instance + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "application/test+configurable"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } + + // Verify it used the configured instance (with "explicitly-configured" name) + // NOT the SPI instance (which would have "default" name from zero-arg constructor) + assertEquals("explicitly-configured", metadata.get("parser-name")); + assertEquals("4096", metadata.get("buffer-size")); + + // Verify 
other parsers (FallbackTestParser, MinimalTestParser) are still available via SPI + Metadata fallbackMetadata = new Metadata(); + fallbackMetadata.set(Metadata.CONTENT_TYPE, "application/test+fallback"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), fallbackMetadata, new ParseContext()); + } + + // FallbackTestParser should be loaded from SPI with default config + assertEquals("success", fallbackMetadata.get("fallback-parser")); + assertEquals("default message", fallbackMetadata.get("message")); + } + + @Test + public void testWithDefaultParserLoadsSpiParsers() throws Exception { + // Config has "default-parser" so should load SPI parsers + URL configUrl = getClass().getResource("/configs/test-with-default-parser.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + // Verify ConfigurableTestParser uses the configured instance + Metadata configurableMetadata = new Metadata(); + configurableMetadata.set(Metadata.CONTENT_TYPE, "application/test+configurable"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), configurableMetadata, new ParseContext()); + } + + assertEquals("with-default-config", configurableMetadata.get("parser-name")); + assertEquals("1024", configurableMetadata.get("buffer-size")); + + // Verify FallbackTestParser was loaded from SPI + Metadata fallbackMetadata = new Metadata(); + fallbackMetadata.set(Metadata.CONTENT_TYPE, "application/test+fallback"); + + try (InputStream stream = new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))) { + compositeParser.parse(stream, new DefaultHandler(), fallbackMetadata, new ParseContext()); + } + + // FallbackTestParser should be loaded from SPI with default config + 
assertEquals("success", fallbackMetadata.get("fallback-parser")); + } + + @Test + public void testWithoutDefaultParserSkipsSpiParsers() throws Exception { + // Config does NOT have "default-parser" so should only load configured parsers + URL configUrl = getClass().getResource("/configs/test-no-spi-fallback.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + Parser compositeParser = loader.loadParsers(); + + ParseContext context = new ParseContext(); + + // Verify ConfigurableTestParser is supported (explicitly configured) + assertTrue(compositeParser.getSupportedTypes(context) + .contains(MediaType.parse("application/test+configurable")), + "Should support application/test+configurable"); + + // Verify FallbackTestParser is NOT supported (not configured, SPI skipped) + assertTrue(!compositeParser.getSupportedTypes(context) + .contains(MediaType.parse("application/test+fallback")), + "Should NOT support application/test+fallback"); + + // Verify MinimalTestParser is NOT supported (not configured, SPI skipped) + assertTrue(!compositeParser.getSupportedTypes(context) + .contains(MediaType.parse("application/test+minimal")), + "Should NOT support application/test+minimal"); + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java index 8c1c45a27..b94305d9b 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java @@ -137,7 +137,7 @@ public class JsonMetadataListTest { public void testLargeValues() throws Exception { //TIKA-4154 TikaConfig tikaConfig = null; - try (InputStream is = JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) { + try (InputStream is = 
JsonMetadata.class.getResourceAsStream("/configs/tika-config-json.xml")) { tikaConfig = new TikaConfig(is); } StringBuilder sb = new StringBuilder(); diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java index 80d32bdc4..7b658fa46 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java @@ -119,7 +119,7 @@ public class JsonMetadataTest { public void testLargeValues() throws Exception { //TIKA-4154 TikaConfig tikaConfig = null; - try (InputStream is = JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) { + try (InputStream is = JsonMetadata.class.getResourceAsStream("/configs/tika-config-json.xml")) { tikaConfig = new TikaConfig(is); } StringBuilder sb = new StringBuilder(); diff --git a/tika-serialization/src/test/resources/configs/example-tika-config.json b/tika-serialization/src/test/resources/configs/example-tika-config.json new file mode 100644 index 000000000..e6810d34b --- /dev/null +++ b/tika-serialization/src/test/resources/configs/example-tika-config.json @@ -0,0 +1,56 @@ +{ + "parsers": [ + { + "pdf-parser": { + "_decorate": { + "mimeInclude": ["application/pdf"], + "mimeExclude": ["application/pdf+fdf"], + "fallbacks": ["empty-parser"] + }, + "ocrStrategy": "AUTO", + "extractInlineImages": true + } + }, + { + "html-parser": { + "_decorate": { + "mimeExclude": ["application/xhtml+xml"] + }, + "encoding": "UTF-8" + } + }, + { + "empty-parser": {} + } + ], + "detectors": [ + { + "mime-magic-detector": {} + }, + { + "zip-container-detector": { + "maxDepth": 10 + } + }, + { + "type-detector": {} + } + ], + "encodingDetectors": [ + { + "icu4j-detector": {} + }, + { + "html-encoding-detector": {} + } + ], + "metadataFilters": [ + { + "field-name-mapping-filter": { + "mappings": { + 
"dc:title": "title" + } + } + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/test-decoration-config.json b/tika-serialization/src/test/resources/configs/test-decoration-config.json new file mode 100644 index 000000000..63e5b169e --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-decoration-config.json @@ -0,0 +1,14 @@ +{ + "parsers": [ + { + "configurable-test-parser": { + "_decorate": { + "mimeInclude": ["application/pdf", "text/plain"], + "mimeExclude": ["application/pdf+fdf"] + }, + "name": "filtered-parser", + "bufferSize": 4096 + } + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/test-loader-config.json b/tika-serialization/src/test/resources/configs/test-loader-config.json new file mode 100644 index 000000000..1c1db9688 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-loader-config.json @@ -0,0 +1,25 @@ +{ + "parsers": [ + { + "configurable-test-parser": { + "name": "configured-parser", + "bufferSize": 2048, + "enabled": true, + "mode": "advanced" + } + }, + { + "fallback-test-parser": { + "_decorate": { + "mimeInclude": ["application/test+fallback"], + "fallbacks": ["minimal-test-parser"] + }, + "message": "primary parser", + "failOnPurpose": false + } + }, + { + "minimal-test-parser": {} + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json b/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json new file mode 100644 index 000000000..ec8ee1464 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-no-duplicate-parsers.json @@ -0,0 +1,14 @@ +{ + "parsers": [ + { + "configurable-test-parser": { + "name": "explicitly-configured", + "bufferSize": 4096, + "enabled": true + } + }, + { + "default-parser": {} + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json b/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json new file mode 100644 
index 000000000..a11d1849d --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-no-spi-fallback.json @@ -0,0 +1,11 @@ +{ + "parsers": [ + { + "configurable-test-parser": { + "name": "no-spi-fallback", + "bufferSize": 512, + "enabled": true + } + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/test-with-default-parser.json b/tika-serialization/src/test/resources/configs/test-with-default-parser.json new file mode 100644 index 000000000..3303c4dba --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-with-default-parser.json @@ -0,0 +1,14 @@ +{ + "parsers": [ + { + "configurable-test-parser": { + "name": "with-default-config", + "bufferSize": 1024, + "enabled": true + } + }, + { + "default-parser": {} + } + ] +} diff --git a/tika-serialization/src/test/resources/config/tika-config-json.xml b/tika-serialization/src/test/resources/configs/tika-config-json.xml similarity index 100% rename from tika-serialization/src/test/resources/config/tika-config-json.xml rename to tika-serialization/src/test/resources/configs/tika-config-json.xml
