This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545 in repository https://gitbox.apache.org/repos/asf/tika.git
commit dab213d05aa46d84f3a881458bbac969497284fe Author: tallison <[email protected]> AuthorDate: Tue Nov 25 11:17:31 2025 -0500 TIKA-4545 -- baseline improvements to support integration --- tika-annotation-processor/pom.xml | 9 - .../tika/annotation/TikaComponentProcessor.java | 68 ++- .../java/org/apache/tika/config/TikaComponent.java | 5 +- tika-core/pom.xml | 21 + .../org/apache/tika/parser/AutoDetectParser.java | 10 + .../org/apache/tika/parser/CompositeParser.java | 4 + tika-serialization/pom.xml | 7 + .../apache/tika/config/loader/ConfigLoader.java | 356 ++++++++++++ .../tika/config/loader/KebabCaseConverter.java | 72 +++ .../apache/tika/config/loader/ParserLoader.java | 2 + .../apache/tika/config/loader/TikaJsonConfig.java | 12 +- .../org/apache/tika/config/loader/TikaLoader.java | 37 ++ .../tika/config/loader/ConfigLoaderTest.java | 646 +++++++++++++++++++++ .../test/resources/configs/test-config-loader.json | 32 + .../resources/configs/test-interface-no-type.json | 6 + .../test/resources/configs/test-invalid-class.json | 3 + .../resources/configs/test-partial-config.json | 15 + .../resources/configs/test-unexpected-field.json | 8 + .../test/resources/configs/test-wrong-type.json | 3 + 19 files changed, 1288 insertions(+), 28 deletions(-) diff --git a/tika-annotation-processor/pom.xml b/tika-annotation-processor/pom.xml index 9c93eb9ac..459dc4ea6 100644 --- a/tika-annotation-processor/pom.xml +++ b/tika-annotation-processor/pom.xml @@ -37,15 +37,6 @@ </description> <url>https://tika.apache.org</url> - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - <scope>provided</scope> - </dependency> - </dependencies> - <build> <plugins> <plugin> diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index 
3a1800679..02424f862 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -19,6 +19,7 @@ package org.apache.tika.annotation; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedHashSet; @@ -54,7 +55,7 @@ import org.apache.tika.config.TikaComponent; * to avoid generating SPI files for utility interfaces like Serializable, Initializable, etc. */ @SupportedAnnotationTypes("org.apache.tika.config.TikaComponent") -@SupportedSourceVersion(SourceVersion.RELEASE_11) +@SupportedSourceVersion(SourceVersion.RELEASE_17) public class TikaComponentProcessor extends AbstractProcessor { /** @@ -145,22 +146,24 @@ public class TikaComponentProcessor extends AbstractProcessor { .add(className); } - // Always add to index files (regardless of SPI setting) - String indexFileName = SERVICE_INTERFACES.get(serviceInterface); - if (indexFileName != null) { - Map<String, String> index = indexFiles.computeIfAbsent(indexFileName, - k -> new LinkedHashMap<>()); - - // Check for duplicate names - if (index.containsKey(componentName)) { - String existingClass = index.get(componentName); - if (!existingClass.equals(className)) { - messager.printMessage(Diagnostic.Kind.ERROR, - "Duplicate component name '" + componentName + "' for classes: " + - existingClass + " and " + className, element); + // Add to index files only if spi = true + if (includeSpi) { + String indexFileName = SERVICE_INTERFACES.get(serviceInterface); + if (indexFileName != null) { + Map<String, String> index = indexFiles.computeIfAbsent(indexFileName, + k -> new LinkedHashMap<>()); + + // Check for duplicate names + if (index.containsKey(componentName)) { + String existingClass = index.get(componentName); + if 
(!existingClass.equals(className)) { + messager.printMessage(Diagnostic.Kind.ERROR, + "Duplicate component name '" + componentName + "' for classes: " + + existingClass + " and " + className, element); + } + } else { + index.put(componentName, className); } - } else { - index.put(componentName, className); } } } @@ -219,14 +222,20 @@ public class TikaComponentProcessor extends AbstractProcessor { String serviceInterface = entry.getKey(); Set<String> implementations = entry.getValue(); + // Sort implementations alphabetically for deterministic output + List<String> sortedImplementations = new ArrayList<>(implementations); + Collections.sort(sortedImplementations); + try { FileObject file = filer.createResource(StandardLocation.CLASS_OUTPUT, "", "META-INF/services/" + serviceInterface); try (Writer writer = file.openWriter()) { + writeApacheLicenseHeader(writer); + writer.write("\n\n"); writer.write("# Generated by TikaComponentProcessor\n"); writer.write("# Do not edit manually\n"); - for (String impl : implementations) { + for (String impl : sortedImplementations) { writer.write(impl); writer.write("\n"); } @@ -256,6 +265,7 @@ public class TikaComponentProcessor extends AbstractProcessor { "META-INF/tika/" + fileName + ".idx"); try (Writer writer = file.openWriter()) { + writeApacheLicenseHeader(writer); writer.write("# Generated by TikaComponentProcessor\n"); writer.write("# Do not edit manually\n"); writer.write("# Format: component-name=fully.qualified.ClassName\n"); @@ -277,4 +287,28 @@ public class TikaComponentProcessor extends AbstractProcessor { } } } + + /** + * Writes the Apache License 2.0 header to a file. + */ + private void writeApacheLicenseHeader(Writer writer) throws IOException { + String header = """ + # Licensed to the Apache Software Foundation (ASF) under one or more + # contributor license agreements. See the NOTICE file distributed with + # this work for additional information regarding copyright ownership. 
+ # The ASF licenses this file to You under the Apache License, Version 2.0 + # (the "License"); you may not use this file except in compliance with + # the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + """; + writer.write(header); + } } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java similarity index 93% rename from tika-core/src/main/java/org/apache/tika/config/TikaComponent.java rename to tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java index 8696ab2db..6632bdeb7 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaComponent.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java @@ -34,6 +34,9 @@ import java.lang.annotation.Target; * <li>Component index files (META-INF/tika/{type}.idx) for name-based lookup</li> * </ul> * + * <p>This annotation is only used at compile time by the annotation processor. + * It is retained in .class files for tooling but not loaded by the runtime JVM. 
+ * * <p>Example usage: * <pre> * {@code @TikaComponent} @@ -54,7 +57,7 @@ import java.lang.annotation.Target; * * @since 3.1.0 */ -@Retention(RetentionPolicy.RUNTIME) +@Retention(RetentionPolicy.CLASS) @Target(ElementType.TYPE) public @interface TikaComponent { diff --git a/tika-core/pom.xml b/tika-core/pom.xml index e0c408e34..7e97e367d 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -64,6 +64,14 @@ <scope>provided</scope> </dependency> + <!-- Annotation processor - contains @TikaComponent and ensures build order --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <!-- Test dependencies --> <dependency> <groupId>com.google.guava</groupId> @@ -130,6 +138,19 @@ </execution> </executions> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <annotationProcessorPaths> + <path> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + </path> + </annotationProcessorPaths> + </configuration> + </plugin> <plugin> <groupId>org.apache.felix</groupId> <artifactId>maven-bundle-plugin</artifactId> diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 86eae692a..7fd9e0b0b 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -90,6 +90,16 @@ public class AutoDetectParser extends CompositeParser { setAutoDetectParserConfig(AutoDetectParserConfig.DEFAULT); } + public AutoDetectParser(CompositeParser parser, Detector detector, AutoDetectParserConfig autoDetectParserConfig) { + super(parser); + setDetector(detector); + setAutoDetectParserConfig(autoDetectParserConfig); + 
} + + public static Parser build(CompositeParser parser, Detector detector, AutoDetectParserConfig autoDetectParserConfig) { + return new AutoDetectParser(parser, detector, autoDetectParserConfig); + } + public AutoDetectParser(TikaConfig config) { super(config.getMediaTypeRegistry(), getParser(config)); setFallback(buildFallbackParser(config)); diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java index 3b50b4da7..b3aaebf7a 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java @@ -97,6 +97,10 @@ public class CompositeParser implements Parser { this(new MediaTypeRegistry()); } + public CompositeParser(CompositeParser compositeParser) { + this(compositeParser.registry, compositeParser); + } + public Map<MediaType, Parser> getParsers(ParseContext context) { Map<MediaType, Parser> map = new HashMap<>(); for (Parser parser : parsers) { diff --git a/tika-serialization/pom.xml b/tika-serialization/pom.xml index f5eea54f9..38849fdf7 100644 --- a/tika-serialization/pom.xml +++ b/tika-serialization/pom.xml @@ -47,6 +47,13 @@ <version>${project.version}</version> <scope>provided</scope> </dependency> + <!-- Annotation processor - contains @TikaComponent --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>com.fasterxml.jackson.core</groupId> <artifactId>jackson-core</artifactId> diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java new file mode 100644 index 000000000..62db2d168 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -0,0 +1,356 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.lang.reflect.Modifier; +import java.util.Set; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Loader for simple configuration objects from JSON. + * <p> + * This class handles straightforward POJOs that can be deserialized directly from JSON. + * For complex components like Parsers, Detectors, etc., use the specific methods on + * {@link TikaLoader} instead (e.g., {@code loadParsers()}, {@code loadDetectors()}). + * + * <p>Usage: + * <pre> + * TikaLoader loader = TikaLoader.load(configPath); + * + * // Load by explicit key + * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class); + * + * // Load by class name (auto-converts to kebab-case) + * HandlerConfig config = loader.configs().load(HandlerConfig.class); + * </pre> + */ +public class ConfigLoader { + + /** + * Reserved keys for complex components that require special handling. 
+ * These cannot be loaded via ConfigLoader - use TikaLoader methods instead. + */ + private static final Set<String> PROHIBITED_KEYS = Set.of( + "parsers", + "detectors", + "encoding-detectors", + "encodingDetectors", + "metadata-filters", + "metadataFilters", + "renderers", + "translators" + ); + + private final TikaJsonConfig config; + private final ObjectMapper objectMapper; + + ConfigLoader(TikaJsonConfig config, ObjectMapper objectMapper) { + this.config = config; + this.objectMapper = objectMapper; + } + + /** + * Loads a configuration object using the class name converted to kebab-case. + * <p> + * For example, {@code HandlerConfig.class} will look for key "handler-config". + * The full simple class name is converted; suffixes such as "Config" are kept, not stripped. + * <p> + * For interfaces, the JSON must specify the implementation (see {@link #load(String, Class)}). + * + * @param clazz The class to deserialize into (can be interface, abstract, or concrete) + * @param <T> The type to load + * @return the deserialized object, or null if key not found in config + * @throws TikaConfigException if loading fails or class is not instantiable + */ + public <T> T load(Class<T> clazz) throws TikaConfigException { + String key = deriveKeyFromClass(clazz); + return load(key, clazz); + } + + /** + * Loads a configuration object using the class name, with a default value. + * + * @param clazz The class to deserialize into + * @param defaultValue The value to return if key not found in config + * @param <T> The type to load + * @return the deserialized object, or defaultValue if not present + * @throws TikaConfigException if loading fails or class is not instantiable + */ + public <T> T load(Class<T> clazz, T defaultValue) throws TikaConfigException { + T result = load(clazz); + return result != null ? result : defaultValue; + } + + /** + * Loads a configuration object from the specified JSON key. 
+ * <p> + * Supports three formats for interfaces: + * <ul> + * <li>String value: treated as class name or component name to look up</li> + * <li>Object with "@class": explicit type specification</li> + * <li>Object without "@class": attempts direct deserialization (works for concrete classes)</li> + * </ul> + * + * @param key The JSON key to load from + * @param clazz The class to deserialize into (can be interface, abstract, or concrete) + * @param <T> The type to load + * @return the deserialized object, or null if key not found + * @throws TikaConfigException if loading fails or class cannot be instantiated + */ + public <T> T load(String key, Class<T> clazz) throws TikaConfigException { + validateKey(key); + validateClass(clazz); + + JsonNode node = config.getRootNode().get(key); + if (node == null || node.isNull()) { + return null; + } + + try { + // Strategy 1: String value - treat as class name + if (node.isTextual()) { + return loadFromClassName(node.asText(), clazz); + } + + // Strategy 2: Object with @class field - explicit type + if (node.isObject() && node.has("@class")) { + String className = node.get("@class").asText(); + Class<?> targetClass = Class.forName(className); + if (!clazz.isAssignableFrom(targetClass)) { + throw new TikaConfigException( + "Class " + className + " is not assignable to " + clazz.getName()); + } + // Remove @class field before deserializing (Jackson doesn't recognize it) + ObjectNode objectNode = ((ObjectNode) node).deepCopy(); + objectNode.remove("@class"); + return objectMapper.treeToValue(objectNode, (Class<T>) targetClass); + } + + // Strategy 3: Direct deserialization (for concrete classes) + if (clazz.isInterface() || Modifier.isAbstract(clazz.getModifiers())) { + throw new TikaConfigException( + "Cannot deserialize " + clazz.getName() + " - it is " + + (clazz.isInterface() ? "an interface" : "abstract") + ". 
" + + "Specify implementation using:\n" + + " - String value: \"" + key + "\": \"com.example.MyImpl\"\n" + + " - Object with @class: \"" + key + "\": {\"@class\": \"com.example.MyImpl\", ...}"); + } + + return objectMapper.treeToValue(node, clazz); + } catch (ClassNotFoundException e) { + throw new TikaConfigException( + "Class not found for '" + key + "': " + e.getMessage(), e); + } catch (JsonProcessingException e) { + throw new TikaConfigException( + "Failed to deserialize '" + key + "' into " + clazz.getName(), e); + } + } + + /** + * Loads a class from a string (fully qualified class name). + */ + @SuppressWarnings("unchecked") + private <T> T loadFromClassName(String className, Class<T> expectedType) + throws TikaConfigException { + try { + Class<?> clazz = Class.forName(className); + if (!expectedType.isAssignableFrom(clazz)) { + throw new TikaConfigException( + "Class " + className + " is not assignable to " + expectedType.getName()); + } + + // Try to instantiate with no-arg constructor + return (T) clazz.getDeclaredConstructor().newInstance(); + } catch (ClassNotFoundException e) { + throw new TikaConfigException("Class not found: " + className, e); + } catch (ReflectiveOperationException e) { + throw new TikaConfigException( + "Failed to instantiate " + className + + ". Ensure it has a public no-argument constructor.", e); + } + } + + /** + * Loads a configuration object from the specified JSON key, with a default value. + * + * @param key The JSON key to load from + * @param clazz The class to deserialize into + * @param defaultValue The value to return if key not found in config + * @param <T> The type to load + * @return the deserialized object, or defaultValue if not present + * @throws TikaConfigException if loading fails or class is not instantiable + */ + public <T> T load(String key, Class<T> clazz, T defaultValue) throws TikaConfigException { + T result = load(key, clazz); + return result != null ? 
result : defaultValue; + } + + /** + * Loads a configuration object by merging JSON properties into a copy of the default instance. + * <p> + * This allows partial configuration where only some properties are specified in JSON, + * and the rest retain their default values. The original defaultValue object is NOT modified. + * + * <p>Example: + * <pre> + * HandlerConfig defaults = new HandlerConfig(); + * defaults.setTimeout(30000); + * defaults.setRetries(2); + * defaults.setEnabled(false); + * + * // JSON: { "enabled": true } + * // Result: timeout=30000, retries=2, enabled=true (merged!) + * // Note: 'defaults' object remains unchanged + * HandlerConfig config = loader.configs().loadWithDefaults("handler-config", + * HandlerConfig.class, + * defaults); + * </pre> + * + * @param key The JSON key to load from + * @param clazz The class type (not used for deserialization, but for type safety) + * @param defaultValue The object with default values (will NOT be modified) + * @param <T> The type to load + * @return a new object with defaults merged with JSON properties, or the original default if key not found + * @throws TikaConfigException if loading fails + */ + public <T> T loadWithDefaults(String key, Class<T> clazz, T defaultValue) + throws TikaConfigException { + validateKey(key); + validateClass(clazz); + + JsonNode node = config.getRootNode().get(key); + if (node == null || node.isNull()) { + return defaultValue; + } + + try { + // Create a deep copy of defaultValue to avoid mutating the original + // Using convertValue is efficient and doesn't require serializing to bytes + @SuppressWarnings("unchecked") + T copy = objectMapper.convertValue(defaultValue, (Class<T>) defaultValue.getClass()); + + // Merge JSON properties into the copy + return objectMapper.readerForUpdating(copy).readValue(node); + } catch (Exception e) { + throw new TikaConfigException( + "Failed to merge '" + key + "' into " + clazz.getName(), e); + } + } + + /** + * Loads a configuration 
object by class name with defaults, merging JSON properties. + * + * @param clazz The class to deserialize into + * @param defaultValue The object with default values (will NOT be modified) + * @param <T> The type to load + * @return a new object with defaults merged with JSON properties, or the original default if key not found + * @throws TikaConfigException if loading fails + */ + public <T> T loadWithDefaults(Class<T> clazz, T defaultValue) throws TikaConfigException { + String key = deriveKeyFromClass(clazz); + return loadWithDefaults(key, clazz, defaultValue); + } + + /** + * Checks if a configuration key exists in the JSON config. + * + * @param key The JSON key to check + * @return true if the key exists and is not null + */ + public boolean hasKey(String key) { + JsonNode node = config.getRootNode().get(key); + return node != null && !node.isNull(); + } + + /** + * Derives a kebab-case key from a class name. + * <p> + * Uses the full simple class name (no package, no suffix stripping) converted to kebab-case for consistency with + * the annotation processor's component naming. + * + * @param clazz the class to derive the key from + * @return kebab-case version of the class name + */ + private String deriveKeyFromClass(Class<?> clazz) { + String simpleName = clazz.getSimpleName(); + return toKebabCase(simpleName); + } + + /** + * Converts a camelCase or PascalCase string to kebab-case. + * Delegates to {@link KebabCaseConverter} for consistent behavior + * with the annotation processor. + */ + private String toKebabCase(String name) { + return KebabCaseConverter.toKebabCase(name); + } + + /** + * Validates that the key is not reserved for complex components. + */ + private void validateKey(String key) throws TikaConfigException { + if (PROHIBITED_KEYS.contains(key)) { + throw new TikaConfigException( + "Cannot load '" + key + "' via ConfigLoader. " + + "This is a complex component that requires special handling. 
" + + "Use TikaLoader.load" + toPascalCase(key) + "() instead."); + } + } + + /** + * Validates that complex Tika components aren't loaded via this method. + * Interfaces and abstract classes are allowed, but require explicit type info in JSON. + */ + private void validateClass(Class<?> clazz) throws TikaConfigException { + // Check for known complex component types (defense in depth) + String className = clazz.getName(); + if (className.equals("org.apache.tika.parser.Parser") || + className.equals("org.apache.tika.detect.Detector") || + className.equals("org.apache.tika.renderer.Renderer") || + className.equals("org.apache.tika.detect.EncodingDetector") || + className.equals("org.apache.tika.metadata.filter.MetadataFilter")) { + throw new TikaConfigException( + clazz.getSimpleName() + " is a Tika component interface. " + + "Use the appropriate TikaLoader method (e.g., loadParsers(), loadDetectors())."); + } + } + + /** + * Converts kebab-case to PascalCase for error messages. + */ + private String toPascalCase(String kebabCase) { + StringBuilder result = new StringBuilder(); + boolean capitalizeNext = true; + for (char c : kebabCase.toCharArray()) { + if (c == '-') { + capitalizeNext = true; + } else if (capitalizeNext) { + result.append(Character.toUpperCase(c)); + capitalizeNext = false; + } else { + result.append(c); + } + } + return result.toString(); + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java new file mode 100644 index 000000000..8a12a5033 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.util.Locale; + +/** + * Utility for converting Java class names to kebab-case. + * Used for automatic component name generation from class names. + * + * <p><strong>Note:</strong> This is a copy of the implementation in + * {@code org.apache.tika.annotation.KebabCaseConverter} to avoid + * a runtime dependency on the annotation processor module. The two + * implementations must be kept in sync. + * + * <p>Examples: + * <ul> + * <li>PDFParser → pdf-parser</li> + * <li>OCRParser → ocr-parser</li> + * <li>HTMLParser → html-parser</li> + * <li>DefaultParser → default-parser</li> + * <li>TesseractOCRParser → tesseract-ocr-parser</li> + * </ul> + */ +class KebabCaseConverter { + + private KebabCaseConverter() { + // Utility class + } + + /** + * Converts a Java class name to kebab-case. 
+ * + * @param className the simple class name (without package) + * @return the kebab-case version of the name + */ + static String toKebabCase(String className) { + if (className == null || className.isEmpty()) { + return className; + } + + // Insert hyphen before uppercase letters that follow lowercase letters + // or before uppercase letters that are followed by lowercase letters + String result = className + // Insert hyphen between lowercase and uppercase: "aB" -> "a-B" + .replaceAll("([a-z])([A-Z])", "$1-$2") + // Insert hyphen before uppercase letter followed by lowercase + // in a sequence of uppercase letters: "HTMLParser" -> "HTML-Parser" + .replaceAll("([A-Z]+)([A-Z][a-z])", "$1-$2") + // Insert hyphen between letter and digit: "PDF2Text" -> "PDF-2Text" + .replaceAll("([a-zA-Z])(\\d)", "$1-$2") + // Insert hyphen between digit and letter: "2Text" -> "2-Text" + .replaceAll("(\\d)([a-zA-Z])", "$1-$2") + .toLowerCase(Locale.ROOT); + + return result; + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java index 96f668af8..5cfc3cd11 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java @@ -211,12 +211,14 @@ public class ParserLoader { @SuppressWarnings("unchecked") private Parser instantiateParser(Class<?> parserClass, String configJson) throws TikaConfigException { + try { // Try constructor with String parameter (JSON config) try { Constructor<?> constructor = parserClass.getConstructor(String.class); return (Parser) constructor.newInstance(configJson); } catch (NoSuchMethodException e) { + // TODO -- entrypoint for actual configuration // Fall back to zero-arg constructor return (Parser) ServiceLoaderUtils.newInstance(parserClass, new org.apache.tika.config.ServiceLoader(classLoader)); diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 7b7bf1c0b..851e6fc2d 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -26,10 +26,13 @@ import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 
+import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
 
 /**
  * Parsed representation of a Tika JSON configuration file.
@@ -57,7 +60,22 @@ import org.apache.tika.exception.TikaConfigException;
  */
 public class TikaJsonConfig {
 
-    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+    private static final ObjectMapper OBJECT_MAPPER = createObjectMapper();
+
+    /**
+     * Creates the mapper assigned to {@code OBJECT_MAPPER}.
+     * <p>
+     * {@code FAIL_ON_UNKNOWN_PROPERTIES} is Jackson's default; it is set explicitly
+     * so the strict behavior does not depend on that default.
+     *
+     * @return the configured mapper
+     */
+    private static ObjectMapper createObjectMapper() {
+        ObjectMapper mapper = new ObjectMapper();
+        // Fail on unknown properties to catch configuration errors early
+        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true);
+        return mapper;
+    }
 
     private final JsonNode rootNode;
     private final Map<String, Map<String, JsonNode>> componentsByType;
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 483596199..6e3a3feab 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -16,9 +16,12 @@
  */
 package org.apache.tika.config.loader;
 
+import java.io.IOException;
 import java.nio.file.Path;
 import java.util.List;
 
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.detect.CompositeDetector;
@@ -29,6 +32,9 @@ import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.filter.CompositeMetadataFilter;
 import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.renderer.CompositeRenderer;
 import org.apache.tika.renderer.Renderer;
@@ -78,6 +84,7 @@ public class TikaLoader {
     private EncodingDetector encodingDetectors;
     private MetadataFilter metadataFilters;
     private Renderer renderers;
+    private ConfigLoader configLoader;
 
     private TikaLoader(TikaJsonConfig config, ClassLoader classLoader,
                        MediaTypeRegistry mediaTypeRegistry) {
@@ -222,6 +229,53 @@ public class TikaLoader {
         return renderers;
     }
 
+    /**
+     * Builds an {@link AutoDetectParser} from the loaded parsers and detectors.
+     * <p>
+     * A default {@link AutoDetectParserConfig} is used when the configuration
+     * does not supply one.
+     *
+     * @return the assembled parser
+     * @throws TikaConfigException if loading fails or the loaded parsers are not
+     *         a {@link CompositeParser}
+     * @throws IOException if one of the underlying load calls reports an I/O failure
+     */
+    public Parser loadAutoDetectParser() throws TikaConfigException, IOException {
+        AutoDetectParserConfig adpConfig = configs().load(AutoDetectParserConfig.class);
+        if (adpConfig == null) {
+            adpConfig = new AutoDetectParserConfig();
+        }
+        Parser parsers = loadParsers();
+        if (!(parsers instanceof CompositeParser)) {
+            throw new TikaConfigException(
+                    "Expected loadParsers() to return a CompositeParser but got: " +
+                            (parsers == null ? "null" : parsers.getClass().getName()));
+        }
+        return AutoDetectParser.build((CompositeParser) parsers, loadDetectors(), adpConfig);
+    }
+
+    /**
+     * Returns a ConfigLoader for loading simple configuration objects.
+     * <p>
+     * Use this for POJOs and simple config classes. For complex components like
+     * Parsers, Detectors, etc., use the specific load methods on TikaLoader.
+     *
+     * <p>Usage:
+     * <pre>
+     * HandlerConfig config = loader.configs().load("handler-config", HandlerConfig.class);
+     * // Or use kebab-case auto-conversion:
+     * HandlerConfig config = loader.configs().load(HandlerConfig.class);
+     * </pre>
+     *
+     * @return the ConfigLoader instance
+     */
+    public synchronized ConfigLoader configs() {
+        if (configLoader == null) {
+            configLoader = new ConfigLoader(config, objectMapper);
+        }
+        return configLoader;
+    }
+
     /**
      * Gets the underlying JSON configuration.
      *
diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
new file mode 100644
index 000000000..be207dba3
--- /dev/null
+++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
@@ -0,0 +1,647 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.exception.TikaConfigException;
+
+/**
+ * Unit tests for {@link ConfigLoader}.
+ */
+public class ConfigLoaderTest {
+
+    private TikaLoader tikaLoader;
+    private ConfigLoader configLoader;
+
+    @BeforeEach
+    public void setUp() throws Exception {
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-config-loader.json").toURI());
+        tikaLoader = TikaLoader.load(configPath);
+        configLoader = tikaLoader.configs();
+    }
+
+    // ==================== Test POJOs ====================
+
+    /**
+     * Simple config POJO with properties.
+     */
+    public static class HandlerConfig {
+        private int timeout;
+        private int retries;
+        private boolean enabled;
+
+        public int getTimeout() {
+            return timeout;
+        }
+
+        public void setTimeout(int timeout) {
+            this.timeout = timeout;
+        }
+
+        public int getRetries() {
+            return retries;
+        }
+
+        public void setRetries(int retries) {
+            this.retries = retries;
+        }
+
+        public boolean isEnabled() {
+            return enabled;
+        }
+
+        public void setEnabled(boolean enabled) {
+            this.enabled = enabled;
+        }
+    }
+
+    /**
+     * Config class whose name maps to "tika-task-timeout" (no suffix is stripped).
+     */
+    public static class TikaTaskTimeout {
+        private long millis;
+
+        public long getMillis() {
+            return millis;
+        }
+
+        public void setMillis(long millis) {
+            this.millis = millis;
+        }
+    }
+
+    /**
+     * Config class with "Settings" suffix.
+     */
+    public static class MyFeatureSettings {
+        private String featureName;
+        private int priority;
+
+        public String getFeatureName() {
+            return featureName;
+        }
+
+        public void setFeatureName(String featureName) {
+            this.featureName = featureName;
+        }
+
+        public int getPriority() {
+            return priority;
+        }
+
+        public void setPriority(int priority) {
+            this.priority = priority;
+        }
+    }
+
+    /**
+     * Interface for testing interface handling.
+     */
+    public interface TestHandler {
+        String getName();
+    }
+
+    /**
+     * Simple implementation with no-arg constructor.
+     */
+    public static class SimpleHandlerImpl implements TestHandler {
+        public SimpleHandlerImpl() {
+        }
+
+        @Override
+        public String getName() {
+            return "simple";
+        }
+    }
+
+    /**
+     * Implementation with configuration properties.
+     */
+    public static class ConfiguredHandlerImpl implements TestHandler {
+        private int maxSize;
+        private String prefix;
+
+        public ConfiguredHandlerImpl() {
+        }
+
+        @Override
+        public String getName() {
+            return "configured";
+        }
+
+        public int getMaxSize() {
+            return maxSize;
+        }
+
+        public void setMaxSize(int maxSize) {
+            this.maxSize = maxSize;
+        }
+
+        public String getPrefix() {
+            return prefix;
+        }
+
+        public void setPrefix(String prefix) {
+            this.prefix = prefix;
+        }
+    }
+
+    /**
+     * Abstract class for testing abstract class handling.
+     */
+    public abstract static class AbstractHandler implements TestHandler {
+        public abstract void doSomething();
+    }
+
+    // ==================== Tests ====================
+
+    @Test
+    public void testLoadByExplicitKey() throws Exception {
+        HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class);
+
+        assertNotNull(config);
+        assertEquals(5000, config.getTimeout());
+        assertEquals(3, config.getRetries());
+        assertTrue(config.isEnabled());
+    }
+
+    @Test
+    public void testLoadByClassNameKebabCase() throws Exception {
+        HandlerConfig config = configLoader.load(HandlerConfig.class);
+
+        assertNotNull(config);
+        assertEquals(5000, config.getTimeout());
+    }
+
+    @Test
+    public void testLoadByClassNameTikaTaskTimeout() throws Exception {
+        // TikaTaskTimeout -> "tika-task-timeout" (no suffix stripping)
+        // JSON has "tika-task-timeout"
+        TikaTaskTimeout timeout = configLoader.load(TikaTaskTimeout.class);
+
+        assertNotNull(timeout);
+        assertEquals(30000, timeout.getMillis());
+    }
+
+    @Test
+    public void testLoadByClassNameMyFeatureSettings() throws Exception {
+        // MyFeatureSettings -> "my-feature-settings" (full name, no suffix stripping)
+        // JSON has "my-feature-settings"
+        MyFeatureSettings settings = configLoader.load(MyFeatureSettings.class);
+
+        assertNotNull(settings);
+        assertEquals("test-feature", settings.getFeatureName());
+        assertEquals(10, settings.getPriority());
+    }
+
+    @Test
+    public void testLoadWithDefaultValue() throws Exception {
+        HandlerConfig config = configLoader.load("handler-config", HandlerConfig.class);
+        assertNotNull(config);
+
+        // Non-existent key with default
+        HandlerConfig defaultConfig = new HandlerConfig();
+        defaultConfig.setTimeout(9999);
+
+        HandlerConfig result = configLoader.load("non-existent", HandlerConfig.class, defaultConfig);
+        assertEquals(9999, result.getTimeout());
+    }
+
+    @Test
+    public void testLoadMissingKeyReturnsNull() throws Exception {
+        HandlerConfig config = configLoader.load("non-existent-key", HandlerConfig.class);
+        assertNull(config);
+    }
+
+    @Test
+    public void testLoadInterfaceAsString() throws Exception {
+        // JSON: "simple-handler": "org.apache.tika.config.loader.ConfigLoaderTest$SimpleHandlerImpl"
+        TestHandler handler = configLoader.load("simple-handler", TestHandler.class);
+
+        assertNotNull(handler);
+        assertTrue(handler instanceof SimpleHandlerImpl);
+        assertEquals("simple", handler.getName());
+    }
+
+    @Test
+    public void testLoadInterfaceWithAtClassAndProperties() throws Exception {
+        // JSON: "configured-handler": { "@class": "...", "maxSize": 100000, ... }
+        TestHandler handler = configLoader.load("configured-handler", TestHandler.class);
+
+        assertNotNull(handler);
+        assertTrue(handler instanceof ConfiguredHandlerImpl);
+        assertEquals("configured", handler.getName());
+
+        ConfiguredHandlerImpl impl = (ConfiguredHandlerImpl) handler;
+        assertEquals(100000, impl.getMaxSize());
+        assertEquals("test-", impl.getPrefix());
+    }
+
+    @Test
+    public void testLoadInterfaceWithoutTypeInfoFails() throws Exception {
+        // Create a minimal config with just properties, no @class
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-interface-no-type.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                loader.configs().load("handler-no-type", TestHandler.class));
+
+        assertTrue(ex.getMessage().contains("interface"));
+        assertTrue(ex.getMessage().contains("@class"));
+    }
+
+    @Test
+    public void testLoadAbstractClassFails() throws Exception {
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                configLoader.load("abstract-handler", AbstractHandler.class));
+
+        assertTrue(ex.getMessage().contains("abstract"));
+    }
+
+    @Test
+    public void testLoadProhibitedKeyParsers() throws Exception {
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                configLoader.load("parsers", Object.class));
+
+        assertTrue(ex.getMessage().contains("Cannot load 'parsers'"));
+        assertTrue(ex.getMessage().contains("TikaLoader"));
+    }
+
+    @Test
+    public void testLoadProhibitedKeyDetectors() throws Exception {
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                configLoader.load("detectors", Object.class));
+
+        assertTrue(ex.getMessage().contains("Cannot load 'detectors'"));
+    }
+
+    @Test
+    public void testLoadProhibitedKeyMetadataFilters() throws Exception {
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                configLoader.load("metadata-filters", Object.class));
+
+        assertTrue(ex.getMessage().contains("Cannot load 'metadata-filters'"));
+    }
+
+    @Test
+    public void testHasKey() throws Exception {
+        assertTrue(configLoader.hasKey("handler-config"));
+        assertTrue(configLoader.hasKey("simple-handler"));
+        assertFalse(configLoader.hasKey("non-existent"));
+    }
+
+    @Test
+    public void testLoadInvalidClassName() throws Exception {
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-invalid-class.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                loader.configs().load("handler", TestHandler.class));
+
+        assertTrue(ex.getMessage().contains("Class not found"));
+    }
+
+    @Test
+    public void testLoadWrongTypeAssignment() throws Exception {
+        // String class name that doesn't implement the interface
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-wrong-type.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                loader.configs().load("handler", TestHandler.class));
+
+        assertTrue(ex.getMessage().contains("not assignable"));
+    }
+
+    @Test
+    public void testLoadWithUnexpectedFieldFails() throws Exception {
+        // Verify that unexpected/unrecognized fields cause an exception
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-unexpected-field.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        TikaConfigException ex = assertThrows(TikaConfigException.class, () ->
+                loader.configs().load("handler-config", HandlerConfig.class));
+
+        // Should contain information about the unrecognized field (the cause may be absent)
+        Throwable cause = ex.getCause();
+        String causeMessage =
+                (cause == null || cause.getMessage() == null) ? "" : cause.getMessage();
+        assertTrue(ex.getMessage().contains("handler-config") ||
+                causeMessage.contains("Unrecognized") ||
+                causeMessage.contains("unexpectedField"),
+                "Exception should mention the unrecognized field");
+    }
+
+    @Test
+    public void testKebabCaseConversion() throws Exception {
+        // Test that kebab-case conversion works correctly
+        // MyFeatureSettings should look for "my-feature-settings" (full kebab-case, no stripping)
+        MyFeatureSettings settings = configLoader.load(MyFeatureSettings.class);
+        assertNotNull(settings);
+        assertEquals("test-feature", settings.getFeatureName());
+    }
+
+    @Test
+    public void testLoadByClassWithDefault() throws Exception {
+        HandlerConfig config = configLoader.load(HandlerConfig.class);
+        assertNotNull(config);
+
+        // Default to fall back on when the key is absent
+        TikaTaskTimeout defaultTimeout = new TikaTaskTimeout();
+        defaultTimeout.setMillis(60000);
+
+        // Use a string key that does not match any entry in the JSON
+        TikaTaskTimeout result = configLoader.load("NonExistentConfig.class",
+                TikaTaskTimeout.class,
+                defaultTimeout);
+        assertEquals(60000, result.getMillis());
+    }
+
+    // ==================== Tests for loadWithDefaults (Partial Config) ====================
+
+    @Test
+    public void testLoadWithDefaultsPartialConfig() throws Exception {
+        // Load config that merges defaults with partial JSON
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        // Set up defaults
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // JSON only has: { "enabled": true }
+        HandlerConfig config = loader.configs().loadWithDefaults("handler-config",
+                HandlerConfig.class,
+                defaults);
+
+        assertNotNull(config);
+        assertEquals(30000, config.getTimeout()); // ✅ From defaults
+        assertEquals(2, config.getRetries()); // ✅ From defaults
+        assertTrue(config.isEnabled()); // ✅ From JSON (overridden)
+    }
+
+    @Test
+    public void testLoadWithDefaultsFullOverride() throws Exception {
+        // Test that JSON can override all defaults
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // JSON has: { "timeout": 10000, "retries": 5, "enabled": false }
+        HandlerConfig config = loader.configs().loadWithDefaults("handler-config-full",
+                HandlerConfig.class,
+                defaults);
+
+        assertNotNull(config);
+        assertEquals(10000, config.getTimeout()); // All overridden
+        assertEquals(5, config.getRetries());
+        assertFalse(config.isEnabled());
+    }
+
+    @Test
+    public void testLoadWithDefaultsMissingKey() throws Exception {
+        // When key doesn't exist, should return original defaults unchanged
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        HandlerConfig config = configLoader.loadWithDefaults("non-existent-key",
+                HandlerConfig.class,
+                defaults);
+
+        assertNotNull(config);
+        assertEquals(30000, config.getTimeout());
+        assertEquals(2, config.getRetries());
+        assertFalse(config.isEnabled());
+    }
+
+    @Test
+    public void testLoadWithDefaultsByClass() throws Exception {
+        // Test the class-name version
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // Uses kebab-case: HandlerConfig -> "handler-config"
+        HandlerConfig config = loader.configs().loadWithDefaults(HandlerConfig.class, defaults);
+
+        assertNotNull(config);
+        assertEquals(30000, config.getTimeout());
+        assertEquals(2, config.getRetries());
+        assertTrue(config.isEnabled()); // Overridden from JSON
+    }
+
+    @Test
+    public void testLoadVsLoadWithDefaults() throws Exception {
+        // Demonstrate difference between load() and loadWithDefaults()
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // Using load() - creates new object, loses defaults
+        HandlerConfig config1 = loader.configs().load("handler-config", HandlerConfig.class);
+        assertEquals(0, config1.getTimeout()); // ❌ Lost default!
+        assertEquals(0, config1.getRetries()); // ❌ Lost default!
+        assertTrue(config1.isEnabled()); // ✅ From JSON
+
+        // Using loadWithDefaults() - merges into defaults
+        HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config",
+                HandlerConfig.class,
+                defaults);
+        assertEquals(30000, config2.getTimeout()); // ✅ Kept default!
+        assertEquals(2, config2.getRetries()); // ✅ Kept default!
+        assertTrue(config2.isEnabled()); // ✅ From JSON
+    }
+
+    // ==================== Immutability Tests ====================
+
+    @Test
+    public void testLoadWithDefaultsDoesNotMutateOriginal() throws Exception {
+        // Verify that the original defaults object is NOT modified
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // Load config with partial override (JSON only has "enabled": true)
+        HandlerConfig result = loader.configs().loadWithDefaults("handler-config",
+                HandlerConfig.class,
+                defaults);
+
+        // Verify result has merged values
+        assertEquals(30000, result.getTimeout());
+        assertEquals(2, result.getRetries());
+        assertTrue(result.isEnabled()); // Overridden from JSON
+
+        // CRITICAL: Verify original defaults object is unchanged
+        assertEquals(30000, defaults.getTimeout()); // ✅ Still original value
+        assertEquals(2, defaults.getRetries()); // ✅ Still original value
+        assertFalse(defaults.isEnabled()); // ✅ Still original value (NOT changed!)
+
+        // Verify they are different objects
+        assertNotSame(defaults, result,
+                "Result should be a different object than defaults");
+    }
+
+    @Test
+    public void testLoadWithDefaultsReusableDefaults() throws Exception {
+        // Verify defaults can be safely reused for multiple loads
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        // Load multiple times with same defaults
+        HandlerConfig config1 = loader.configs().loadWithDefaults("handler-config",
+                HandlerConfig.class,
+                defaults);
+        HandlerConfig config2 = loader.configs().loadWithDefaults("handler-config-full",
+                HandlerConfig.class,
+                defaults);
+
+        // Verify results are different
+        assertTrue(config1.isEnabled()); // From partial config
+        assertFalse(config2.isEnabled()); // From full config
+
+        // Verify defaults still unchanged and can be used again
+        assertEquals(30000, defaults.getTimeout());
+        assertEquals(2, defaults.getRetries());
+        assertFalse(defaults.isEnabled());
+
+        // Use defaults one more time
+        HandlerConfig config3 = loader.configs().loadWithDefaults("non-existent",
+                HandlerConfig.class,
+                defaults);
+        assertSame(defaults, config3); // Should return original when key missing
+    }
+
+    @Test
+    public void testLoadWithDefaultsComplexObjectImmutability() throws Exception {
+        // Test with nested/complex objects to ensure deep copy works
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        TikaTaskTimeout defaults = new TikaTaskTimeout();
+        defaults.setMillis(60000);
+
+        // Note: tika-task-timeout in JSON has millis: 30000
+        TikaTaskTimeout result = loader.configs().loadWithDefaults("tika-task-timeout",
+                TikaTaskTimeout.class,
+                defaults);
+
+        // Result should have JSON value
+        assertEquals(30000, result.getMillis());
+
+        // Original should be unchanged
+        assertEquals(60000, defaults.getMillis());
+    }
+
+    @Test
+    public void testLoadWithDefaultsMissingKeyDoesNotClone() throws Exception {
+        // When key is missing, should return the original object (no unnecessary cloning)
+        HandlerConfig defaults = new HandlerConfig();
+        defaults.setTimeout(30000);
+        defaults.setRetries(2);
+        defaults.setEnabled(false);
+
+        HandlerConfig result = configLoader.loadWithDefaults("non-existent-key",
+                HandlerConfig.class,
+                defaults);
+
+        // Should return the exact same object when key is missing
+        assertSame(defaults, result,
+                "Should return same object when key missing (no unnecessary clone)");
+    }
+
+    @Test
+    public void testLoadWithDefaultsThreadSafety() throws Exception {
+        // Demonstrate that defaults can be safely shared across threads
+        Path configPath = Paths.get(
+                getClass().getResource("/configs/test-partial-config.json").toURI());
+        TikaLoader loader = TikaLoader.load(configPath);
+
+        // Shared defaults object
+        HandlerConfig sharedDefaults = new HandlerConfig();
+        sharedDefaults.setTimeout(30000);
+        sharedDefaults.setRetries(2);
+        sharedDefaults.setEnabled(false);
+
+        // Simulate concurrent usage (not a real concurrency test, just demonstrates safety)
+        HandlerConfig result1 = loader.configs().loadWithDefaults("handler-config",
+                HandlerConfig.class,
+                sharedDefaults);
+        HandlerConfig result2 = loader.configs().loadWithDefaults("handler-config-full",
+                HandlerConfig.class,
+                sharedDefaults);
+
+        // Both results should be valid
+        assertNotNull(result1);
+        assertNotNull(result2);
+
+        // Shared defaults should still be unchanged
+        assertEquals(30000, sharedDefaults.getTimeout());
+        assertEquals(2, sharedDefaults.getRetries());
+        assertFalse(sharedDefaults.isEnabled());
+    }
+}
diff --git
a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json new file mode 100644 index 000000000..8f6e89a8c --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-config-loader.json @@ -0,0 +1,32 @@ +{ + "handler-config": { + "timeout": 5000, + "retries": 3, + "enabled": true + }, + + "simple-handler": "org.apache.tika.config.loader.ConfigLoaderTest$SimpleHandlerImpl", + + "configured-handler": { + "@class": "org.apache.tika.config.loader.ConfigLoaderTest$ConfiguredHandlerImpl", + "maxSize": 100000, + "prefix": "test-" + }, + + "tika-task-timeout": { + "millis": 30000 + }, + + "parsers": [ + {"pdf-parser": {}} + ], + + "my-feature-settings": { + "featureName": "test-feature", + "priority": 10 + }, + + "abstract-handler": { + "someProperty": "value" + } +} diff --git a/tika-serialization/src/test/resources/configs/test-interface-no-type.json b/tika-serialization/src/test/resources/configs/test-interface-no-type.json new file mode 100644 index 000000000..15a3d35b2 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-interface-no-type.json @@ -0,0 +1,6 @@ +{ + "handler-no-type": { + "maxSize": 50000, + "prefix": "no-type-" + } +} diff --git a/tika-serialization/src/test/resources/configs/test-invalid-class.json b/tika-serialization/src/test/resources/configs/test-invalid-class.json new file mode 100644 index 000000000..f0bf4bf4e --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-invalid-class.json @@ -0,0 +1,3 @@ +{ + "handler": "com.example.NonExistentClass" +} diff --git a/tika-serialization/src/test/resources/configs/test-partial-config.json b/tika-serialization/src/test/resources/configs/test-partial-config.json new file mode 100644 index 000000000..fb010c3e8 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-partial-config.json @@ -0,0 +1,15 @@ +{ + "handler-config": { + "enabled": true + }, + + 
"handler-config-full": { + "timeout": 10000, + "retries": 5, + "enabled": false + }, + + "tika-task-timeout": { + "millis": 30000 + } +} diff --git a/tika-serialization/src/test/resources/configs/test-unexpected-field.json b/tika-serialization/src/test/resources/configs/test-unexpected-field.json new file mode 100644 index 000000000..ada7f9bdf --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-unexpected-field.json @@ -0,0 +1,8 @@ +{ + "handler-config": { + "timeout": 5000, + "retries": 3, + "enabled": true, + "unexpectedField": "this should cause an error" + } +} diff --git a/tika-serialization/src/test/resources/configs/test-wrong-type.json b/tika-serialization/src/test/resources/configs/test-wrong-type.json new file mode 100644 index 000000000..b25e9f644 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/test-wrong-type.json @@ -0,0 +1,3 @@ +{ + "handler": "java.lang.String" +}
