This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4562 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6964ee6704eccaca01f3e0fcf3922e64b6abebc5 Author: tallison <[email protected]> AuthorDate: Wed Dec 10 11:26:53 2025 -0500 TIKA-4562 -- checkpoint commit --- .../tika/annotation/TikaComponentProcessor.java | 82 +++++++-- .../java/org/apache/tika/config/TikaComponent.java | 25 +++ .../org/apache/tika/config/ConfigContainer.java | 5 +- .../org/apache/tika/config/TikaTaskTimeout.java | 30 +++- .../java/org/apache/tika/parser/ParseContext.java | 28 ++++ .../org/apache/tika/pipes/api/HandlerConfig.java | 4 +- .../org/apache/tika/pipes/core/PipesClient.java | 2 +- .../apache/tika/pipes/core/server/PipesServer.java | 3 + .../apache/tika/config/loader/ComponentInfo.java | 38 +++-- .../tika/config/loader/ComponentRegistry.java | 102 +++++++++-- .../apache/tika/config/loader/ConfigLoader.java | 11 +- .../apache/tika/config/loader/JsonMergeUtils.java | 104 ++++++++++++ .../tika/serialization/ConfigDeserializer.java | 15 +- .../serialization/ParseContextDeserializer.java | 57 +++---- .../tika/serialization/ParseContextSerializer.java | 104 ++++++++---- .../tika/serialization/ParseContextUtils.java | 186 +++++++++++++++++++++ .../TestParseContextSerialization.java | 181 ++++++++++---------- tika-server/tika-server-core/pom.xml | 5 + .../tika/server/core/ServerStatusWatcher.java | 9 +- .../server/core/resource/DetectorResource.java | 1 + .../tika/server/core/resource/PipesResource.java | 3 + .../tika/server/core/resource/TikaResource.java | 135 ++++++++++++--- .../org/apache/tika/server/core/TikaPipesTest.java | 1 - .../server/core/TikaServerIntegrationTest.java | 56 +++++-- .../core/TikaServerPipesIntegrationTest.java | 44 +++++ .../configs/tika-config-server-basic.json | 3 +- 26 files changed, 957 insertions(+), 277 deletions(-) diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index e85a2d1f6e..f1f79943ee 100644 --- 
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -34,7 +34,10 @@ import javax.annotation.processing.RoundEnvironment; import javax.annotation.processing.SupportedAnnotationTypes; import javax.annotation.processing.SupportedSourceVersion; import javax.lang.model.SourceVersion; +import javax.lang.model.element.AnnotationMirror; +import javax.lang.model.element.AnnotationValue; import javax.lang.model.element.Element; +import javax.lang.model.element.ExecutableElement; import javax.lang.model.element.TypeElement; import javax.lang.model.type.DeclaredType; import javax.lang.model.type.TypeMirror; @@ -125,17 +128,31 @@ public class TikaComponentProcessor extends AbstractProcessor { // Check if component should be included in SPI boolean includeSpi = annotation.spi(); + // Get contextKey if specified (need to use mirror API for Class types) + String contextKey = getContextKeyFromAnnotation(element); + messager.printMessage(Diagnostic.Kind.NOTE, "Processing @TikaComponent: " + className + " -> " + componentName + - " (SPI: " + includeSpi + ")"); + " (SPI: " + includeSpi + ", contextKey: " + contextKey + ")"); // Find all implemented service interfaces List<String> serviceInterfaces = findServiceInterfaces(element); + // Build the index entry value (className or className:key=X) + String indexValue = className; + if (contextKey != null) { + indexValue = className + ":key=" + contextKey; + } + if (serviceInterfaces.isEmpty()) { - messager.printMessage(Diagnostic.Kind.WARNING, - "Class " + className + " annotated with @TikaComponent " + - "but does not implement any known Tika service interface", element); + // No known service interface - put in other-configs.idx + messager.printMessage(Diagnostic.Kind.NOTE, + "Class " + className + " does not implement known service interface, " + + "adding to other-configs.idx", element); 
+ + Map<String, String> index = indexFiles.computeIfAbsent("other-configs", + k -> new LinkedHashMap<>()); + addToIndex(index, componentName, indexValue, className, element); return; } @@ -152,20 +169,57 @@ public class TikaComponentProcessor extends AbstractProcessor { if (indexFileName != null) { Map<String, String> index = indexFiles.computeIfAbsent(indexFileName, k -> new LinkedHashMap<>()); + addToIndex(index, componentName, indexValue, className, element); + } + } + } - // Check for duplicate names - if (index.containsKey(componentName)) { - String existingClass = index.get(componentName); - if (!existingClass.equals(className)) { - messager.printMessage(Diagnostic.Kind.ERROR, - "Duplicate component name '" + componentName + "' for classes: " + - existingClass + " and " + className, element); + /** + * Adds an entry to an index, checking for duplicates. + */ + private void addToIndex(Map<String, String> index, String componentName, + String indexValue, String className, TypeElement element) { + if (index.containsKey(componentName)) { + String existingValue = index.get(componentName); + // Extract class name from value (may have :key= suffix) + String existingClass = existingValue.contains(":") + ? existingValue.substring(0, existingValue.indexOf(":")) + : existingValue; + if (!existingClass.equals(className)) { + messager.printMessage(Diagnostic.Kind.ERROR, + "Duplicate component name '" + componentName + "' for classes: " + + existingClass + " and " + className, element); + } + } else { + index.put(componentName, indexValue); + } + } + + /** + * Gets the contextKey value from the annotation using the mirror API. + * Returns null if contextKey is void.class (the default). + */ + private String getContextKeyFromAnnotation(TypeElement element) { + for (AnnotationMirror mirror : element.getAnnotationMirrors()) { + DeclaredType annotationType = mirror.getAnnotationType(); + if (annotationType.toString().equals(TikaComponent.class.getName())) { + for (Map.Entry<? 
extends ExecutableElement, ? extends AnnotationValue> entry + : mirror.getElementValues().entrySet()) { + if (entry.getKey().getSimpleName().toString().equals("contextKey")) { + // The value is a TypeMirror for Class types + Object value = entry.getValue().getValue(); + if (value instanceof TypeMirror) { + String typeName = value.toString(); + // void.class is the default, meaning "auto-detect" + if (!"void".equals(typeName) && !"java.lang.Void".equals(typeName)) { + return typeName; + } + } } - } else { - index.put(componentName, className); } } } + return null; } /** @@ -267,7 +321,7 @@ public class TikaComponentProcessor extends AbstractProcessor { writeApacheLicenseHeader(writer); writer.write("# Generated by TikaComponentProcessor\n"); writer.write("# Do not edit manually\n"); - writer.write("# Format: component-name=fully.qualified.ClassName\n"); + writer.write("# Format: component-name=fully.qualified.ClassName[:key=contextKeyClass]\n"); for (Map.Entry<String, String> component : components.entrySet()) { writer.write(component.getKey()); writer.write("="); diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java index 6632bdeb7f..e7f35814cb 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java @@ -53,6 +53,11 @@ import java.lang.annotation.Target; * public class DWGReadParser extends AbstractParser { * // available by name, but NOT auto-loaded by default-parser * } + * + * {@code @TikaComponent(contextKey = MetadataFilter.class)} + * public class MyFilter implements MetadataFilter, AnotherInterface { + * // explicit ParseContext key when class implements multiple known interfaces + * } * </pre> * * @since 3.1.0 @@ -81,4 +86,24 @@ public @interface TikaComponent { * @return true to include in SPI (default), false 
to require explicit config */ boolean spi() default true; + + /** + * The class to use as the key when adding this component to ParseContext. + * <p> + * By default ({@code void.class}), the key is auto-detected: + * <ul> + * <li>If the component implements a known interface (e.g., MetadataFilter), + * that interface is used as the key</li> + * <li>Otherwise, the component's own class is used as the key</li> + * </ul> + * <p> + * Use this attribute to explicitly specify the key when: + * <ul> + * <li>The component implements multiple known interfaces (ambiguous)</li> + * <li>You need a specific interface/class that isn't auto-detected</li> + * </ul> + * + * @return the class to use as ParseContext key, or void.class for auto-detection + */ + Class<?> contextKey() default void.class; } diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigContainer.java b/tika-core/src/main/java/org/apache/tika/config/ConfigContainer.java index abc668f73d..f4eedd5bb2 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ConfigContainer.java +++ b/tika-core/src/main/java/org/apache/tika/config/ConfigContainer.java @@ -16,6 +16,7 @@ */ package org.apache.tika.config; +import java.io.Serializable; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -32,7 +33,9 @@ import java.util.Set; * and other components to look up their config by friendly name (e.g., "pdf-parser", * "fs-fetcher-1") and deserialize it on-demand. 
*/ -public class ConfigContainer { +public class ConfigContainer implements Serializable { + + private static final long serialVersionUID = 1L; private final Map<String, String> configs = new HashMap<>(); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java index 842c7a432d..cf3928fca2 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java @@ -20,10 +20,34 @@ import java.io.Serializable; import org.apache.tika.parser.ParseContext; +/** + * Configuration class for specifying parse task timeout. + * <pre> + * { + * "parse-context": { + * "tika-task-timeout": { + * "timeoutMillis": 30000 + * } + * } + * } + * </pre> + */ +@TikaComponent(spi = false) public class TikaTaskTimeout implements Serializable { - private final long timeoutMillis; + private long timeoutMillis; + + /** + * No-arg constructor for Jackson deserialization. + */ + public TikaTaskTimeout() { + } + /** + * Constructor with timeout value. 
+ * + * @param timeoutMillis timeout in milliseconds + */ public TikaTaskTimeout(long timeoutMillis) { this.timeoutMillis = timeoutMillis; } @@ -32,6 +56,10 @@ public class TikaTaskTimeout implements Serializable { return timeoutMillis; } + public void setTimeoutMillis(long timeoutMillis) { + this.timeoutMillis = timeoutMillis; + } + public static long getTimeoutMillis(ParseContext context, long defaultTimeoutMillis) { if (context == null) { return defaultTimeoutMillis; diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 0393bec52c..6e16391c27 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -21,6 +21,8 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; +import org.apache.tika.config.ConfigContainer; + /** * Parse context. Used to pass context information to Tika parsers. * @@ -84,6 +86,32 @@ public class ParseContext implements Serializable { } } + /** + * Adds a configuration by friendly name for serialization. + * <p> + * This is a convenience method for adding configs that will be serialized + * and resolved at runtime. The config is stored in a {@link ConfigContainer} + * and will be resolved to an actual object via the component registry. 
+ * <p> + * Example: + * <pre> + * parseContext.addConfig("tika-task-timeout", "{\"timeoutMillis\": 5000}"); + * parseContext.addConfig("handler-config", "{\"type\": \"XML\", \"parseMode\": \"RMETA\"}"); + * </pre> + * + * @param key the friendly name of the config (e.g., "tika-task-timeout", "handler-config") + * @param json the JSON configuration string + * @since Apache Tika 4.0 + */ + public void addConfig(String key, String json) { + ConfigContainer container = get(ConfigContainer.class); + if (container == null) { + container = new ConfigContainer(); + set(ConfigContainer.class, container); + } + container.set(key, json); + } + public boolean isEmpty() { return context.isEmpty(); } diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java index b58c58c05a..b336f1a4fc 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/HandlerConfig.java @@ -22,7 +22,9 @@ import java.util.Objects; import org.apache.tika.sax.BasicContentHandlerFactory; -//TODO -- convert this back to a record +/** + * Configuration for content handler behavior during parsing. 
+ */ public class HandlerConfig implements Serializable { /** diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java index c4a2c07136..1551ae37d9 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java @@ -194,7 +194,7 @@ public class PipesClient implements Closeable { } catch (InterruptedException | SecurityException e) { throw e; } catch (Exception e) { - LOG.error("exception waiting for server to complete task: {} ", t.getId()); + LOG.error("exception waiting for server to complete task: {} ", t.getId(), e); shutItAllDown(); return buildFatalResult(t.getId(), t.getEmitKey(), UNSPECIFIED_CRASH, intermediateResult.get()); } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index e84945b925..c971367ab5 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -58,6 +58,7 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; +import org.apache.tika.serialization.ParseContextUtils; import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; @@ -286,6 +287,8 @@ public class PipesServer implements AutoCloseable { CountDownLatch countDownLatch = new CountDownLatch(1); FetchEmitTuple fetchEmitTuple = readFetchEmitTuple(); + // Resolve friendly-named configs in ParseContext to actual objects 
+ ParseContextUtils.resolveAll(fetchEmitTuple.getParseContext(), getClass().getClassLoader()); PipesWorker pipesWorker = getPipesWorker(intermediateResult, fetchEmitTuple, countDownLatch); executorCompletionService.submit(pipesWorker); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java similarity index 50% rename from tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java rename to tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java index 88a9e102f7..72c7156b2c 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java @@ -14,24 +14,26 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.server.core.config; +package org.apache.tika.config.loader; -import jakarta.ws.rs.core.MultivaluedMap; - -import org.apache.tika.config.TikaTaskTimeout; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.server.core.ParseContextConfig; - -public class TimeoutConfig implements ParseContextConfig { - - public static final String X_TIKA_TIMEOUT_MILLIS = "X-Tika-Timeout-Millis"; - - @Override - public void configure(MultivaluedMap<String, String> httpHeaders, Metadata metadata, ParseContext context) { - if (httpHeaders.containsKey(X_TIKA_TIMEOUT_MILLIS)) { - long timeout = Long.parseLong(httpHeaders.getFirst(X_TIKA_TIMEOUT_MILLIS)); - context.set(TikaTaskTimeout.class, new TikaTaskTimeout(timeout)); - } +/** + * Information about a registered Tika component. 
+ * + * @param componentClass the component's class + * @param selfConfiguring whether the component implements SelfConfiguring + * (reads its own config from ConfigContainer) + * @param contextKey the class to use as the key when adding to ParseContext, + * or null to auto-detect based on known interfaces + */ +public record ComponentInfo( + Class<?> componentClass, + boolean selfConfiguring, + Class<?> contextKey +) { + /** + * Creates a ComponentInfo with no explicit context key (auto-detect). + */ + public ComponentInfo(Class<?> componentClass, boolean selfConfiguring) { + this(componentClass, selfConfiguring, null); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java index a8097caf0c..2978f09513 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -28,6 +28,7 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; +import org.apache.tika.config.SelfConfiguring; import org.apache.tika.exception.TikaConfigException; /** @@ -35,6 +36,13 @@ import org.apache.tika.exception.TikaConfigException; * Loads component name-to-class mappings from META-INF/tika/*.idx files * generated by the {@code @TikaComponent} annotation processor. * <p> + * The registry tracks: + * <ul> + * <li>Component class</li> + * <li>Whether the component is self-configuring (implements {@link SelfConfiguring})</li> + * <li>Optional explicit context key for ParseContext</li> + * </ul> + * <p> * Also includes built-in aliases for external dependencies that cannot be * annotated with @TikaComponent. 
*/ @@ -51,10 +59,13 @@ public class ComponentRegistry { // Alias for org.gagravarr:vorbis-java-tika dependency // TODO -- make this configurable aliases.put("ogg-detector", "org.gagravarr.tika.OggDetector"); + // HandlerConfig is in tika-pipes-api which can't depend on tika-core for @TikaComponent + aliases.put("handler-config", "org.apache.tika.pipes.api.HandlerConfig"); return Collections.unmodifiableMap(aliases); } - private final Map<String, Class<?>> components; + private final Map<String, ComponentInfo> components; + private final Map<Class<?>, String> classToName; // Reverse lookup private final ClassLoader classLoader; /** @@ -69,6 +80,11 @@ public class ComponentRegistry { throws TikaConfigException { this.classLoader = classLoader; this.components = loadComponents(indexFileName); + // Build reverse lookup + this.classToName = new HashMap<>(); + for (Map.Entry<String, ComponentInfo> entry : components.entrySet()) { + classToName.put(entry.getValue().componentClass(), entry.getKey()); + } } /** @@ -79,20 +95,32 @@ public class ComponentRegistry { * @throws TikaConfigException if the component name is not found */ public Class<?> getComponentClass(String name) throws TikaConfigException { - Class<?> clazz = components.get(name); - if (clazz == null) { + ComponentInfo info = getComponentInfo(name); + return info.componentClass(); + } + + /** + * Looks up full component information by name. + * + * @param name the component name (e.g., "pdf-parser") + * @return the component info including class, selfConfiguring flag, and contextKey + * @throws TikaConfigException if the component name is not found + */ + public ComponentInfo getComponentInfo(String name) throws TikaConfigException { + ComponentInfo info = components.get(name); + if (info == null) { throw new TikaConfigException("Unknown component name: '" + name + "'. " + "Available components: " + components.keySet()); } - return clazz; + return info; } /** * Returns all registered component names. 
* - * @return unmodifiable map of component names to classes + * @return unmodifiable map of component names to component info */ - public Map<String, Class<?>> getAllComponents() { + public Map<String, ComponentInfo> getAllComponents() { return Collections.unmodifiableMap(components); } @@ -106,9 +134,19 @@ public class ComponentRegistry { return components.containsKey(name); } - private Map<String, Class<?>> loadComponents(String indexFileName) + /** + * Looks up a component's friendly name by its class. + * + * @param clazz the component class + * @return the friendly name, or null if not registered + */ + public String getFriendlyName(Class<?> clazz) { + return classToName.get(clazz); + } + + private Map<String, ComponentInfo> loadComponents(String indexFileName) throws TikaConfigException { - Map<String, Class<?>> result = new LinkedHashMap<>(); + Map<String, ComponentInfo> result = new LinkedHashMap<>(); String resourcePath = "META-INF/tika/" + indexFileName + ".idx"; try { @@ -133,11 +171,12 @@ public class ComponentRegistry { return result; } - private void loadBuiltinAliases(Map<String, Class<?>> result) { + private void loadBuiltinAliases(Map<String, ComponentInfo> result) { for (Map.Entry<String, String> alias : BUILTIN_ALIASES.entrySet()) { try { Class<?> clazz = Class.forName(alias.getValue(), false, classLoader); - result.put(alias.getKey(), clazz); + boolean selfConfiguring = SelfConfiguring.class.isAssignableFrom(clazz); + result.put(alias.getKey(), new ComponentInfo(clazz, selfConfiguring, null)); } catch (ClassNotFoundException e) { // External dependency not on classpath - skip this alias // This is expected behavior, not an error @@ -145,7 +184,7 @@ public class ComponentRegistry { } } - private void loadFromUrl(URL url, Map<String, Class<?>> result) throws TikaConfigException { + private void loadFromUrl(URL url, Map<String, ComponentInfo> result) throws TikaConfigException { try (InputStream in = url.openStream(); BufferedReader reader = new 
BufferedReader( new InputStreamReader(in, StandardCharsets.UTF_8))) { @@ -162,7 +201,7 @@ public class ComponentRegistry { continue; } - // Parse: component-name=fully.qualified.ClassName + // Parse: component-name=fully.qualified.ClassName[:key=contextKeyClass] int equalsIndex = line.indexOf('='); if (equalsIndex == -1) { throw new TikaConfigException( @@ -171,18 +210,49 @@ public class ComponentRegistry { } String name = line.substring(0, equalsIndex).trim(); - String className = line.substring(equalsIndex + 1).trim(); + String value = line.substring(equalsIndex + 1).trim(); - if (name.isEmpty() || className.isEmpty()) { + if (name.isEmpty() || value.isEmpty()) { throw new TikaConfigException( "Invalid index file format at " + url + " line " + lineNumber + ": name or class is empty"); } - // Load the class + // Parse value: className or className:key=contextKeyClass + String className = value; + String contextKeyClassName = null; + + int colonIndex = value.indexOf(':'); + if (colonIndex != -1) { + className = value.substring(0, colonIndex); + String suffix = value.substring(colonIndex + 1); + if (suffix.startsWith("key=")) { + contextKeyClassName = suffix.substring(4); + } else { + throw new TikaConfigException( + "Invalid index file format at " + url + " line " + lineNumber + + ": unknown suffix '" + suffix + "', expected 'key=...'"); + } + } + + // Load the component class try { Class<?> clazz = classLoader.loadClass(className); - result.put(name, clazz); + boolean selfConfiguring = SelfConfiguring.class.isAssignableFrom(clazz); + + // Load the context key class if specified + Class<?> contextKey = null; + if (contextKeyClassName != null) { + try { + contextKey = classLoader.loadClass(contextKeyClassName); + } catch (ClassNotFoundException e) { + throw new TikaConfigException( + "Context key class not found: " + contextKeyClassName + + " (from " + url + ")", e); + } + } + + result.put(name, new ComponentInfo(clazz, selfConfiguring, contextKey)); } catch 
(ClassNotFoundException e) { throw new TikaConfigException( "Component class not found: " + className + " (from " + url + ")", e); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java index c9045c9d79..bb9e2b6d30 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -16,6 +16,7 @@ */ package org.apache.tika.config.loader; +import java.io.IOException; import java.util.Set; import com.fasterxml.jackson.core.JsonProcessingException; @@ -241,14 +242,8 @@ public class ConfigLoader { } try { - // Create a deep copy of defaultValue to avoid mutating the original - // Using convertValue is efficient and doesn't require serializing to bytes - @SuppressWarnings("unchecked") - T copy = objectMapper.convertValue(defaultValue, (Class<T>) defaultValue.getClass()); - - // Merge JSON properties into the copy - return objectMapper.readerForUpdating(copy).readValue(node); - } catch (Exception e) { + return JsonMergeUtils.mergeWithDefaults(objectMapper, node, clazz, defaultValue); + } catch (IOException e) { throw new TikaConfigException( "Failed to merge '" + key + "' into " + clazz.getName(), e); } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/JsonMergeUtils.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/JsonMergeUtils.java new file mode 100644 index 0000000000..be00ccb064 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/JsonMergeUtils.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import java.io.IOException; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * Utility methods for merging JSON configurations with default values. + * <p> + * Provides a consistent pattern for deserializing JSON while preserving + * default values for unspecified fields. The original default object is + * never modified - a deep copy is created first. + */ +public final class JsonMergeUtils { + + private JsonMergeUtils() { + // Utility class + } + + /** + * Deserializes JSON and merges it with a default configuration object. + * <p> + * Creates a deep copy of the default object, then applies the JSON properties + * on top. Fields not specified in the JSON retain their default values. + * The original defaultConfig is never modified. 
+ * + * @param mapper the ObjectMapper to use + * @param json the JSON string to deserialize + * @param configClass the configuration class + * @param defaultConfig the default configuration (will NOT be modified) + * @param <T> the configuration type + * @return a new object with defaults merged with JSON properties + * @throws IOException if deserialization fails + */ + public static <T> T mergeWithDefaults(ObjectMapper mapper, String json, + Class<T> configClass, T defaultConfig) throws IOException { + if (defaultConfig == null) { + return mapper.readValue(json, configClass); + } + + // Create a deep copy of defaultConfig to preserve immutability + T copy = mapper.convertValue(defaultConfig, configClass); + + // Merge JSON properties into the copy + return mapper.readerForUpdating(copy).readValue(json); + } + + /** + * Deserializes a JsonNode and merges it with a default configuration object. + * + * @param mapper the ObjectMapper to use + * @param node the JsonNode to deserialize + * @param configClass the configuration class + * @param defaultConfig the default configuration (will NOT be modified) + * @param <T> the configuration type + * @return a new object with defaults merged with JSON properties + * @throws IOException if deserialization fails + */ + public static <T> T mergeWithDefaults(ObjectMapper mapper, JsonNode node, + Class<T> configClass, T defaultConfig) throws IOException { + if (defaultConfig == null) { + return mapper.treeToValue(node, configClass); + } + + // Create a deep copy of defaultConfig to preserve immutability + @SuppressWarnings("unchecked") + T copy = mapper.convertValue(defaultConfig, (Class<T>) defaultConfig.getClass()); + + // Merge JSON properties into the copy + return mapper.readerForUpdating(copy).readValue(node); + } + + /** + * Deserializes JSON to a configuration object without merging. 
+ * + * @param mapper the ObjectMapper to use + * @param json the JSON string to deserialize + * @param configClass the configuration class + * @param <T> the configuration type + * @return the deserialized object + * @throws IOException if deserialization fails + */ + public static <T> T deserialize(ObjectMapper mapper, String json, + Class<T> configClass) throws IOException { + return mapper.readValue(json, configClass); + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java index 3c13696e47..d7223b3359 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ConfigDeserializer.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.loader.JsonMergeUtils; import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; import org.apache.tika.parser.ParseContext; @@ -80,20 +81,8 @@ public class ConfigDeserializer { if (jsonConfig == null) { return defaultConfig; } - String configJson = jsonConfig.json(); - // If there's a default config, merge the user config on top of it - if (defaultConfig != null) { - // IMPORTANT: Clone the default config first to preserve immutability - // Never modify the original defaultConfig as it may be reused across requests - T configCopy = MAPPER.convertValue(defaultConfig, configClass); - - // Now update the copy with user config - return MAPPER.readerForUpdating(configCopy).readValue(configJson); - } else { - // No default config, just deserialize the user config - return MAPPER.readValue(configJson, configClass); - } + return JsonMergeUtils.mergeWithDefaults(MAPPER, jsonConfig.json(), configClass, defaultConfig); } /** diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index 4cfb95ff72..72b535ead0 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -20,7 +20,6 @@ import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT import java.io.IOException; import java.util.Iterator; -import java.util.Map; import com.fasterxml.jackson.core.JacksonException; import com.fasterxml.jackson.core.JsonParser; @@ -31,6 +30,20 @@ import com.fasterxml.jackson.databind.JsonNode; import org.apache.tika.config.ConfigContainer; import org.apache.tika.parser.ParseContext; +/** + * Deserializes ParseContext from JSON using friendly names. + * <p> + * All fields are stored in ConfigContainer for later resolution. + * Components are resolved at runtime via {@link ParseContextUtils#resolveAll}. + * <p> + * Example input: + * <pre> + * { + * "pdf-parser": {"extractActions": true}, + * "tika-task-timeout": {"timeoutMillis": 5000} + * } + * </pre> + */ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { @Override @@ -43,11 +56,12 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { /** * Deserializes a ParseContext from a JsonNode. - * Uses a properly configured ObjectMapper with polymorphic type handling - * to ensure objects in the ParseContext are deserialized correctly. + * <p> + * All fields are stored as JSON in ConfigContainer. Resolution to actual + * objects happens later via {@link ParseContextUtils#resolveAll}. 
* * @param jsonNode the JSON node containing the ParseContext data - * @return the deserialized ParseContext + * @return the deserialized ParseContext with ConfigContainer populated * @throws IOException if deserialization fails */ public static ParseContext readParseContext(JsonNode jsonNode) throws IOException { @@ -61,39 +75,15 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { ParseContext parseContext = new ParseContext(); - // Handle legacy "objects" field - deserialize directly into ParseContext - if (contextNode.has("objects")) { - JsonNode objectsNode = contextNode.get("objects"); - for (Map.Entry<String, JsonNode> entry : objectsNode.properties()) { - String superClassName = entry.getKey(); - JsonNode objectNode = entry.getValue(); - - try { - Class<?> superClass = Class.forName(superClassName); - - // Let Jackson handle polymorphic deserialization with type info - // Security is enforced by the PolymorphicTypeValidator in the mapper - Object deserializedObject = ParseContextSerializer.POLYMORPHIC_MAPPER.treeToValue(objectNode, Object.class); - - parseContext.set((Class) superClass, deserializedObject); - } catch (ClassNotFoundException ex) { - throw new IOException("Class not found: " + superClassName, ex); - } - } - } - - // Store all non-"objects" fields as named configurations in ConfigContainer - // This allows parsers to look up their config by friendly name (e.g., "pdf-parser") - // matching the same format used in tika-config.json + // Store all fields as named configurations in ConfigContainer + // Resolution to actual objects happens via ParseContextUtils.resolveAll() ConfigContainer configContainer = null; for (Iterator<String> it = contextNode.fieldNames(); it.hasNext(); ) { String fieldName = it.next(); - if (!"objects".equals(fieldName)) { - if (configContainer == null) { - configContainer = new ConfigContainer(); - } - configContainer.set(fieldName, contextNode.get(fieldName).toString()); + if (configContainer 
== null) { + configContainer = new ConfigContainer(); } + configContainer.set(fieldName, contextNode.get(fieldName).toString()); } if (configContainer != null) { @@ -102,5 +92,4 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } - } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java index 998e5c49e7..a4bb139d62 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java @@ -17,7 +17,9 @@ package org.apache.tika.serialization; import java.io.IOException; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonSerializer; @@ -27,63 +29,95 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; +import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; +/** + * Serializes ParseContext to JSON using friendly names. 
+ * <p> + * Serializes: + * <ul> + * <li>ConfigContainer contents (JSON strings) - written as-is</li> + * <li>Objects in ParseContext that have registered friendly names - serialized via Jackson</li> + * </ul> + * <p> + * Example output: + * <pre> + * { + * "pdf-parser": {"extractActions": true}, + * "tika-task-timeout": {"timeoutMillis": 5000}, + * "handler-config": {"type": "XML", "parseMode": "RMETA"} + * } + * </pre> + */ public class ParseContextSerializer extends JsonSerializer<ParseContext> { - private static final Logger LOG = LoggerFactory.getLogger(ParseContextSerializer.class); + private static final Logger LOG = LoggerFactory.getLogger(ParseContextSerializer.class); public static final String PARSE_CONTEXT = "parseContext"; - /** - * Static ObjectMapper configured for polymorphic serialization/deserialization. - * Initialized once when the class is loaded to avoid creating a new mapper on each call. - * Package-private to allow ParseContextDeserializer to use the same mapper. 
- */ - static final ObjectMapper POLYMORPHIC_MAPPER = PolymorphicObjectMapperFactory.getMapper(); + private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + + // Lazily loaded registry for looking up friendly names + private static volatile ComponentRegistry registry; + + private static ComponentRegistry getRegistry() { + if (registry == null) { + synchronized (ParseContextSerializer.class) { + if (registry == null) { + try { + registry = new ComponentRegistry("other-configs", + ParseContextSerializer.class.getClassLoader()); + } catch (TikaConfigException e) { + LOG.warn("Failed to load component registry for serialization", e); + // Return null - objects without friendly names won't be serialized + } + } + } + } + return registry; + } @Override public void serialize(ParseContext parseContext, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { jsonGenerator.writeStartObject(); - Map<String, Object> contextMap = parseContext.getContextMap(); + Set<String> writtenKeys = new HashSet<>(); ConfigContainer configContainer = parseContext.get(ConfigContainer.class); - // Serialize objects stored directly in ParseContext (legacy format) - // These are objects set via context.set(SomeClass.class, someObject) - boolean hasNonConfigObjects = contextMap.size() > (configContainer != null ? 
1 : 0); - if (hasNonConfigObjects) { - jsonGenerator.writeFieldName("objects"); - jsonGenerator.writeStartObject(); + // First, write ConfigContainer contents (these are already JSON strings) + if (configContainer != null) { + for (String key : configContainer.getKeys()) { + jsonGenerator.writeFieldName(key); + jsonGenerator.writeRawValue(configContainer.get(key).get().json()); + writtenKeys.add(key); + } + } + // Then, serialize objects from ParseContext that have registered friendly names + ComponentRegistry reg = getRegistry(); + if (reg != null) { + Map<String, Object> contextMap = parseContext.getContextMap(); for (Map.Entry<String, Object> entry : contextMap.entrySet()) { - String className = entry.getKey(); - if (className.equals(ConfigContainer.class.getName())) { + // Skip ConfigContainer - already handled above + if (entry.getKey().equals(ConfigContainer.class.getName())) { continue; } Object value = entry.getValue(); + if (value == null) { + continue; + } - // Write the field name (superclass/interface name from key) - jsonGenerator.writeFieldName(className); - - // Let Jackson handle type information and serialization - // Use writerFor(Object.class) to ensure polymorphic type info is added - POLYMORPHIC_MAPPER.writerFor(Object.class).writeValue(jsonGenerator, value); - } - - jsonGenerator.writeEndObject(); - } - - // Write ConfigContainer fields as top-level properties (new friendly-name format) - // Each field contains a JSON string representing a parser/component configuration - // using the same friendly names as tika-config.json (e.g., "pdf-parser", "html-parser") - if (configContainer != null) { - for (String key : configContainer.getKeys()) { - jsonGenerator.writeFieldName(key); - // Write the JSON string as raw JSON (not as a quoted string) - jsonGenerator.writeRawValue(configContainer.get(key).get().json()); + // Look up friendly name for this object's class + String friendlyName = reg.getFriendlyName(value.getClass()); + if (friendlyName != 
null && !writtenKeys.contains(friendlyName)) { + jsonGenerator.writeFieldName(friendlyName); + MAPPER.writeValue(jsonGenerator, value); + writtenKeys.add(friendlyName); + } } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java new file mode 100644 index 0000000000..a69798a3f6 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.serialization; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.ConfigContainer; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.loader.ComponentInfo; +import org.apache.tika.config.loader.ComponentRegistry; +import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.parser.ParseContext; + +/** + * Utility methods for working with ParseContext objects in JSON-based configurations. + * <p> + * Supports both legacy verbose format and new friendly-name format: + * <pre> + * // Legacy format: + * "parse-context": { + * "objects": { + * "org.apache.tika.config.TikaTaskTimeout": { + * "@class": "org.apache.tika.config.TikaTaskTimeout", + * "timeoutMillis": 30000 + * } + * } + * } + * + * // New friendly-name format: + * "parse-context": { + * "tika-task-timeout": { + * "timeoutMillis": 30000 + * } + * } + * </pre> + * <p> + * Components that implement {@link org.apache.tika.config.SelfConfiguring} are skipped + * during resolution - they read their own config from ConfigContainer at runtime. + */ +public class ParseContextUtils { + + private static final Logger LOG = LoggerFactory.getLogger(ParseContextUtils.class); + private static final ObjectMapper MAPPER = PolymorphicObjectMapperFactory.getMapper(); + + /** + * Known interfaces that should be used as ParseContext keys. + * When a component implements one of these interfaces, the interface is used as + * the key in ParseContext instead of the concrete class. + * <p> + * These are NOT auto-discovered via SPI - they require explicit configuration. 
+ */ + private static final List<Class<?>> KNOWN_CONTEXT_INTERFACES = List.of( + MetadataFilter.class + // Add other known interfaces as needed + ); + + /** + * Resolves all friendly-named components from ConfigContainer and adds them to ParseContext. + * <p> + * Iterates through all entries in ConfigContainer, looks up the friendly name in ComponentRegistry, + * deserializes the JSON, and adds the instance to ParseContext. + * <p> + * Components that implement {@link org.apache.tika.config.SelfConfiguring} are skipped - + * they read their own config from ConfigContainer at runtime. + * <p> + * The ParseContext key is determined by: + * <ol> + * <li>Explicit contextKey from @TikaComponent annotation (if specified)</li> + * <li>Auto-detected from {@link #KNOWN_CONTEXT_INTERFACES} (if component implements one)</li> + * <li>The component's own class (default)</li> + * </ol> + * + * @param context the ParseContext to populate + * @param classLoader the ClassLoader to use for loading component classes + */ + public static void resolveAll(ParseContext context, ClassLoader classLoader) { + if (context == null) { + return; + } + + ConfigContainer container = context.get(ConfigContainer.class); + if (container == null) { + return; + } + + try { + // Load the "other-configs" registry which includes parse-context components + ComponentRegistry registry = new ComponentRegistry("other-configs", classLoader); + + // Iterate through all configs in the container + for (String friendlyName : container.getKeys()) { + JsonConfig jsonConfig = container.get(friendlyName, null); + if (jsonConfig == null) { + continue; + } + + ComponentInfo info = null; + try { + // Try to find this friendly name in the registry + info = registry.getComponentInfo(friendlyName); + + // Skip self-configuring components - they handle their own config + if (info.selfConfiguring()) { + LOG.debug("'{}' is self-configuring, skipping resolution", friendlyName); + continue; + } + + // Determine the context key + 
Class<?> contextKey = determineContextKey(info, friendlyName); + + // Deserialize and add to ParseContext + Object instance = MAPPER.readValue(jsonConfig.json(), info.componentClass()); + context.set((Class) contextKey, instance); + + LOG.debug("Resolved '{}' -> {} with key {}", + friendlyName, info.componentClass().getName(), contextKey.getName()); + } catch (TikaConfigException e) { + // Not a registered component - that's okay, might be used for something else + LOG.debug("'{}' not found in other-configs registry, skipping", friendlyName); + } catch (IOException e) { + LOG.warn("Failed to deserialize component '{}' of type {}", friendlyName, + info != null ? info.componentClass().getName() : "unknown", e); + } + } + } catch (TikaConfigException e) { + LOG.warn("Failed to load other-configs registry for parse-context resolution", e); + } + } + + /** + * Determines the ParseContext key for a component. + * + * @param info the component info + * @param friendlyName the component's friendly name (for error messages) + * @return the class to use as ParseContext key + * @throws TikaConfigException if the component implements multiple known interfaces + * and no explicit contextKey is specified + */ + private static Class<?> determineContextKey(ComponentInfo info, String friendlyName) + throws TikaConfigException { + // Use explicit contextKey if provided + if (info.contextKey() != null) { + return info.contextKey(); + } + + // Auto-detect from known interfaces + List<Class<?>> matches = new ArrayList<>(); + for (Class<?> iface : KNOWN_CONTEXT_INTERFACES) { + if (iface.isAssignableFrom(info.componentClass())) { + matches.add(iface); + } + } + + if (matches.size() > 1) { + throw new TikaConfigException( + "Component '" + friendlyName + "' (" + info.componentClass().getName() + + ") implements multiple known context interfaces: " + matches + + ". Use @TikaComponent(contextKey=...) 
to specify which one to use."); + } + + // Use the single matched interface, or fall back to the component class + return matches.isEmpty() ? info.componentClass() : matches.get(0); + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index 61f7386233..fdd1ecbff5 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -17,12 +17,12 @@ package org.apache.tika.serialization; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.StringWriter; import java.io.Writer; -import java.util.List; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; @@ -32,12 +32,14 @@ import org.junit.jupiter.api.Test; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.loader.PolymorphicObjectMapperFactory; -import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; -import org.apache.tika.metadata.filter.CompositeMetadataFilter; -import org.apache.tika.metadata.filter.DateNormalizingMetadataFilter; -import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; +/** + * Tests for ParseContext serialization/deserialization. + * <p> + * All configs use friendly names and are stored in ConfigContainer. + * Components are resolved at runtime via ParseContextUtils.resolveAll(). 
+ */ public class TestParseContextSerialization { private ObjectMapper createMapper() { @@ -65,41 +67,24 @@ public class TestParseContextSerialization { } } - @Test - public void testBasic() throws Exception { - MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new DateNormalizingMetadataFilter())); + public void testEmptyParseContext() throws Exception { ParseContext pc = new ParseContext(); - pc.set(MetadataFilter.class, metadataFilter); + String json = serializeParseContext(pc); - ConfigContainer configContainer = new ConfigContainer(); - configContainer.set(EmbeddedDocumentBytesHandler.class, """ - {"k1":1,"k2":"val3" } - """); - pc.set(ConfigContainer.class, configContainer); + // Empty ParseContext should serialize to empty object ObjectMapper mapper = createMapper(); - String json; - try (Writer writer = new StringWriter()) { - try (JsonGenerator jsonGenerator = mapper - .getFactory() - .createGenerator(writer)) { - ParseContextSerializer serializer = new ParseContextSerializer(); - serializer.serialize(pc, jsonGenerator, null); - } - json = writer.toString(); - } + JsonNode root = mapper.readTree(json); + assertEquals(0, root.size(), "Empty ParseContext should have no fields"); + // Verify round-trip ParseContext deserialized = mapper.readValue(json, ParseContext.class); - MetadataFilter dMetadataFilter = deserialized.get(MetadataFilter.class); - assertTrue(dMetadataFilter instanceof CompositeMetadataFilter); - List<MetadataFilter> metadataFilters = ((CompositeMetadataFilter) dMetadataFilter).getFilters(); - assertEquals(1, metadataFilters.size()); - assertTrue(metadataFilters.get(0) instanceof DateNormalizingMetadataFilter); + assertNotNull(deserialized); } @Test public void testFriendlyNameFormat() throws Exception { - // Test the new friendly-name format matching tika-config.json + // Test the friendly-name format ParseContext pc = new ParseContext(); ConfigContainer configContainer = new ConfigContainer(); @@ -111,10 +96,11 @@ public 
class TestParseContextSerialization { String json = serializeParseContext(pc); - // Verify JSON structure + // Verify JSON structure - should have flat friendly names, no "objects" wrapper ObjectMapper mapper = createMapper(); JsonNode root = mapper.readTree(json); + assertFalse(root.has("objects"), "Should NOT have objects field"); assertTrue(root.has("pdf-parser"), "Should have pdf-parser field"); assertTrue(root.has("html-parser"), "Should have html-parser field"); assertEquals("AUTO", root @@ -139,58 +125,30 @@ public class TestParseContextSerialization { } @Test - public void testLegacyObjectsFormat() throws Exception { - // Test the legacy format with "objects" field - MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new DateNormalizingMetadataFilter())); + public void testTikaTaskTimeoutFormat() throws Exception { + // Test serializing tika-task-timeout configuration ParseContext pc = new ParseContext(); - pc.set(MetadataFilter.class, metadataFilter); - - String json = serializeParseContext(pc); - - // Verify JSON has "objects" field - ObjectMapper mapper = createMapper(); - JsonNode root = mapper.readTree(json); - assertTrue(root.has("objects"), "Should have objects field for legacy format"); - - // Verify round-trip - ParseContext deserialized = mapper.readValue(json, ParseContext.class); - MetadataFilter deserializedFilter = deserialized.get(MetadataFilter.class); - assertNotNull(deserializedFilter); - assertTrue(deserializedFilter instanceof CompositeMetadataFilter); - } - - @Test - public void testMixedFormat() throws Exception { - // Test that both legacy objects and new friendly names can coexist - MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new DateNormalizingMetadataFilter())); - ParseContext pc = new ParseContext(); - pc.set(MetadataFilter.class, metadataFilter); - ConfigContainer configContainer = new ConfigContainer(); - configContainer.set("pdf-parser", "{\"ocrStrategy\":\"NO_OCR\"}"); + + 
configContainer.set("tika-task-timeout", "{\"timeoutMillis\":30000}"); pc.set(ConfigContainer.class, configContainer); String json = serializeParseContext(pc); - // Verify both formats are present ObjectMapper mapper = createMapper(); JsonNode root = mapper.readTree(json); - assertTrue(root.has("objects"), "Should have objects field"); - assertTrue(root.has("pdf-parser"), "Should have pdf-parser field"); + + assertTrue(root.has("tika-task-timeout"), "Should have tika-task-timeout field"); + assertEquals(30000, root + .get("tika-task-timeout") + .get("timeoutMillis") + .asInt()); // Verify round-trip ParseContext deserialized = mapper.readValue(json, ParseContext.class); - - // Check legacy object - MetadataFilter deserializedFilter = deserialized.get(MetadataFilter.class); - assertNotNull(deserializedFilter); - assertTrue(deserializedFilter instanceof CompositeMetadataFilter); - - // Check friendly-name config ConfigContainer deserializedConfig = deserialized.get(ConfigContainer.class); - assertNotNull(deserializedConfig); assertTrue(deserializedConfig - .get("pdf-parser") + .get("tika-task-timeout") .isPresent()); } @@ -208,10 +166,9 @@ public class TestParseContextSerialization { // Test hasConfig assertTrue(ConfigDeserializer.hasConfig(pc, "pdf-parser")); + assertFalse(ConfigDeserializer.hasConfig(pc, "non-existent")); - // Test getConfig with a simple JSON deserialization - // We can't use actual PDFParserConfig here since we don't have the dependency, - // but we can verify the JSON is retrieved correctly + // Test getConfig retrieves JSON correctly String retrievedConfig = pc .get(ConfigContainer.class) .get("pdf-parser").get().json(); @@ -246,7 +203,7 @@ public class TestParseContextSerialization { .get("html-parser") .isPresent()); - // Verify the JSON content + // Verify the JSON content is preserved String pdfParserJson = config .get("pdf-parser").get().json(); assertTrue(pdfParserJson.contains("AUTO")); @@ -254,34 +211,72 @@ public class 
TestParseContextSerialization { } @Test - public void testDeserializeMixedFromJSON() throws Exception { - // Test deserializing JSON with both legacy objects and friendly names - // First create the ParseContext and serialize it to get the correct format - MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new DateNormalizingMetadataFilter())); - ParseContext pc = new ParseContext(); - pc.set(MetadataFilter.class, metadataFilter); + public void testDeserializeWithParseContextWrapper() throws Exception { + // Test deserializing with optional "parseContext" wrapper + String json = """ + { + "parseContext": { + "pdf-parser": { + "ocrStrategy": "NO_OCR" + } + } + } + """; + + ObjectMapper mapper = createMapper(); + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + ConfigContainer config = deserialized.get(ConfigContainer.class); + assertNotNull(config); + assertTrue(config + .get("pdf-parser") + .isPresent()); + } + + @Test + public void testMultipleConfigs() throws Exception { + // Test with multiple different config types + ParseContext pc = new ParseContext(); ConfigContainer configContainer = new ConfigContainer(); + configContainer.set("pdf-parser", "{\"ocrStrategy\":\"AUTO\"}"); + configContainer.set("html-parser", "{\"extractScripts\":true}"); + configContainer.set("tika-task-timeout", "{\"timeoutMillis\":5000}"); + configContainer.set("my-custom-config", "{\"enabled\":true,\"maxRetries\":3}"); + pc.set(ConfigContainer.class, configContainer); - // Serialize to JSON + String json = serializeParseContext(pc); + + // Verify all are present ObjectMapper mapper = createMapper(); - String json = mapper.writeValueAsString(pc); + JsonNode root = mapper.readTree(json); + + assertEquals(4, root.size(), "Should have 4 config fields"); + assertTrue(root.has("pdf-parser")); + assertTrue(root.has("html-parser")); + assertTrue(root.has("tika-task-timeout")); + assertTrue(root.has("my-custom-config")); - // Now deserialize it back + // 
Verify round-trip ParseContext deserialized = mapper.readValue(json, ParseContext.class); + ConfigContainer deserializedConfig = deserialized.get(ConfigContainer.class); + assertEquals(4, deserializedConfig.getKeys().size()); + } - // Verify legacy object was deserialized - MetadataFilter filter = deserialized.get(MetadataFilter.class); - assertNotNull(filter); - assertTrue(filter instanceof CompositeMetadataFilter); + @Test + public void testProgrammaticObjectsWithoutFriendlyName() throws Exception { + // Objects without a registered friendly name are NOT serialized + ParseContext pc = new ParseContext(); - // Verify friendly-name config was stored - ConfigContainer config = deserialized.get(ConfigContainer.class); - assertNotNull(config); - assertTrue(config - .get("pdf-parser") - .isPresent()); + // String doesn't have a @TikaComponent annotation, so it won't serialize + pc.set(String.class, "test-value"); + + String json = serializeParseContext(pc); + + // Should be empty - String doesn't have a friendly name + ObjectMapper mapper = createMapper(); + JsonNode root = mapper.readTree(json); + assertEquals(0, root.size(), "Objects without friendly names should not be serialized"); } } diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index c92a0d0a82..d15fbfce50 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -42,6 +42,11 @@ <artifactId>tika-pipes-core</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-translate</artifactId> diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java index 
0f1a9d005a..59ce60f35c 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java @@ -22,8 +22,6 @@ import java.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.server.core.config.TimeoutConfig; - public class ServerStatusWatcher implements Runnable { @@ -67,9 +65,10 @@ public class ServerStatusWatcher implements Runnable { if (millisElapsed > status.timeoutMillis) { serverStatus.setStatus(ServerStatus.STATUS.TIMEOUT); LOG.error("Timeout task {}, millis elapsed {}; " + - "consider increasing the allowable time with the " - + "<taskTimeoutMillis/> parameter or the {} header", - status.task.toString(), millisElapsed, TimeoutConfig.X_TIKA_TIMEOUT_MILLIS); + "consider increasing the allowable time via: " + + "server config (\"taskTimeoutMillis\") or " + + "per-request config (\"tika-task-timeout\" in parse-context)", + status.task.toString(), millisElapsed); shutdown(); } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java index 8d2d90eab0..ce048cd6b9 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java @@ -29,6 +29,7 @@ import jakarta.ws.rs.core.UriInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesResource.java 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesResource.java index 77d807902a..1ffd7d05d8 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesResource.java @@ -45,6 +45,7 @@ import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.pipes.core.PipesException; import org.apache.tika.pipes.core.PipesParser; import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple; +import org.apache.tika.serialization.ParseContextUtils; @Path("/pipes") public class PipesResource { @@ -91,6 +92,8 @@ public class PipesResource { try (Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8)) { t = JsonFetchEmitTuple.fromJson(reader); } + // Resolve friendly-named configs in ParseContext to actual objects + ParseContextUtils.resolveAll(t.getParseContext(), getClass().getClassLoader()); return processTuple(t); } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 8bdb469e4b..b107c87d2d 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -27,11 +27,17 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.lang.reflect.Field; import java.lang.reflect.Method; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import 
javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; @@ -56,6 +62,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.Strings; import org.apache.cxf.attachment.ContentDisposition; import org.apache.cxf.jaxrs.ext.multipart.Attachment; +import org.apache.cxf.jaxrs.ext.multipart.MultipartBody; import org.apache.cxf.jaxrs.impl.MetadataMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,6 +80,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.serialization.ParseContextDeserializer; +import org.apache.tika.serialization.ParseContextUtils; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; @@ -364,27 +373,6 @@ public class TikaResource { } } - protected static long getTaskTimeout(ParseContext parseContext) { - - TikaTaskTimeout tikaTaskTimeout = parseContext.get(TikaTaskTimeout.class); - long timeoutMillis = TIKA_SERVER_CONFIG.getTaskTimeoutMillis(); - - if (tikaTaskTimeout != null) { - if (tikaTaskTimeout.getTimeoutMillis() > TIKA_SERVER_CONFIG.getTaskTimeoutMillis()) { - throw new IllegalArgumentException( - "Can't request a timeout ( " + tikaTaskTimeout.getTimeoutMillis() + "ms) greater than the taskTimeoutMillis set in the server config (" + - TIKA_SERVER_CONFIG.getTaskTimeoutMillis() + "ms)"); - } - timeoutMillis = tikaTaskTimeout.getTimeoutMillis(); - if (timeoutMillis < TIKA_SERVER_CONFIG.getMinimumTimeoutMillis()) { - throw new WebApplicationException(new IllegalArgumentException( - "taskTimeoutMillis must be > " + "minimumTimeoutMillis, currently set to (" + TIKA_SERVER_CONFIG.getMinimumTimeoutMillis() + "ms)"), - Response.Status.BAD_REQUEST); - } - } - return 
timeoutMillis;
-    }
-
     public static void checkIsOperating() {
         //check that server is not in shutdown mode
         if (!SERVER_STATUS.isOperating()) {
@@ -415,6 +403,10 @@ public class TikaResource {
         return DEFAULT_HANDLER_CONFIG.isThrowOnWriteLimitReached();
     }
 
+    public static long getTaskTimeout(ParseContext parseContext) {
+        return TikaTaskTimeout.getTimeoutMillis(parseContext, TIKA_SERVER_CONFIG.getTaskTimeoutMillis());
+    }
+
     @GET
     @Produces("text/plain")
     public String getMessage() {
@@ -427,9 +419,53 @@ public class TikaResource {
     @Produces("text/plain")
     @Path("form")
     public StreamingOutput getTextFromMultipart(Attachment att, @Context HttpHeaders httpHeaders, @Context final UriInfo info) throws TikaConfigException, IOException {
+        LOG.debug("===== getTextFromMultipart (single Attachment) CALLED =====");
         return produceText(att.getObject(InputStream.class), new Metadata(), preparePostHeaderMap(att, httpHeaders), info);
     }
 
+    // Greenfield test endpoint for multipart with config; a missing part is reported as 400 Bad Request
+    @POST
+    @Consumes("multipart/form-data")
+    @Produces("text/plain")
+    @Path("test-config")
+    public StreamingOutput testMultipartWithConfig(
+            List<Attachment> attachments,
+            @Context HttpHeaders httpHeaders,
+            @Context final UriInfo info) throws TikaConfigException, IOException {
+        LOG.debug("===== testMultipartWithConfig CALLED with {} attachments =====", attachments.size());
+
+        // Find the file and config attachments
+        Attachment fileAtt = null;
+        Attachment configAtt = null;
+
+        for (Attachment att : attachments) {
+            ContentDisposition cd = att.getContentDisposition();
+            if (cd != null) {
+                String name = cd.getParameter("name");
+                LOG.debug("Found attachment with name: {}", name);
+                if ("file".equals(name)) {
+                    fileAtt = att;
+                } else if ("config".equals(name)) {
+                    configAtt = att;
+                }
+            }
+        }
+
+        if (fileAtt == null) {
+            throw new WebApplicationException(new IllegalArgumentException("Missing 'file' attachment"), Response.Status.BAD_REQUEST);
+        }
+        if (configAtt == null) {
+            throw new WebApplicationException(new IllegalArgumentException("Missing 'config' attachment"), Response.Status.BAD_REQUEST);
+        }
+
+        final
Metadata metadata = new Metadata();
+        MultivaluedMap<String, String> headers = preparePostHeaderMap(fileAtt, httpHeaders);
+        return produceTextWithConfig(
+                getInputStream(fileAtt.getObject(InputStream.class), metadata, httpHeaders, info),
+                configAtt.getObject(InputStream.class),
+                metadata, headers, info);
+    }
+
     //this is equivalent to text-main in tika-app
     @PUT
     @Consumes("*/*")
@@ -471,10 +507,63 @@ public class TikaResource {
     @Consumes("*/*")
     @Produces("text/plain")
     public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) throws TikaConfigException, IOException {
+        LOG.debug("===== getText (PUT, no @Path) CALLED =====");
         final Metadata metadata = new Metadata();
         return produceText(getInputStream(is, metadata, httpHeaders, info), metadata, httpHeaders.getRequestHeaders(), info);
     }
 
+    /**
+     * Produces text output with per-request ParseContext configuration.
+     * Extracts only the parse-context section from the config to allow per-request
+     * configuration of ParseContext objects (e.g., timeout, handler settings).
+     * Uses the server's configured parser to preserve any parser configuration from startup.
+     */
+    private StreamingOutput produceTextWithConfig(final InputStream fileStream, InputStream configStream,
+                    final Metadata metadata, MultivaluedMap<String, String> httpHeaders, final UriInfo info)
+            throws TikaConfigException, IOException {
+
+        // Use the server's configured parser (not a new one from the config)
+        final Parser parser = createParser();
+        final ParseContext context = new ParseContext();
+
+        // Read the config JSON to extract only the parse-context section
+        String configJson = new String(configStream.readAllBytes(), StandardCharsets.UTF_8);
+        ObjectMapper mapper = new ObjectMapper();
+        JsonNode rootNode = mapper.readTree(configJson);
+
+        JsonNode parseContextNode = rootNode.get("parse-context");
+        LOG.debug("found parseContext: {}", parseContextNode);
+        if (parseContextNode != null) {
+            // Deserialize parseContext section
+            ParseContext configuredContext = ParseContextDeserializer.readParseContext(parseContextNode);
+
+            // Resolve all friendly-named components from ConfigContainer
+            // For example: "tika-task-timeout" -> TikaTaskTimeout instance
+            ParseContextUtils.resolveAll(configuredContext, Thread.currentThread().getContextClassLoader());
+
+            // Merge configured context into our context
+            for (Map.Entry<String, Object> entry : configuredContext.getContextMap().entrySet()) {
+                try {
+                    Class<?> clazz = Class.forName(entry.getKey());
+                    context.set((Class) clazz, entry.getValue());
+                } catch (ClassNotFoundException e) {
+                    LOG.warn("Could not load class for parseContext entry: {}", entry.getKey(), e);
+                }
+            }
+        }
+
+        fillMetadata(parser, metadata, httpHeaders);
+        fillParseContext(httpHeaders, metadata, context);
+
+        logRequest(LOG, "/tika (with config)", metadata);
+
+        return outputStream -> {
+            Writer writer = new OutputStreamWriter(outputStream, UTF_8);
+            BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));
+            parse(parser, LOG, info.getPath(), fileStream, body, metadata, context);
+        };
+    }
+
     public StreamingOutput
produceText(final InputStream is, final Metadata metadata, MultivaluedMap<String, String> httpHeaders, final UriInfo info) throws TikaConfigException, IOException {
         final Parser parser = createParser();
 
@@ -499,6 +588,8 @@ public class TikaResource {
     @Produces("text/html")
     @Path("form")
     public StreamingOutput getHTMLFromMultipart(Attachment att, @Context HttpHeaders httpHeaders, @Context final UriInfo info) throws TikaConfigException, IOException {
+        LOG.debug("loading multipart html");
+
         return produceOutput(att.getObject(InputStream.class), new Metadata(), preparePostHeaderMap(att, httpHeaders), info, "html");
     }
 
@@ -515,6 +606,8 @@ public class TikaResource {
     @Produces("text/xml")
     @Path("form")
     public StreamingOutput getXMLFromMultipart(Attachment att, @Context HttpHeaders httpHeaders, @Context final UriInfo info) throws TikaConfigException, IOException {
+        LOG.debug("loading multipart xml");
+
         return produceOutput(att.getObject(InputStream.class), new Metadata(), preparePostHeaderMap(att, httpHeaders), info, "xml");
     }
 
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
index 74eec7574f..7bff9149c0 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaPipesTest.java
@@ -210,7 +210,6 @@ public class TikaPipesTest extends CXFTestBase {
                 new EmitKey(EMITTER_JSON_ID, ""), userMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
         StringWriter writer = new StringWriter();
         JsonFetchEmitTuple.toJson(t, writer);
-
         String getUrl = endPoint + PIPES_PATH;
         Response response = WebClient
                 .create(getUrl)
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
index 77608bc7d8..50b341eaab 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
@@ -42,6 +42,9 @@ import org.apache.cxf.configuration.security.KeyManagersType;
 import org.apache.cxf.configuration.security.KeyStoreType;
 import org.apache.cxf.configuration.security.TrustManagersType;
 import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 import org.apache.cxf.transport.http.HTTPConduit;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
@@ -52,7 +55,6 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.serialization.JsonMetadataList;
-import org.apache.tika.server.core.config.TimeoutConfig;
 import org.apache.tika.utils.ProcessUtils;
 
 public class TikaServerIntegrationTest extends IntegrationTestBase {
@@ -120,19 +122,10 @@ public class TikaServerIntegrationTest extends IntegrationTestBase {
         testStopped(2000);
     }
 
-    @Test
-    public void testMinimumTimeoutInHeader() throws Exception {
-        startProcess(new String[]{"-config", getConfig("tika-config-server-basic.json")});
-        awaitServerStartup();
-
-        Response response = WebClient
-                .create(endPoint + RMETA_PATH)
-                .accept("application/json")
-                .header(TimeoutConfig.X_TIKA_TIMEOUT_MILLIS, 1)
-                .put(ClassLoader.getSystemResourceAsStream(TEST_HEAVY_HANG));
-        assertEquals(Response.Status.BAD_REQUEST.getStatusCode(), response.getStatus());
-    }
-
+    /**
+     * Test legacy header-based timeout configuration.
+     * This exercises the old header-based approach; see testTaskTimeoutMultipart for the JSON-based one.
+     */
     @Test
     public void testTaskTimeoutHeader() throws Exception {
 
@@ -153,6 +146,41 @@ public class TikaServerIntegrationTest extends IntegrationTestBase {
         testStopped(2000);
     }
 
+    /**
+     * Test JSON-based timeout configuration via multipart request.
+     * This is the new recommended approach for configuring timeouts.
+     */
+    @Test
+    public void testTaskTimeoutMultipart() throws Exception {
+        startProcess(new String[]{"-config", getConfig("tika-config-server-basic.json")});
+        awaitServerStartup();
+
+        // Create multipart form with file and config
+        ContentDisposition fileCD = new ContentDisposition("form-data; name=\"file\"; filename=\"heavy_hang_30000.xml\"");
+        ContentDisposition configCD = new ContentDisposition("form-data; name=\"config\"; filename=\"tika-config-timeout-100ms.json\"");
+
+        MultipartBody multipart = new MultipartBody(List.of(
+                new Attachment("file", ClassLoader.getSystemResourceAsStream(TEST_HEAVY_HANG), fileCD),
+                new Attachment("config", ClassLoader.getSystemResourceAsStream("configs/tika-config-timeout-100ms.json"), configCD)
+        ));
+
+        Response response = null;
+        try {
+            LOG.info("TEST: Sending request to: {}", endPoint + "/tika/test-config");
+            response = WebClient
+                    .create(endPoint + "/tika/test-config")
+                    .accept("text/plain")
+                    .type("multipart/form-data")
+                    .post(multipart);
+            LOG.info("TEST: Response status: {}", response != null ?
response.getStatus() : "null");
+        } catch (Exception e) {
+            LOG.info("TEST: Exception during request", e);
+            //timeout may or may not cause an exception depending on timing
+        }
+        //give some time for the server to crash/terminate itself
+        testStopped(2000);
+    }
+
     private void testStopped(long millis) throws InterruptedException {
         Thread.sleep(millis);
         try {
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
index b9b5ae88e2..5ba494c44d 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java
@@ -188,6 +188,50 @@ public class TikaServerPipesIntegrationTest extends IntegrationTestBase {
                 .asText());
     }
 
+    @Test
+    public void testPerRequestTimeout() throws Exception {
+        // Start server with 5000ms timeout (TIKA_CONFIG)
+        // but send a request with 100ms per-request timeout
+        // This should timeout after 100ms, not 5000ms
+        startProcess(new String[]{
+                "-config", ProcessUtils.escapeCommandLine(TIKA_CONFIG
+                .toAbsolutePath()
+                .toString())});
+        JsonNode node = testOneWithPerRequestTimeout("heavy_hang_30000.xml", 100);
+        assertEquals("process_crash", node
+                .get("status")
+                .asText());
+        assertContains("TIMEOUT", node
+                .get("type")
+                .asText());
+    }
+
+    private JsonNode testOneWithPerRequestTimeout(String fileName, long timeoutMillis) throws Exception {
+        awaitServerStartup();
+        Response response = WebClient
+                .create(endPoint + "/pipes")
+                .accept("application/json")
+                .post(getJsonStringWithTimeout(fileName, timeoutMillis));
+        // Fail on an unexpected HTTP status here rather than returning null and hitting an NPE in the caller
+        assertEquals(200, response.getStatus());
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            return new ObjectMapper().readTree(reader);
+        }
+    }
+
+    private
String getJsonStringWithTimeout(String fileName, long timeoutMillis) throws IOException {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG);
+        parseContext.addConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}");
+
+        FetchEmitTuple t = new FetchEmitTuple(fileName,
+                new FetchKey(CXFTestBase.FETCHER_ID, fileName),
+                new EmitKey(CXFTestBase.EMITTER_JSON_ID, ""),
+                new Metadata(),
+                parseContext,
+                FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
+        return JsonFetchEmitTuple.toJson(t);
+    }
 
     private JsonNode testOne(String fileName, boolean shouldFileExist) throws Exception {
         return testOne(fileName, shouldFileExist, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
diff --git a/tika-server/tika-server-core/src/test/resources/configs/tika-config-server-basic.json b/tika-server/tika-server-core/src/test/resources/configs/tika-config-server-basic.json
index 5075150c45..7cc6ce5411 100644
--- a/tika-server/tika-server-core/src/test/resources/configs/tika-config-server-basic.json
+++ b/tika-server/tika-server-core/src/test/resources/configs/tika-config-server-basic.json
@@ -6,7 +6,8 @@
     "port": 9999,
     "endpoints": [
       "rmeta",
-      "status"
+      "status",
+      "tika"
     ]
   }
 }
