This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4639 in repository https://gitbox.apache.org/repos/asf/tika.git
commit ecf3a7cf1a1c59945f267e7be60687dbc9aee7a2 Author: tallison <[email protected]> AuthorDate: Fri Jan 30 08:14:29 2026 -0500 simplify configurations --- .../tika/annotation/TikaComponentProcessor.java | 118 +++++++++++++++------ .../java/org/apache/tika/config/TikaComponent.java | 25 +++++ .../org/apache/tika/config/EmbeddedLimits.java | 2 +- .../java/org/apache/tika/config/OutputLimits.java | 2 +- .../java/org/apache/tika/config/TimeoutLimits.java | 2 +- .../java/org/apache/tika/digest/DigestHelper.java | 12 +-- .../org/apache/tika/digest/DigesterFactory.java | 18 ++-- .../org/apache/tika/parser/AutoDetectParser.java | 2 +- .../tika/sax/BasicContentHandlerFactory.java | 2 +- .../java/org/apache/tika/sax/SAXOutputConfig.java | 3 + .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 2 +- .../digestutils/BouncyCastleDigesterFactory.java | 4 +- .../parser/digestutils/CommonsDigesterFactory.java | 4 +- .../tika/parser/AutoDetectParserConfigTest.java | 5 +- .../tika/parser/digest/DigestConfigTest.java | 2 +- .../digest/SkipContainerDocumentDigestTest.java | 2 +- .../src/test/resources/configs/tika-4533.json | 6 +- .../configs/tika-config-bc-digests-base32.json | 6 +- .../configs/tika-config-bc-digests-basic.json | 6 +- .../configs/tika-config-bc-digests-multiple.json | 6 +- .../configs/tika-config-commons-digests-basic.json | 6 +- .../configs/tika-config-digests-pdf-only.json | 6 +- .../tika-config-digests-skip-container.json | 6 +- .../resources/configs/tika-config-digests.json | 14 ++- .../resources/configs/tika-config-md5-digest.json | 6 +- .../resources/configs/tika-config-no-names.json | 6 +- ...a-config-upcasing-custom-handler-decorator.json | 12 +-- .../resources/configs/tika-config-with-names.json | 6 +- .../configs/tika-config-write-filter.json | 28 +++-- .../tika/pipes/core/server/ParseHandler.java | 2 +- .../apache/tika/pipes/core/server/PipesServer.java | 2 +- .../src/test/resources/configs/tika-4533.json | 6 +- 
.../test/resources/configs/tika-config-basic.json | 8 +- .../resources/configs/tika-config-passback.json | 8 +- .../resources/configs/tika-config-truncate.json | 14 +-- .../resources/configs/tika-config-uppercasing.json | 8 +- .../configs/tika-config-write-limiter.json | 16 ++- .../apache/tika/config/loader/ComponentInfo.java | 15 ++- .../tika/config/loader/ComponentRegistry.java | 41 +++++-- .../apache/tika/config/loader/ConfigLoader.java | 33 +++--- .../apache/tika/config/loader/TikaJsonConfig.java | 12 +-- .../org/apache/tika/config/loader/TikaLoader.java | 111 ++++++++++++++----- .../config/loader/TikaObjectMapperFactory.java | 2 +- .../tika/serialization/ParseContextUtils.java | 4 +- .../java/org/apache/tika/config/AllLimitsTest.java | 16 ++- .../writefilter/StandardMetadataLimiterTest.java | 10 +- .../TestParseContextSerialization.java | 2 +- .../test/resources/configs/TIKA-3695-exclude.json | 8 +- .../test/resources/configs/TIKA-3695-fields.json | 20 ++-- .../src/test/resources/configs/TIKA-3695.json | 14 ++- .../test/resources/configs/all-limits-test.json | 14 ++- .../resources/configs/embedded-limits-test.json | 2 +- .../test/resources/configs/output-limits-test.json | 2 +- .../test/resources/configs/test-config-loader.json | 2 +- .../resources/configs/test-interface-no-type.json | 2 +- .../test/resources/configs/test-invalid-class.json | 2 +- .../resources/configs/test-partial-config.json | 2 +- .../resources/configs/test-unexpected-field.json | 2 +- .../test/resources/configs/test-wrong-type.json | 2 +- .../resources/configs/timeout-limits-test.json | 2 +- .../server/core/resource/MetadataResource.java | 2 +- .../tika/server/core/resource/TikaResource.java | 8 +- .../org/apache/tika/server/core/CXFTestBase.java | 12 +-- .../resources/configs/cxf-test-base-template.json | 8 +- .../resources/configs/cxf-test-base-template.json | 8 +- .../configs/tika-config-for-server-tests.json | 8 +- .../tika-config-langdetect-opennlp-filter.json | 8 +- 
.../tika-config-langdetect-optimaize-filter.json | 8 +- 68 files changed, 429 insertions(+), 336 deletions(-) diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index cd757664e8..bcf862c4c3 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -64,6 +64,10 @@ public class TikaComponentProcessor extends AbstractProcessor { /** * Known Tika service interfaces for SPI generation. * Only classes implementing these interfaces will have SPI files generated. + * <p> + * Note: DigesterFactory and ContentHandlerFactory are NOT in this map because + * they are parse-context components, not top-level service interfaces. + * Their implementations go to parse-context.idx instead. */ private static final Map<String, String> SERVICE_INTERFACES = new LinkedHashMap<>(); @@ -76,11 +80,19 @@ public class TikaComponentProcessor extends AbstractProcessor { SERVICE_INTERFACES.put("org.apache.tika.language.translate.Translator", "translators"); SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers"); SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters"); - SERVICE_INTERFACES.put("org.apache.tika.digest.DigesterFactory", "digester-factories"); - SERVICE_INTERFACES.put("org.apache.tika.sax.ContentHandlerFactory", - "content-handler-factories"); } + /** + * Interfaces whose implementations should go to parse-context.idx. + * These are factory interfaces used via ParseContext, not loaded via SPI. 
+ */ + private static final Set<String> PARSE_CONTEXT_INTERFACES = Set.of( + "org.apache.tika.digest.DigesterFactory", + "org.apache.tika.sax.ContentHandlerFactory", + "org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory", + "org.apache.tika.extractor.EmbeddedDocumentExtractorFactory" + ); + private Messager messager; private Filer filer; @@ -134,45 +146,60 @@ public class TikaComponentProcessor extends AbstractProcessor { // Get contextKey if specified (need to use mirror API for Class types) String contextKey = getContextKeyFromAnnotation(element); + // Get defaultFor if specified (need to use mirror API for Class types) + String defaultFor = getDefaultForFromAnnotation(element); + messager.printMessage(Diagnostic.Kind.NOTE, "Processing @TikaComponent: " + className + " -> " + componentName + - " (SPI: " + includeSpi + ", contextKey: " + contextKey + ")"); + " (SPI: " + includeSpi + ", contextKey: " + contextKey + + ", defaultFor: " + defaultFor + ")"); - // Find all implemented service interfaces + // Find all implemented service interfaces (both SPI and parse-context) List<String> serviceInterfaces = findServiceInterfaces(element); + List<String> parseContextInterfaces = findParseContextInterfaces(element); + + // Combine all interfaces for context key detection + List<String> allInterfaces = new ArrayList<>(serviceInterfaces); + allInterfaces.addAll(parseContextInterfaces); - // Build the index entry value (className or className:key=X) + // Build the index entry value (className or className:key=X[:default]) // Auto-detect contextKey from service interface if not explicitly specified String indexValue = className; if (contextKey != null) { // Explicit contextKey specified indexValue = className + ":key=" + contextKey; - } else if (serviceInterfaces.size() == 1) { - // Auto-detect contextKey from single service interface - indexValue = className + ":key=" + serviceInterfaces.get(0); + } else if (allInterfaces.size() == 1) { + // Auto-detect 
contextKey from single interface + indexValue = className + ":key=" + allInterfaces.get(0); messager.printMessage(Diagnostic.Kind.NOTE, - "Auto-detected contextKey=" + serviceInterfaces.get(0) + " for " + className); - } else if (serviceInterfaces.size() > 1) { + "Auto-detected contextKey=" + allInterfaces.get(0) + " for " + className); + } else if (allInterfaces.size() > 1) { // Multiple interfaces - warn that contextKey should be specified messager.printMessage(Diagnostic.Kind.WARNING, - "Class " + className + " implements multiple service interfaces: " + - serviceInterfaces + ". Consider specifying @TikaComponent(contextKey=...) " + + "Class " + className + " implements multiple interfaces: " + + allInterfaces + ". Consider specifying @TikaComponent(contextKey=...) " + "to select which one to use as ParseContext key.", element); } - if (serviceInterfaces.isEmpty()) { - // No known service interface - put in other-configs.idx + // Add :default marker if defaultFor is specified + if (defaultFor != null) { + indexValue = indexValue + ":default"; + } + + // Check if this is a parse-context component (implements a parse-context interface + // or doesn't implement any known service interface) + if (!parseContextInterfaces.isEmpty() || serviceInterfaces.isEmpty()) { + // Put in parse-context.idx messager.printMessage(Diagnostic.Kind.NOTE, - "Class " + className + " does not implement known service interface, " + - "adding to other-configs.idx", element); + "Class " + className + " is a parse-context component, " + + "adding to parse-context.idx", element); - Map<String, String> index = indexFiles.computeIfAbsent("other-configs", + Map<String, String> index = indexFiles.computeIfAbsent("parse-context", k -> new LinkedHashMap<>()); addToIndex(index, componentName, indexValue, className, element); - return; } - // Process each service interface + // Process SPI service interfaces (these also get their own idx files) for (String serviceInterface : serviceInterfaces) { // 
Add to SPI services only if spi = true if (includeSpi) { @@ -216,17 +243,33 @@ public class TikaComponentProcessor extends AbstractProcessor { * Returns null if contextKey is void.class (the default). */ private String getContextKeyFromAnnotation(TypeElement element) { + return getClassAttributeFromAnnotation(element, "contextKey"); + } + + /** + * Gets the defaultFor value from the annotation using the mirror API. + * Returns null if defaultFor is void.class (the default). + */ + private String getDefaultForFromAnnotation(TypeElement element) { + return getClassAttributeFromAnnotation(element, "defaultFor"); + } + + /** + * Gets a Class-typed attribute value from the annotation using the mirror API. + * Returns null if the attribute is void.class (the default). + */ + private String getClassAttributeFromAnnotation(TypeElement element, String attributeName) { for (AnnotationMirror mirror : element.getAnnotationMirrors()) { DeclaredType annotationType = mirror.getAnnotationType(); if (annotationType.toString().equals(TikaComponent.class.getName())) { for (Map.Entry<? extends ExecutableElement, ? 
extends AnnotationValue> entry : mirror.getElementValues().entrySet()) { - if (entry.getKey().getSimpleName().toString().equals("contextKey")) { + if (entry.getKey().getSimpleName().toString().equals(attributeName)) { // The value is a TypeMirror for Class types Object value = entry.getValue().getValue(); if (value instanceof TypeMirror) { String typeName = value.toString(); - // void.class is the default, meaning "auto-detect" + // void.class is the default, meaning "not specified" if (!"void".equals(typeName) && !"java.lang.Void".equals(typeName)) { return typeName; } @@ -244,15 +287,30 @@ public class TikaComponentProcessor extends AbstractProcessor { private List<String> findServiceInterfaces(TypeElement element) { List<String> result = new ArrayList<>(); Set<String> visited = new LinkedHashSet<>(); - findServiceInterfacesRecursive(element.asType(), result, visited); + findInterfacesRecursive(element.asType(), result, visited, SERVICE_INTERFACES.keySet()); + return result; + } + + /** + * Finds all parse-context interfaces implemented by the given type element. + */ + private List<String> findParseContextInterfaces(TypeElement element) { + List<String> result = new ArrayList<>(); + Set<String> visited = new LinkedHashSet<>(); + findInterfacesRecursive(element.asType(), result, visited, PARSE_CONTEXT_INTERFACES); return result; } /** - * Recursively searches for service interfaces in the type hierarchy. + * Recursively searches for interfaces in the type hierarchy. 
+ * + * @param type the type to search from + * @param result list to add found interfaces to + * @param visited set of already visited types (to avoid infinite loops) + * @param targetInterfaces the set of interface names to look for */ - private void findServiceInterfacesRecursive(TypeMirror type, List<String> result, - Set<String> visited) { + private void findInterfacesRecursive(TypeMirror type, List<String> result, + Set<String> visited, Set<String> targetInterfaces) { if (type == null || !(type instanceof DeclaredType)) { return; } @@ -266,8 +324,8 @@ public class TikaComponentProcessor extends AbstractProcessor { return; } - // Check if this is a service interface - if (SERVICE_INTERFACES.containsKey(typeName)) { + // Check if this is a target interface + if (targetInterfaces.contains(typeName)) { if (!result.contains(typeName)) { result.add(typeName); } @@ -275,11 +333,11 @@ public class TikaComponentProcessor extends AbstractProcessor { // Check superclass TypeMirror superclass = typeElement.getSuperclass(); - findServiceInterfacesRecursive(superclass, result, visited); + findInterfacesRecursive(superclass, result, visited, targetInterfaces); // Check interfaces for (TypeMirror interfaceType : typeElement.getInterfaces()) { - findServiceInterfacesRecursive(interfaceType, result, visited); + findInterfacesRecursive(interfaceType, result, visited, targetInterfaces); } } diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java index 69e42570b3..cce8466e20 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/config/TikaComponent.java @@ -58,6 +58,11 @@ import java.lang.annotation.Target; * public class MyFilter implements MetadataFilter, AnotherInterface { * // explicit ParseContext key when class implements multiple known interfaces * } 
+ * + * {@code @TikaComponent(defaultFor = ContentHandlerFactory.class)} + * public class BasicContentHandlerFactory implements ContentHandlerFactory { + * // marks this as the default implementation for ContentHandlerFactory + * } * </pre> * * @since 3.1.0 @@ -106,4 +111,24 @@ public @interface TikaComponent { * @return the class to use as ParseContext key, or void.class for auto-detection */ Class<?> contextKey() default void.class; + + /** + * Marks this component as the default implementation for the specified interface. + * <p> + * When set, this component will be used as the default when loading a ParseContext + * with defaults (via {@code loadParseContextWithDefaults()}) and no explicit + * configuration is provided for the interface. + * <p> + * The specified class should be an interface that this component implements. + * For example: + * <pre> + * {@code @TikaComponent(defaultFor = ContentHandlerFactory.class)} + * public class BasicContentHandlerFactory implements ContentHandlerFactory { + * // This will be instantiated by default when no ContentHandlerFactory is configured + * } + * </pre> + * + * @return the interface this component is the default for, or void.class if not a default + */ + Class<?> defaultFor() default void.class; } diff --git a/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java b/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java index 104e0fe8a0..74cde0dca1 100644 --- a/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java +++ b/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java @@ -54,7 +54,7 @@ import org.apache.tika.parser.ParseContext; * Example configuration: * <pre> * { - * "other-configs": { + * "parse-context": { * "embedded-limits": { * "maxDepth": 10, * "throwOnMaxDepth": false, diff --git a/tika-core/src/main/java/org/apache/tika/config/OutputLimits.java b/tika-core/src/main/java/org/apache/tika/config/OutputLimits.java index d33bc87432..4069b019b1 100644 --- 
a/tika-core/src/main/java/org/apache/tika/config/OutputLimits.java +++ b/tika-core/src/main/java/org/apache/tika/config/OutputLimits.java @@ -46,7 +46,7 @@ import org.apache.tika.parser.ParseContext; * Example configuration: * <pre> * { - * "other-configs": { + * "parse-context": { * "output-limits": { * "writeLimit": 100000, * "throwOnWriteLimit": false, diff --git a/tika-core/src/main/java/org/apache/tika/config/TimeoutLimits.java b/tika-core/src/main/java/org/apache/tika/config/TimeoutLimits.java index b43b98360c..c0dffbc423 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TimeoutLimits.java +++ b/tika-core/src/main/java/org/apache/tika/config/TimeoutLimits.java @@ -43,7 +43,7 @@ import org.apache.tika.parser.ParseContext; * Example configuration: * <pre> * { - * "other-configs": { + * "parse-context": { * "timeout-limits": { * "taskTimeoutMillis": 120000 * } diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java index a06d8393cc..bbcd6b1242 100644 --- a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java +++ b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java @@ -33,14 +33,12 @@ import org.apache.tika.parser.ParseContext; * Utility class for computing digests on streams. * <p> * The DigesterFactory is retrieved from ParseContext. 
Configure it via - * the "other-configs" section in tika-config.json: + * the "parse-context" section in tika-config.json: * <pre> - * "other-configs": { - * "digester-factory": { - * "commons-digester-factory": { - * "digests": [{ "algorithm": "SHA256" }], - * "skipContainerDocumentDigest": true - * } + * "parse-context": { + * "commons-digester-factory": { + * "digests": [{ "algorithm": "SHA256" }], + * "skipContainerDocumentDigest": true * } * } * </pre> diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java index 0c35d33c01..0a2fc05600 100644 --- a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java @@ -21,22 +21,20 @@ package org.apache.tika.digest; * Implementations should be annotated with {@code @TikaComponent} and * provide bean properties for configuration (e.g., digests). * <p> - * Configure this factory in the "other-configs" section of tika-config.json. + * Configure this factory in the "parse-context" section of tika-config.json. * The factory is loaded into the ParseContext and used by AutoDetectParser * during parsing to compute digests. 
* <p> * Example JSON configuration: * <pre> * { - * "other-configs": { - * "digester-factory": { - * "commons-digester-factory": { - * "digests": [ - * { "algorithm": "MD5" }, - * { "algorithm": "SHA256", "encoding": "BASE32" } - * ], - * "skipContainerDocumentDigest": true - * } + * "parse-context": { + * "commons-digester-factory": { + * "digests": [ + * { "algorithm": "MD5" }, + * { "algorithm": "SHA256", "encoding": "BASE32" } + * ], + * "skipContainerDocumentDigest": true * } * } * } diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 752c0c2e35..5205010f55 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -146,7 +146,7 @@ public class AutoDetectParser extends CompositeParser { public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Compute digests before type detection if configured - // DigesterFactory is retrieved from ParseContext (configured via other-configs) + // DigesterFactory is retrieved from ParseContext (configured via parse-context) DigestHelper.maybeDigest(tis, metadata, context); // Automatically detect the MIME type of the document diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 4d62e1fdac..16195b9de5 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -36,7 +36,7 @@ import org.apache.tika.parser.ParseContext; * Implements {@link StreamingContentHandlerFactory} to support both in-memory * content extraction and streaming output to an OutputStream. 
*/ -@TikaComponent +@TikaComponent(defaultFor = ContentHandlerFactory.class) public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter { private HANDLER_TYPE type = HANDLER_TYPE.TEXT; diff --git a/tika-core/src/main/java/org/apache/tika/sax/SAXOutputConfig.java b/tika-core/src/main/java/org/apache/tika/sax/SAXOutputConfig.java index b85e1b1fd8..e748b89c22 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SAXOutputConfig.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SAXOutputConfig.java @@ -18,6 +18,8 @@ package org.apache.tika.sax; import java.io.Serializable; +import org.apache.tika.config.TikaComponent; + /** * Configuration for SAX output behavior. * <p> @@ -25,6 +27,7 @@ import java.io.Serializable; * how content handlers and embedded document extractors generate output. * </p> */ +@TikaComponent(spi = false) public class SAXOutputConfig implements Serializable { private static final long serialVersionUID = 1L; diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index a21c32a77e..2885ff0020 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -105,7 +105,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath); - // Load PipesConfig directly from root level (not from "other-configs") + // Load PipesConfig directly from root level (not from "parse-context") pipesConfig = tikaJsonConfig.deserialize("pipes", PipesConfig.class); if (pipesConfig == null) { pipesConfig = new PipesConfig(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java index d62e38e843..b5f4bf4e87 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java @@ -32,10 +32,10 @@ import org.apache.tika.digest.DigesterFactory; * BouncyCastle supports additional algorithms beyond the standard Java ones, * such as SHA3-256, SHA3-384, SHA3-512. * <p> - * Example JSON configuration (in other-configs section): + * Example JSON configuration (in parse-context section): * <pre> * { - * "other-configs": { + * "parse-context": { * "digester-factory": { * "bouncy-castle-digester-factory": { * "digests": [ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java index 5c0c81a54d..9f8399a52b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java @@ -29,10 +29,10 @@ import org.apache.tika.digest.DigesterFactory; * <p> * Default: MD5 with HEX encoding. 
* <p> - * Example JSON configuration (in other-configs section): + * Example JSON configuration (in parse-context section): * <pre> * { - * "other-configs": { + * "parse-context": { * "digester-factory": { * "commons-digester-factory": { * "digests": [ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index 0049b03f1f..1e21fbef25 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -31,7 +31,6 @@ import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; public class AutoDetectParserConfigTest extends TikaTest { @@ -84,9 +83,7 @@ public class AutoDetectParserConfigTest extends TikaTest { public void testWriteFilter() throws Exception { TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-write-filter.json"); Parser p = loader.loadAutoDetectParser(); - MetadataWriteLimiterFactory factory = loader.configs().load(MetadataWriteLimiterFactory.class); - ParseContext parseContext = new ParseContext(); - parseContext.set(MetadataWriteLimiterFactory.class, factory); + ParseContext parseContext = loader.loadParseContext(); Metadata metadata = Metadata.newInstance(parseContext); List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p, metadata, parseContext, true); diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java index e5fa61735d..267c677716 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java @@ -105,7 +105,7 @@ public class DigestConfigTest extends TikaTest { @Test public void testCommonsDigesterSkipContainer() throws Exception { - // Tests skipContainerDocumentDigest on the factory (configured in other-configs) + // Tests skipContainerDocumentDigest on the factory (configured in parse-context) TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json"); Parser p = loader.loadAutoDetectParser(); ParseContext context = loader.loadParseContext(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java index 52904b6589..8b25189d89 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java @@ -35,7 +35,7 @@ import org.apache.tika.parser.digestutils.CommonsDigesterFactory; /** * Tests for SkipContainerDocumentDigest functionality with MockParser and embedded documents. - * DigesterFactory is now configured via ParseContext (via other-configs in JSON). 
+ * DigesterFactory is now configured via ParseContext (via parse-context in JSON). */ public class SkipContainerDocumentDigestTest extends TikaTest { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json index f87df79434..e5a9c850b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json @@ -2,13 +2,11 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "SHA256" } ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json index 25ffe85de7..e1ca5547cf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json @@ -2,13 +2,11 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "bouncy-castle-digester-factory": { + "parse-context": { + "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "SHA1", "encoding": "BASE32" } ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json index 9098c8607f..78e69166de 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json @@ -2,9 +2,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "bouncy-castle-digester-factory": { + "parse-context": { + "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "MD2" }, { "algorithm": "MD5" }, @@ -15,5 +14,4 @@ ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json index c8d4c29aa5..3ea4238c33 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json @@ -2,9 +2,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "bouncy-castle-digester-factory": { + "parse-context": { + "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA256" }, @@ -14,5 +13,4 @@ ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json index b0ae0326e9..6bbdb3f70f 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json @@ -2,9 +2,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD2" }, { "algorithm": "MD5" }, @@ -15,5 +14,4 @@ ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index b14d19509b..6753b80a2a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -11,14 +11,12 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json index 23186720bc..30115de6f8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json @@ -2,9 +2,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } @@ -12,5 +11,4 @@ "skipContainerDocumentDigest": true } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json index 0671621fc0..95cf6dd5fa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json @@ -2,14 +2,12 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "SHA256", "encoding": "BASE32" }, - { "algorithm": "MD5" } - ] - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "SHA256", "encoding": "BASE32" }, + { "algorithm": "MD5" } + ] } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json index 3aa9e04375..7d922943b5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json @@ -1,11 +1,9 @@ { - "other-configs": { - 
"digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" } ] } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json index 33fcd5ffd7..feaa6f4494 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json @@ -1,9 +1,7 @@ { - "other-configs": { - "embedded-document-extractor-factory": { - "standard-extractor-factory": { + "parse-context": { + "standard-extractor-factory": { "writeFileNameToContent": false } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index 8e9b5b6012..66f81f80a7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@ -3,14 +3,10 @@ "contentHandlerDecoratorFactory": "upcasing-content-handler-decorator-factory", "throwOnZeroBytes": true }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": {} - }, - "embedded-document-extractor-factory": { - "standard-extractor-factory": { - "writeFileNameToContent": true - } + "parse-context": { + "commons-digester-factory": {}, + "standard-extractor-factory": { + "writeFileNameToContent": true } } } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json index 28f542245b..721ee36e35 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json @@ -1,9 +1,7 @@ { - "other-configs": { - "embedded-document-extractor-factory": { - "standard-extractor-factory": { + "parse-context": { + "standard-extractor-factory": { "writeFileNameToContent": true } } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index 3179f5aceb..1d96edb631 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -2,23 +2,19 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "SHA256", "encoding": "BASE32" }, - { "algorithm": "MD5" } - ], - "skipContainerDocumentDigest": true - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "SHA256", "encoding": "BASE32" }, + { "algorithm": "MD5" } + ], + "skipContainerDocumentDigest": true }, - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "includeFields": [ - "X-TIKA:content", - "dc:creator" - ] - } + "standard-metadata-limiter-factory": { + "includeFields": [ + "X-TIKA:content", + 
"dc:creator" + ] } } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 6e86502d2b..0b91c7a458 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -122,7 +122,7 @@ class ParseHandler { private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, ParseContext parseContext) { - // Get DigesterFactory from ParseContext (configured via other-configs) + // Get DigesterFactory from ParseContext (configured via parse-context) DigesterFactory digesterFactory = parseContext.get(DigesterFactory.class); if (digesterFactory != null && !digesterFactory.isSkipContainerDocumentDigest()) { try { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index d5a6c72497..4fb69bd2c3 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -180,7 +180,7 @@ public class PipesServer implements AutoCloseable { MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters(); ContentHandlerFactory contentHandlerFactory = tikaLoader.loadContentHandlerFactory(); - MetadataWriteLimiterFactory metadataWriteLimiterFactory = tikaLoader.configs().load(MetadataWriteLimiterFactory.class); + MetadataWriteLimiterFactory metadataWriteLimiterFactory = tikaLoader.loadParseContext().get(MetadataWriteLimiterFactory.class); PipesServer pipesServer = new PipesServer(pipesClientId, tikaLoader, pipesConfig, socket, dis, dos, metadataFilter, contentHandlerFactory, metadataWriteLimiterFactory); 
pipesServer.initializeResources(); LOG.debug("pipesClientId={}: PipesServer loaded and ready", pipesClientId); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-4533.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-4533.json index b741ae8921..0f9e359070 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-4533.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-4533.json @@ -2,18 +2,16 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { + "parse-context": { "output-limits": { "zipBombRatio": 100, "maxXmlDepth": 100, "maxPackageEntryDepth": 100 }, - "digester-factory": { - "commons-digester-factory": { + "commons-digester-factory": { "digests": [ { "algorithm": "SHA256" } ] } } - } } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json index fd6bfa852c..0a8d40b1fa 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json @@ -46,10 +46,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "mock-digester-factory": {} - } - }, + "parse-context": { + "mock-digester-factory": {} + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json index c55fd2a026..3cbbab6950 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json @@ -45,10 +45,8 @@ "auto-detect-parser": { 
"throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "mock-digester-factory": {} - } - }, + "parse-context": { + "mock-digester-factory": {} + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index d8acd13939..88b4cc1978 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@ -46,15 +46,11 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "mock-digester-factory": {} - }, - "embedded-document-extractor-factory": { - "runpack-extractor-factory": { - "writeFileNameToContent": false, - "maxEmbeddedBytesForExtraction": 10 - } + "parse-context": { + "mock-digester-factory": {}, + "runpack-extractor-factory": { + "writeFileNameToContent": false, + "maxEmbeddedBytesForExtraction": 10 } }, "plugin-roots": "PLUGINS_PATHS" diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json index a7549f9385..2dd4c0c31b 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json @@ -42,10 +42,8 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "mock-digester-factory": {} - } - }, + "parse-context": { + "mock-digester-factory": {} + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json 
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json index 0e2a8e85ab..dbfbcd4f31 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json @@ -46,15 +46,13 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "includeFields": ["dc:creator", "Content-Type", "X-TIKA:content"], - "maxKeySize": 100, - "maxFieldSize": 1000, - "maxTotalBytes": 10000, - "maxValuesPerField": 5 - } + "parse-context": { + "standard-metadata-limiter-factory": { + "includeFields": ["dc:creator", "Content-Type", "X-TIKA:content"], + "maxKeySize": 100, + "maxFieldSize": 1000, + "maxTotalBytes": 10000, + "maxValuesPerField": 5 } }, "plugin-roots": "PLUGINS_PATHS" diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java index fa96517aa5..5dbad73518 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentInfo.java @@ -24,16 +24,25 @@ package org.apache.tika.config.loader; * (reads its own config from ParseContext's jsonConfigs) * @param contextKey the class to use as the key when adding to ParseContext, * or null to auto-detect based on known interfaces + * @param isDefault whether this component is the default implementation for its contextKey */ public record ComponentInfo( Class<?> componentClass, boolean selfConfiguring, - Class<?> contextKey + Class<?> contextKey, + boolean isDefault ) { /** - * Creates a ComponentInfo with no explicit context key (auto-detect). + * Creates a ComponentInfo with no explicit context key (auto-detect) and not default. 
*/ public ComponentInfo(Class<?> componentClass, boolean selfConfiguring) { - this(componentClass, selfConfiguring, null); + this(componentClass, selfConfiguring, null, false); + } + + /** + * Creates a ComponentInfo with explicit context key but not default. + */ + public ComponentInfo(Class<?> componentClass, boolean selfConfiguring, Class<?> contextKey) { + this(componentClass, selfConfiguring, contextKey, false); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java index cbd9b932b8..576dc2e88e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -122,6 +122,21 @@ public class ComponentRegistry { return Collections.unmodifiableMap(components); } + /** + * Returns all components marked as defaults. + * + * @return unmodifiable map of component names to component info for default implementations + */ + public Map<String, ComponentInfo> getDefaultComponents() { + Map<String, ComponentInfo> defaults = new LinkedHashMap<>(); + for (Map.Entry<String, ComponentInfo> entry : components.entrySet()) { + if (entry.getValue().isDefault()) { + defaults.put(entry.getKey(), entry.getValue()); + } + } + return Collections.unmodifiableMap(defaults); + } + /** * Checks if a component with the given name is registered. 
* @@ -201,20 +216,28 @@ public class ComponentRegistry { ": name or class is empty"); } - // Parse value: className or className:key=contextKeyClass + // Parse value: className or className:key=contextKeyClass[:default] String className = value; String contextKeyClassName = null; + boolean isDefault = false; + // Parse suffixes (e.g., :key=SomeClass:default) int colonIndex = value.indexOf(':'); if (colonIndex != -1) { className = value.substring(0, colonIndex); - String suffix = value.substring(colonIndex + 1); - if (suffix.startsWith("key=")) { - contextKeyClassName = suffix.substring(4); - } else { - throw new TikaConfigException( - "Invalid index file format at " + url + " line " + lineNumber + - ": unknown suffix '" + suffix + "', expected 'key=...'"); + String suffixes = value.substring(colonIndex + 1); + + // Parse each colon-separated suffix + for (String suffix : suffixes.split(":")) { + if (suffix.startsWith("key=")) { + contextKeyClassName = suffix.substring(4); + } else if (suffix.equals("default")) { + isDefault = true; + } else if (!suffix.isEmpty()) { + throw new TikaConfigException( + "Invalid index file format at " + url + " line " + lineNumber + + ": unknown suffix '" + suffix + "', expected 'key=...' 
or 'default'"); + } } } @@ -235,7 +258,7 @@ public class ComponentRegistry { } } - result.put(name, new ComponentInfo(clazz, selfConfiguring, contextKey)); + result.put(name, new ComponentInfo(clazz, selfConfiguring, contextKey, isDefault)); } catch (ClassNotFoundException e) { throw new TikaConfigException( "Component class not found: " + className + " (from " + url + ")", e); diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java index e262bd6412..1b494b398a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ConfigLoader.java @@ -26,11 +26,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.exception.TikaConfigException; /** - * Loader for custom configuration objects from the "other-configs" section. + * Loader for configuration objects from the "parse-context" section. * <p> - * This class handles custom POJOs and test configurations that are not part of - * Tika's official configuration schema. All configurations loaded via ConfigLoader - * must be placed under the "other-configs" top-level node in the JSON. + * This class handles ParseContext components and configuration POJOs that are loaded + * into a ParseContext for use during parsing. All configurations loaded via ConfigLoader + * must be placed under the "parse-context" top-level node in the JSON. * <p> * For official Tika components and configurations (parsers, detectors, async, server, etc.), * use the specific methods on {@link TikaLoader} or load directly from {@link TikaJsonConfig}. 
@@ -55,14 +55,17 @@ import org.apache.tika.exception.TikaConfigException; * "pipes": {...}, * "server": {...}, * - * // Custom configs MUST be in "other-configs" (loaded via configs()) - * "other-configs": { - * "my-config": { - * "timeout": 5000, - * "retries": 3 + * // ParseContext configs in "parse-context" (loaded via configs()) + * "parse-context": { + * "embedded-limits": { + * "maxDepth": 10, + * "maxCount": 1000 * }, - * "my-custom-config": { - * "enabled": true + * "output-limits": { + * "writeLimit": 100000 + * }, + * "commons-digester-factory": { + * "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA256" } ] * } * } * } @@ -277,16 +280,16 @@ public class ConfigLoader { } /** - * Gets a node by key from the "other-configs". + * Gets a node by key from the "parse-context" section. * * @param key The JSON key to look for * @return the node, or null if not found */ private JsonNode getNode(String key) { - JsonNode otherConfigs = config.getRootNode().get("other-configs"); - if (otherConfigs != null && otherConfigs.isObject()) { - return otherConfigs.get(key); + JsonNode parseContext = config.getRootNode().get("parse-context"); + if (parseContext != null && parseContext.isObject()) { + return parseContext.get(key); } return null; diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index df536e159f..344faa66c7 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -343,12 +343,10 @@ public class TikaJsonConfig { } /** - * Validates that all top-level configuration keys are known or custom extensions. + * Validates that all top-level configuration keys are known. * <p> * This catches typos like "parser" instead of "parsers" or "pipes-reporter" * instead of "pipes-reporters".
- * <p> - * The "other-configs" node is allowed for custom configurations. * * @throws TikaConfigException if unknown keys are found */ @@ -363,11 +361,6 @@ public class TikaJsonConfig { while (fieldNames.hasNext()) { String key = fieldNames.next(); - // Ignore custom configs node - if (key.equals("other-configs")) { - continue; - } - // Must be a known key if (!KNOWN_KEYS.contains(key)) { unknownKeys.add(key); @@ -377,8 +370,7 @@ public class TikaJsonConfig { if (!unknownKeys.isEmpty()) { throw new TikaConfigException( "Unknown configuration key(s): " + unknownKeys + ". " + - "Valid keys: " + KNOWN_KEYS + " " + - "(or use 'other-configs' node for custom keys)"); + "Valid keys: " + KNOWN_KEYS); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 0277a82e85..44c3c2b4a5 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import com.fasterxml.jackson.core.StreamReadConstraints; @@ -31,23 +32,17 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; -import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.config.GlobalSettings; -import org.apache.tika.config.OutputLimits; -import org.apache.tika.config.TimeoutLimits; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.digest.DigesterFactory; import org.apache.tika.exception.TikaConfigException; -import 
org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; -import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.AutoDetectParser; @@ -59,7 +54,6 @@ import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; -import org.apache.tika.sax.SAXOutputConfig; import org.apache.tika.serialization.ComponentConfig; import org.apache.tika.serialization.ComponentNameResolver; import org.apache.tika.serialization.JsonMetadata; @@ -374,7 +368,7 @@ public class TikaLoader { */ public synchronized Parser loadAutoDetectParser() throws TikaConfigException, IOException { if (autoDetectParser == null) { - // Load directly from root-level config (not via configs() which only looks in "other-configs") + // Load directly from root-level config (not via configs() which only looks in "parse-context") AutoDetectParserConfig adpConfig = loadAutoDetectParserConfig(); if (adpConfig == null) { adpConfig = new AutoDetectParserConfig(); @@ -385,13 +379,10 @@ public class TikaLoader { } /** - * Loads and returns a ParseContext populated with components from the "other-configs" section. + * Loads and returns a ParseContext populated with components from the "parse-context" section. 
* <p> - * This method loads components that should be passed via ParseContext, such as: - * <ul> - * <li>DigesterFactory (from "digester-factory")</li> - * <li>MetadataWriteLimiterFactory (from "metadata-write-limiter-factory")</li> - * </ul> + * This method only loads explicitly configured items from the JSON configuration. + * For loading with defaults for missing items, use {@link #loadParseContextWithDefaults()}. * <p> * Use this method when you need a pre-configured ParseContext for parsing operations. * @@ -408,22 +399,88 @@ public class TikaLoader { * @throws TikaConfigException if loading fails */ public ParseContext loadParseContext() throws TikaConfigException { - ParseContext context = new ParseContext(); - loadOne(DigesterFactory.class, context); - loadOne(MetadataWriteLimiterFactory.class, context); - loadOne(EmbeddedDocumentExtractorFactory.class, context); - loadOne(EmbeddedLimits.class, context); - loadOne(OutputLimits.class, context); - loadOne(TimeoutLimits.class, context); - loadOne(SAXOutputConfig.class, context); - return context; + return loadParseContextInternal(false); } - private <T> void loadOne(Class<T> clazz, ParseContext context) throws TikaConfigException { - T instnce = configs().load(clazz); - if (instnce != null) { - context.set(clazz, instnce); + /** + * Loads and returns a ParseContext populated with components from the "parse-context" section, + * plus default implementations for any missing items. + * <p> + * This method loads explicitly configured items from JSON, then instantiates + * default implementations (marked with {@code @TikaComponent(defaultFor=...)}) + * for any interface that wasn't explicitly configured. 
+ * + * @return a ParseContext populated with configured and default components + * @throws TikaConfigException if loading fails + */ + public ParseContext loadParseContextWithDefaults() throws TikaConfigException { + return loadParseContextInternal(true); + } + + /** + * Internal method to load ParseContext with optional defaults. + * + * @param includeDefaults whether to include default implementations for missing items + * @return a ParseContext populated with components + * @throws TikaConfigException if loading fails + */ + private ParseContext loadParseContextInternal(boolean includeDefaults) throws TikaConfigException { + ParseContext context = new ParseContext(); + Set<Class<?>> configuredKeys = new HashSet<>(); + + // Load the component registry for parse-context + ComponentRegistry registry; + try { + registry = new ComponentRegistry("parse-context", classLoader); + } catch (TikaConfigException e) { + // parse-context.idx might not exist yet (e.g., first build) + // In that case, just return an empty context + return context; } + + // Load explicitly configured items from JSON + JsonNode parseContextNode = config.getRootNode().get("parse-context"); + if (parseContextNode != null && parseContextNode.isObject()) { + java.util.Iterator<String> fieldNames = parseContextNode.fieldNames(); + while (fieldNames.hasNext()) { + String key = fieldNames.next(); + JsonNode valueNode = parseContextNode.get(key); + + try { + ComponentInfo info = registry.getComponentInfo(key); + Class<?> targetClass = info.componentClass(); + Class<?> contextKey = info.contextKey() != null ? 
info.contextKey() : targetClass; + + Object instance = objectMapper.treeToValue(valueNode, targetClass); + context.set((Class<Object>) contextKey, instance); + configuredKeys.add(contextKey); + } catch (TikaConfigException e) { + throw new TikaConfigException("Failed to load parse-context item: " + key, e); + } catch (Exception e) { + throw new TikaConfigException("Failed to deserialize parse-context item: " + key, e); + } + } + } + + // Add defaults for missing items (if requested) + if (includeDefaults) { + for (Map.Entry<String, ComponentInfo> entry : registry.getDefaultComponents().entrySet()) { + ComponentInfo info = entry.getValue(); + Class<?> contextKey = info.contextKey() != null ? info.contextKey() : info.componentClass(); + + if (!configuredKeys.contains(contextKey)) { + try { + Object instance = info.componentClass().getDeclaredConstructor().newInstance(); + context.set((Class<Object>) contextKey, instance); + } catch (ReflectiveOperationException e) { + throw new TikaConfigException( + "Failed to instantiate default component: " + info.componentClass().getName(), e); + } + } + } + } + + return context; } /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java index e65eee07b9..e832dc8d4b 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -50,7 +50,7 @@ public class TikaObjectMapperFactory { "translators", "digester-factories", "content-handler-factories", - "other-configs" + "parse-context" }; private static ObjectMapper MAPPER = null; diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java index 10fe4ac927..30826befa5 100644 
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -101,7 +101,7 @@ public class ParseContextUtils { } // First, process known array configs (e.g., "metadata-filters") - // These don't depend on the other-configs registry + // These don't depend on the parse-context registry for (String friendlyName : new ArrayList<>(jsonConfigs.keySet())) { if (ARRAY_CONFIGS.containsKey(friendlyName)) { JsonConfig jsonConfig = jsonConfigs.get(friendlyName); @@ -112,7 +112,7 @@ public class ParseContextUtils { } // Then, try to resolve single component configs using ComponentNameResolver - // This searches all registered component registries, not just "other-configs" + // This searches all registered component registries, not just "parse-context" for (Map.Entry<String, JsonConfig> entry : jsonConfigs.entrySet()) { String friendlyName = entry.getKey(); JsonConfig jsonConfig = entry.getValue(); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java index 6ab03f828a..2dd10b532f 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java @@ -31,13 +31,13 @@ import org.apache.tika.parser.ParseContext; * Tests loading all limit configurations from a single tika-config.json file. * <p> * This test demonstrates how to configure all limits in one place using - * the "other-configs" section of the JSON configuration. + * the "parse-context" section of the JSON configuration. 
* <p> * Configuration file: configs/all-limits-test.json * <pre> * { * "parsers": ["default-parser"], - * "other-configs": { + * "parse-context": { * "embedded-limits": { * "maxDepth": 10, * "throwOnMaxDepth": false, @@ -55,13 +55,11 @@ import org.apache.tika.parser.ParseContext; * "timeout-limits": { * "taskTimeoutMillis": 60000 * }, - * "metadata-write-limiter-factory": { - * "standard-metadata-limiter-factory": { - * "maxTotalBytes": 1048576, - * "maxFieldSize": 102400, - * "maxKeySize": 1024, - * "maxValuesPerField": 100 - * } + * "standard-metadata-limiter-factory": { + * "maxTotalBytes": 1048576, + * "maxFieldSize": 102400, + * "maxKeySize": 1024, + * "maxValuesPerField": 100 * } * } * } diff --git a/tika-serialization/src/test/java/org/apache/tika/metadata/writefilter/StandardMetadataLimiterTest.java b/tika-serialization/src/test/java/org/apache/tika/metadata/writefilter/StandardMetadataLimiterTest.java index c23e2b1045..5edfc80ea2 100644 --- a/tika-serialization/src/test/java/org/apache/tika/metadata/writefilter/StandardMetadataLimiterTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/metadata/writefilter/StandardMetadataLimiterTest.java @@ -51,7 +51,8 @@ public class StandardMetadataLimiterTest extends TikaTest { public void testMetadataFactoryConfig() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3695.json")); AutoDetectParser parser = (AutoDetectParser) loader.loadAutoDetectParser(); - MetadataWriteLimiterFactory factory = loader.configs().load(MetadataWriteLimiterFactory.class); + ParseContext context = loader.loadParseContext(); + MetadataWriteLimiterFactory factory = context.get(MetadataWriteLimiterFactory.class); assertEquals(330, ((StandardMetadataLimiterFactory) factory).getMaxTotalBytes()); assertFalse(((StandardMetadataLimiterFactory) factory).getIncludeFields().isEmpty(), "includeFields should not be empty"); @@ -84,7 +85,8 @@ public class StandardMetadataLimiterTest extends TikaTest { 
public void testMetadataFactoryFieldsConfig() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3695-fields.json")); AutoDetectParser parser = (AutoDetectParser) loader.loadAutoDetectParser(); - MetadataWriteLimiterFactory factory = loader.configs().load(MetadataWriteLimiterFactory.class); + ParseContext context = loader.loadParseContext(); + MetadataWriteLimiterFactory factory = context.get(MetadataWriteLimiterFactory.class); assertEquals(421, ((StandardMetadataLimiterFactory) factory).getMaxTotalBytes()); assertEquals(999, ((StandardMetadataLimiterFactory) factory).getMaxKeySize()); assertEquals(10001, ((StandardMetadataLimiterFactory) factory).getMaxFieldSize()); @@ -285,9 +287,7 @@ public class StandardMetadataLimiterTest extends TikaTest { public void testExclude() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3695-exclude.json")); Parser parser = loader.loadAutoDetectParser(); - MetadataWriteLimiterFactory factory = loader.configs().load(MetadataWriteLimiterFactory.class); - ParseContext parseContext = new ParseContext(); - parseContext.set(MetadataWriteLimiterFactory.class, factory); + ParseContext parseContext = loader.loadParseContext(); String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>"; mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>"; diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index 1df1567073..dcd99f1b01 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -349,7 +349,7 @@ public class TestParseContextSerialization { * Test that BasicContentHandlerFactory can be configured via JSON, 
serialized, * deserialized, and resolved via ParseContextUtils.resolveAll(). * This verifies the fix for TIKA-4582 where ContentHandlerFactory was not being - * resolved because it wasn't in the "other-configs" registry. + * resolved because it wasn't in the "parse-context" registry. */ @Test public void testContentHandlerFactoryRoundTrip() throws Exception { diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json index 8dfe51784b..10d5ed9a37 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json @@ -2,11 +2,9 @@ "parsers": [ "default-parser" ], - "other-configs": { - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "excludeFields": ["subject"] - } + "parse-context": { + "standard-metadata-limiter-factory": { + "excludeFields": ["subject"] } } } diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json index 7e408b8aba..4216706846 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json @@ -2,17 +2,15 @@ "parsers": [ "default-parser" ], - "other-configs": { - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "maxKeySize": 999, - "maxFieldSize": 10001, - // maxTotalBytes accounts for ~244 bytes of ALWAYS_SET_FIELDS overhead - // (Content-Type, X-TIKA:Parsed-By, etc.) 
plus room for dc:title and 3 dc:creator values - "maxTotalBytes": 421, - "maxValuesPerField": 100, - "includeFields": ["dc:creator", "dc:title"] - } + "parse-context": { + "standard-metadata-limiter-factory": { + "maxKeySize": 999, + "maxFieldSize": 10001, + // maxTotalBytes accounts for ~244 bytes of ALWAYS_SET_FIELDS overhead + // (Content-Type, X-TIKA:Parsed-By, etc.) plus room for dc:title and 3 dc:creator values + "maxTotalBytes": 421, + "maxValuesPerField": 100, + "includeFields": ["dc:creator", "dc:title"] } } } diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695.json b/tika-serialization/src/test/resources/configs/TIKA-3695.json index ef95f8003b..a4b785fbec 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695.json @@ -2,14 +2,12 @@ "parsers": [ "default-parser" ], - "other-configs": { - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - // maxTotalBytes accounts for ~244 bytes of ALWAYS_SET_FIELDS overhead - // (Content-Type, X-TIKA:Parsed-By, etc.) plus room for 2 dc:creator values - "maxTotalBytes": 330, - "includeFields": ["dc:creator"] - } + "parse-context": { + "standard-metadata-limiter-factory": { + // maxTotalBytes accounts for ~244 bytes of ALWAYS_SET_FIELDS overhead + // (Content-Type, X-TIKA:Parsed-By, etc.) 
plus room for 2 dc:creator values + "maxTotalBytes": 330, + "includeFields": ["dc:creator"] } } } diff --git a/tika-serialization/src/test/resources/configs/all-limits-test.json b/tika-serialization/src/test/resources/configs/all-limits-test.json index 40faa856ec..daaf6be58f 100644 --- a/tika-serialization/src/test/resources/configs/all-limits-test.json +++ b/tika-serialization/src/test/resources/configs/all-limits-test.json @@ -2,7 +2,7 @@ "parsers": [ "default-parser" ], - "other-configs": { + "parse-context": { "embedded-limits": { "maxDepth": 10, "throwOnMaxDepth": false, @@ -20,13 +20,11 @@ "timeout-limits": { "taskTimeoutMillis": 60000 }, - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "maxTotalBytes": 1048576, - "maxFieldSize": 102400, - "maxKeySize": 1024, - "maxValuesPerField": 100 - } + "standard-metadata-limiter-factory": { + "maxTotalBytes": 1048576, + "maxFieldSize": 102400, + "maxKeySize": 1024, + "maxValuesPerField": 100 } } } diff --git a/tika-serialization/src/test/resources/configs/embedded-limits-test.json b/tika-serialization/src/test/resources/configs/embedded-limits-test.json index 14ce20d2be..a17c722cdb 100644 --- a/tika-serialization/src/test/resources/configs/embedded-limits-test.json +++ b/tika-serialization/src/test/resources/configs/embedded-limits-test.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "embedded-limits": { "maxDepth": 5, "throwOnMaxDepth": true, diff --git a/tika-serialization/src/test/resources/configs/output-limits-test.json b/tika-serialization/src/test/resources/configs/output-limits-test.json index ccd9fa0840..fdf95dbf6c 100644 --- a/tika-serialization/src/test/resources/configs/output-limits-test.json +++ b/tika-serialization/src/test/resources/configs/output-limits-test.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "output-limits": { "writeLimit": 50000, "throwOnWriteLimit": true, diff --git 
a/tika-serialization/src/test/resources/configs/test-config-loader.json b/tika-serialization/src/test/resources/configs/test-config-loader.json index c5c24254eb..2dc8eb30a8 100644 --- a/tika-serialization/src/test/resources/configs/test-config-loader.json +++ b/tika-serialization/src/test/resources/configs/test-config-loader.json @@ -3,7 +3,7 @@ {"pdf-parser": {}} ], - "other-configs": { + "parse-context": { "retry-config": { "timeout": 5000, "retries": 3, diff --git a/tika-serialization/src/test/resources/configs/test-interface-no-type.json b/tika-serialization/src/test/resources/configs/test-interface-no-type.json index da2e606bb5..af20c1dd25 100644 --- a/tika-serialization/src/test/resources/configs/test-interface-no-type.json +++ b/tika-serialization/src/test/resources/configs/test-interface-no-type.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "handler-no-type": { "maxSize": 50000, "prefix": "no-type-" diff --git a/tika-serialization/src/test/resources/configs/test-invalid-class.json b/tika-serialization/src/test/resources/configs/test-invalid-class.json index c927b6e1cc..3c290ddcdc 100644 --- a/tika-serialization/src/test/resources/configs/test-invalid-class.json +++ b/tika-serialization/src/test/resources/configs/test-invalid-class.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "handler": "com.example.NonExistentClass" } } diff --git a/tika-serialization/src/test/resources/configs/test-partial-config.json b/tika-serialization/src/test/resources/configs/test-partial-config.json index 5c5eab6992..50b8144867 100644 --- a/tika-serialization/src/test/resources/configs/test-partial-config.json +++ b/tika-serialization/src/test/resources/configs/test-partial-config.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "retry-config": { "enabled": true }, diff --git a/tika-serialization/src/test/resources/configs/test-unexpected-field.json b/tika-serialization/src/test/resources/configs/test-unexpected-field.json index 
5946b399ea..bddae5db93 100644 --- a/tika-serialization/src/test/resources/configs/test-unexpected-field.json +++ b/tika-serialization/src/test/resources/configs/test-unexpected-field.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "retry-config": { "timeout": 5000, "retries": 3, diff --git a/tika-serialization/src/test/resources/configs/test-wrong-type.json b/tika-serialization/src/test/resources/configs/test-wrong-type.json index ece5fe3aeb..34cc7674a9 100644 --- a/tika-serialization/src/test/resources/configs/test-wrong-type.json +++ b/tika-serialization/src/test/resources/configs/test-wrong-type.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "handler": "java.lang.String" } } diff --git a/tika-serialization/src/test/resources/configs/timeout-limits-test.json b/tika-serialization/src/test/resources/configs/timeout-limits-test.json index edd158b5ba..ac9090e8f2 100644 --- a/tika-serialization/src/test/resources/configs/timeout-limits-test.json +++ b/tika-serialization/src/test/resources/configs/timeout-limits-test.json @@ -1,5 +1,5 @@ { - "other-configs": { + "parse-context": { "timeout-limits": { "taskTimeoutMillis": 120000 } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java index 3a516d7751..6a7e5a5b2d 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java @@ -172,7 +172,7 @@ public class MetadataResource { protected Metadata parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws IOException, TikaConfigException { - // Load default context from config (includes DigesterFactory from other-configs) + // Load default context from config 
(includes DigesterFactory from parse-context) final ParseContext context = TikaResource.createParseContext(); Parser parser = TikaResource.createParser(); fillMetadata(parser, metadata, httpHeaders); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index e3d86f3f3f..57c2d76d92 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -98,11 +98,7 @@ public class TikaResource { TIKA_LOADER = tikaLoader; SERVER_STATUS = serverStatus; PIPES_PARSING_HELPER = pipesParsingHelper; - try { - DEFAULT_METADATA_WRITE_LIMITER_FACTORY = tikaLoader.configs().load(MetadataWriteLimiterFactory.class); - } catch (TikaConfigException e) { - LOG.debug("No MetadataWriteLimiterFactory specified in the config", e); - } + // MetadataWriteLimiterFactory is now loaded dynamically via loadParseContext() } /** @@ -116,7 +112,7 @@ public class TikaResource { /** * Creates a new ParseContext with defaults loaded from tika-config. - * This loads components from "other-configs" such as DigesterFactory and MetadataWriteLimiterFactory. + * This loads components from "parse-context" such as DigesterFactory and MetadataWriteLimiterFactory. 
* * @return a new ParseContext with defaults applied */ diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index 63e0044baa..96035d0f78 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@ -80,13 +80,11 @@ public abstract class CXFTestBase { "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "MD5" } - ] - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" } + ] } } } diff --git a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json index 14e1c6c511..c360dd70c7 100644 --- a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json @@ -47,15 +47,13 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - } - }, + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json index 6c6f2c5df9..40bb34e201 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json @@ -49,15 +49,13 @@ 
"auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - } - }, + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json index dfbcbb8ee4..174b340a78 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json @@ -12,16 +12,14 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - } - }, + }, "fetchers": { "file-system-fetcher": { "file-system-fetcher": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json index 6efc957936..9a6a9779a3 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json @@ -17,16 +17,14 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - } - }, + }, "fetchers": { "file-system-fetcher": { "file-system-fetcher": { diff --git 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json index 05a1cdf374..ef02357640 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json @@ -17,16 +17,14 @@ "auto-detect-parser": { "throwOnZeroBytes": false }, - "other-configs": { - "digester-factory": { - "commons-digester-factory": { + "parse-context": { + "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - } - }, + }, "fetchers": { "file-system-fetcher": { "file-system-fetcher": {
