This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4635 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d85014a1455d0bdce566f39cfa02ea17e2be8b84 Author: tallison <[email protected]> AuthorDate: Tue Jan 27 18:04:56 2026 -0500 TIKA-4635 -- refactor DigesterFactory to be standalone --- .../ROOT/pages/configuration/digesters.adoc | 192 +++++++++++++++++++++ docs/modules/ROOT/pages/configuration/index.adoc | 4 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 18 +- .../java/org/apache/tika/digest/DigestHelper.java | 41 +++-- .../org/apache/tika/digest/DigesterFactory.java | 39 +++-- .../org/apache/tika/parser/AutoDetectParser.java | 6 +- .../apache/tika/parser/AutoDetectParserConfig.java | 81 +-------- .../java/org/apache/tika/parser/ParseContext.java | 27 +++ .../digestutils/BouncyCastleDigesterFactory.java | 27 ++- .../parser/digestutils/CommonsDigesterFactory.java | 27 ++- .../tika/parser/AutoDetectParserConfigTest.java | 25 ++- .../apache/tika/parser/AutoDetectParserTest.java | 6 +- .../tika/parser/RecursiveParserWrapperTest.java | 9 +- .../tika/parser/digest/DigestConfigTest.java | 58 ++++--- .../digest/SkipContainerDocumentDigestTest.java | 92 ++++++---- .../parser/microsoft/ooxml/OOXMLParserTest.java | 6 +- .../src/test/resources/configs/tika-4533.json | 6 +- .../configs/tika-config-bc-digests-base32.json | 8 +- .../configs/tika-config-bc-digests-basic.json | 8 +- .../configs/tika-config-bc-digests-multiple.json | 8 +- .../configs/tika-config-commons-digests-basic.json | 8 +- .../configs/tika-config-digests-pdf-only.json | 8 +- .../tika-config-digests-skip-container.json | 14 +- .../resources/configs/tika-config-digests.json | 10 +- .../resources/configs/tika-config-md5-digest.json | 4 +- ...a-config-upcasing-custom-handler-decorator.json | 9 +- .../configs/tika-config-write-filter.json | 21 +-- .../apache/tika/pipes/core/server/EmitHandler.java | 9 +- .../tika/pipes/core/server/FetchHandler.java | 5 +- .../tika/pipes/core/server/ParseHandler.java | 10 +- .../apache/tika/pipes/core/server/PipesServer.java | 55 +++--- .../apache/tika/pipes/core/server/PipesWorker.java | 18 +- .../test/resources/configs/tika-config-basic.json | 9 +- .../resources/configs/tika-config-passback.json | 9 +- .../resources/configs/tika-config-truncate.json | 9 +- .../resources/configs/tika-config-uppercasing.json | 9 +- .../org/apache/tika/config/loader/TikaLoader.java | 35 ++++ .../org/apache/tika/server/core/CXFTestBase.java | 8 +- .../resources/configs/cxf-test-base-template.json | 10 +- .../resources/configs/cxf-test-base-template.json | 10 +- .../configs/tika-config-for-server-tests.json | 8 +- .../tika-config-langdetect-opennlp-filter.json | 8 +- .../tika-config-langdetect-optimaize-filter.json | 8 +- 43 files changed, 662 insertions(+), 320 deletions(-) diff --git a/docs/modules/ROOT/pages/configuration/digesters.adoc b/docs/modules/ROOT/pages/configuration/digesters.adoc new file mode 100644 index 0000000000..f09deb8446 --- /dev/null +++ b/docs/modules/ROOT/pages/configuration/digesters.adoc @@ -0,0 +1,192 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Configuring Digesters + +Tika can compute cryptographic digests (hashes) of documents during parsing. This is useful for +document deduplication, integrity verification, and forensic analysis. + +== Overview + +Digesters compute hash values of document content and store them in metadata. The digest value +is stored with a key like `X-TIKA:digest:SHA256` (for HEX encoding) or `X-TIKA:digest:SHA256:BASE32` +(for non-default encodings). + +Tika provides two digester implementations: + +* **CommonsDigesterFactory** - Uses Apache Commons Codec. Supports MD2, MD5, SHA1, SHA256, SHA384, SHA512. +* **BouncyCastleDigesterFactory** - Uses BouncyCastle provider. Supports all Commons algorithms plus SHA3-256, SHA3-384, SHA3-512. + +== JSON Configuration + +Configure digesters in the `other-configs.digester-factory` section of your tika-config.json. + +=== Basic Example with CommonsDigester + +This example configures multiple digest algorithms: + +.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json[tika-config-commons-digests-basic.json] +[source,json] +---- +{ + "other-configs": { + "digester-factory": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" }, + { "algorithm": "SHA256" }, + { "algorithm": "SHA512" } + ] + } + } + } +} +---- + +=== Using BouncyCastle for SHA3 Algorithms + +For SHA3 algorithms, use the BouncyCastle digester: + +.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json[tika-config-bc-digests-multiple.json] +[source,json] +---- +{ + "other-configs": { + "digester-factory": { + "bouncy-castle-digester-factory": { + "digests": [ + { "algorithm": "MD5" }, + { "algorithm": "SHA256" }, + { "algorithm": "SHA3_512" } + ] + } + } + } +} +---- + +=== Custom Encoding + +By default, digest values are encoded as lowercase hexadecimal. You can specify BASE32 or BASE64 encoding: + +.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json[tika-config-digests.json] +[source,json] +---- +{ + "other-configs": { + "digester-factory": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "SHA256", "encoding": "BASE32" }, + { "algorithm": "MD5" } + ] + } + } + } +} +---- + +Non-default encodings include the encoding in the metadata key: `X-TIKA:digest:SHA256:BASE32`. + +=== Skip Container Document Digest + +When processing documents with embedded content (e.g., a ZIP file with PDFs inside), you may +want to digest only the embedded documents, not the container. Set `skipContainerDocumentDigest` +to `true`: + +.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json[tika-config-digests-skip-container.json] +[source,json] +---- +{ + "other-configs": { + "digester-factory": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" } + ], + "skipContainerDocumentDigest": true + } + } + } +} +---- + +== Supported Algorithms + +[cols="1,1,1"] +|=== +|Algorithm |CommonsDigester |BouncyCastleDigester + +|MD2 |Yes |Yes +|MD5 |Yes |Yes +|SHA1 |Yes |Yes +|SHA256 |Yes |Yes +|SHA384 |Yes |Yes +|SHA512 |Yes |Yes +|SHA3_256 |No |Yes +|SHA3_384 |No |Yes +|SHA3_512 |No |Yes +|=== + +== Supported Encodings + +* **HEX** (default) - Lowercase hexadecimal +* **BASE32** - RFC 4648 Base32 +* **BASE64** - RFC 4648 Base64 + +== Programmatic Configuration + +You can also configure digesters programmatically via `ParseContext`: + +[source,java] +---- +// See: CommonsDigesterFactory.java +CommonsDigesterFactory factory = new CommonsDigesterFactory(); +factory.setDigests(Arrays.asList( + new DigestDef(DigestDef.Algorithm.SHA256), + new DigestDef(DigestDef.Algorithm.MD5, DigestDef.Encoding.BASE32) +)); +factory.setSkipContainerDocumentDigest(true); + +ParseContext context = new ParseContext(); +context.set(DigesterFactory.class, factory); + +// Use with AutoDetectParser +AutoDetectParser parser = new AutoDetectParser(); +parser.parse(inputStream, handler, metadata, context); +---- + +See link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java[CommonsDigesterFactory.java] and +link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java[BouncyCastleDigesterFactory.java] for implementation details. + +== Command Line Usage + +When using the Tika CLI (`tika-app`), you can enable digesting with the `--digest` flag: + +[source,bash] +---- +java -jar tika-app.jar --digest=SHA256 document.pdf +---- + +This computes a SHA256 digest of the document. The digest value appears in the metadata output. + +== Related Classes + +* link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java[DigesterFactory] - Factory interface +* link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java[DigestDef] - Algorithm and encoding definition +* link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/Digester.java[Digester] - Digester interface +* link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java[DigestConfigTest] - Test examples diff --git a/docs/modules/ROOT/pages/configuration/index.adoc b/docs/modules/ROOT/pages/configuration/index.adoc index 6ef39a6a59..393aa6e63c 100644 --- a/docs/modules/ROOT/pages/configuration/index.adoc +++ b/docs/modules/ROOT/pages/configuration/index.adoc @@ -34,6 +34,10 @@ xref:migration-to-4x/index.adoc[Migration Guide] for details on converting to JS * xref:configuration/parsers/pdf-parser.adoc[PDFParser] - PDF parsing options * xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - OCR options for image-based text extraction +=== Other Configuration + +* xref:configuration/digesters.adoc[Digesters] - Computing cryptographic hashes of documents + // Add links to specific topics as they are created // * xref:json-config.adoc[JSON Configuration Reference] // * xref:detectors.adoc[Configuring Detectors] diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 9bf9990271..20e666a23b 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -40,6 +40,7 @@ import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -72,7 +73,7 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.Detector; import org.apache.tika.digest.DigestDef; -import org.apache.tika.digest.Digester; +import org.apache.tika.digest.DigesterFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -95,7 +96,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.parser.digestutils.CommonsDigester; +import org.apache.tika.parser.digestutils.CommonsDigesterFactory; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; @@ -201,7 +202,7 @@ public class TikaCLI { * Password for opening encrypted documents, or <code>null</code>. */ private String password = System.getenv("TIKA_PASSWORD"); - private Digester digester = null; + private DigesterFactory digesterFactory = null; private boolean pipeMode = true; private boolean prettyPrint; private final OutputType XML = new OutputType() { @@ -433,7 +434,9 @@ public class TikaCLI { } else if (arg.startsWith("--digest=")) { String algorithmName = arg.substring("--digest=".length()).toUpperCase(Locale.ROOT); DigestDef.Algorithm algorithm = DigestDef.Algorithm.valueOf(algorithmName); - digester = new CommonsDigester(algorithm); + CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setDigests(Collections.singletonList(new DigestDef(algorithm))); + digesterFactory = factory; } else if (arg.startsWith("-e")) { encoding = arg.substring("-e".length()); } else if (arg.startsWith("--encoding=")) { @@ -730,9 +733,10 @@ public class TikaCLI { parser = new NetworkParser(networkURI); } else { parser = tikaLoader.loadAutoDetectParser(); - if (digester != null && parser instanceof AutoDetectParser) { - ((AutoDetectParser) parser).getAutoDetectParserConfig().digester(digester); - } + } + // Set DigesterFactory in ParseContext if configured via --digest= + if (digesterFactory != null) { + context.set(DigesterFactory.class, digesterFactory); } detector = tikaLoader.loadDetectors(); context.set(Parser.class, parser); diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java index d2bb3aee43..a06d8393cc 100644 --- a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java +++ b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java @@ -31,7 +31,19 @@ import org.apache.tika.parser.ParseContext; /** * Utility class for computing digests on streams. - * This follows the same pattern as AutoDetectParser's maybeSpool() method. + * <p> + * The DigesterFactory is retrieved from ParseContext. Configure it via + * the "other-configs" section in tika-config.json: + * <pre> + * "other-configs": { + * "digester-factory": { + * "commons-digester-factory": { + * "digests": [{ "algorithm": "SHA256" }], + * "skipContainerDocumentDigest": true + * } + * } + * } + * </pre> */ public class DigestHelper { @@ -39,32 +51,37 @@ public class DigestHelper { new DefaultEmbeddedStreamTranslator(); /** - * Computes digests on the stream if configured. + * Computes digests on the stream if a DigesterFactory is configured in ParseContext. + * <p> * This is called directly from AutoDetectParser.parse() before type detection. * - * @param tis the TikaInputStream to digest - * @param digester the digester to use (may be null) - * @param skipContainerDocumentDigest if true, skip digesting for top-level documents (depth 0) - * @param metadata metadata to read embedded depth from and write digests to - * @param context parse context (may contain SkipContainerDocumentDigest marker) + * @param tis the TikaInputStream to digest + * @param metadata metadata to read depth from and write digests to + * @param context parse context (should contain DigesterFactory, may contain SkipContainerDocumentDigest marker) * @throws IOException if an I/O error occurs */ public static void maybeDigest(TikaInputStream tis, - Digester digester, - boolean skipContainerDocumentDigest, Metadata metadata, ParseContext context) throws IOException { - if (digester == null) { + DigesterFactory digesterFactory = context.get(DigesterFactory.class); + + if (digesterFactory == null) { return; } - // Check both the config setting and the ParseContext marker - if (skipContainerDocumentDigest || SkipContainerDocumentDigest.shouldSkip(context)) { + + // Get skip setting from factory or ParseContext marker + boolean skipContainer = digesterFactory.isSkipContainerDocumentDigest() + || SkipContainerDocumentDigest.shouldSkip(context); + + if (skipContainer) { Integer depth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH); if (depth == null || depth == 0) { return; } } + Digester digester = digesterFactory.build(); + // Handle embedded stream translation if needed (e.g., for OLE2 objects in TikaInputStream's open container) if (EMBEDDED_STREAM_TRANSLATOR.shouldTranslate(tis, metadata)) { try (TemporaryResources tmp = new TemporaryResources()) { diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java index 1b9215d226..0c35d33c01 100644 --- a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java @@ -21,26 +21,31 @@ package org.apache.tika.digest; * Implementations should be annotated with {@code @TikaComponent} and * provide bean properties for configuration (e.g., digests). * <p> - * This is used in {@link org.apache.tika.parser.AutoDetectParserConfig} to - * configure digesting in the AutoDetectParser. + * Configure this factory in the "other-configs" section of tika-config.json. + * The factory is loaded into the ParseContext and used by AutoDetectParser + * during parsing to compute digests. * <p> * Example JSON configuration: * <pre> - * "auto-detect-parser": { - * "digesterFactory": { - * "commons-digester-factory": { - * "digests": [ - * { "algorithm": "MD5" }, - * { "algorithm": "SHA256", "encoding": "BASE32" } - * ] + * { + * "other-configs": { + * "digester-factory": { + * "commons-digester-factory": { + * "digests": [ + * { "algorithm": "MD5" }, + * { "algorithm": "SHA256", "encoding": "BASE32" } + * ], + * "skipContainerDocumentDigest": true + * } * } * } * } * </pre> + * <p> + * When using TikaLoader, call {@code loader.loadParseContext()} to get a + * ParseContext with the DigesterFactory already set. * * @see DigestDef - * @see DigestAlgorithm - * @see DigestEncoding */ public interface DigesterFactory { /** @@ -49,4 +54,16 @@ public interface DigesterFactory { * @return a new Digester instance */ Digester build(); + + /** + * Returns whether to skip digesting for container (top-level) documents. + * When true, only embedded documents (depth > 0) will be digested. + * <p> + * Default implementation returns false (digest everything). + * + * @return true if container documents should be skipped, false otherwise + */ + default boolean isSkipContainerDocumentDigest() { + return false; + } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index d03eb89961..ffb95dc609 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -152,10 +152,8 @@ public class AutoDetectParser extends CompositeParser { } // Compute digests before type detection if configured - DigestHelper.maybeDigest(tis, - autoDetectParserConfig.digester(), - autoDetectParserConfig.isSkipContainerDocumentDigest(), - metadata, context); + // DigesterFactory is retrieved from ParseContext (configured via other-configs) + DigestHelper.maybeDigest(tis, metadata, context); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata, context); diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index 0aba04ad61..c5c6632c00 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -21,8 +21,6 @@ import java.io.Serializable; import org.xml.sax.ContentHandler; import org.apache.tika.config.TikaComponent; -import org.apache.tika.digest.Digester; -import org.apache.tika.digest.DigesterFactory; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; @@ -79,17 +77,6 @@ public class AutoDetectParserConfig implements Serializable { private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory = NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; - private DigesterFactory digesterFactory = null; - - // Lazily built digester from the factory - private transient Digester digester = null; - - /** - * If true, skip digesting for container (top-level) documents. - * Only embedded documents will be digested. - */ - private boolean skipContainerDocumentDigest = false; - private boolean throwOnZeroBytes = true; /** @@ -172,71 +159,6 @@ public class AutoDetectParserConfig implements Serializable { return contentHandlerDecoratorFactory; } - /** - * Sets the digester factory. - * This is the preferred method for configuring digesting via JSON serialization. - * - * @param digesterFactory the digester factory - */ - public void setDigesterFactory(DigesterFactory digesterFactory) { - this.digesterFactory = digesterFactory; - } - - /** - * Gets the digester factory. - * - * @return the digester factory, or null if not configured - */ - public DigesterFactory getDigesterFactory() { - return digesterFactory; - } - - /** - * Returns the Digester, lazily building it from the factory if needed. - * <p> - * Note: This method is intentionally not named getDigester() to avoid - * Jackson treating it as a bean property during serialization. - * - * @return the Digester, or null if no factory is configured - */ - public Digester digester() { - if (digester == null && digesterFactory != null) { - digester = digesterFactory.build(); - } - return digester; - } - - /** - * Sets the digester directly. This is useful for programmatic configuration - * (e.g., from command-line arguments) when you don't have a DigesterFactory. - * <p> - * Note: This method is intentionally not named setDigester() to avoid - * Jackson treating it as a bean property during deserialization. - * - * @param digester the digester to use - */ - public void digester(Digester digester) { - this.digester = digester; - } - - /** - * Returns whether to skip digesting for container (top-level) documents. - * - * @return true if container documents should be skipped, false otherwise - */ - public boolean isSkipContainerDocumentDigest() { - return skipContainerDocumentDigest; - } - - /** - * Sets whether to skip digesting for container (top-level) documents. - * - * @param skipContainerDocumentDigest if true, only embedded documents will be digested - */ - public void setSkipContainerDocumentDigest(boolean skipContainerDocumentDigest) { - this.skipContainerDocumentDigest = skipContainerDocumentDigest; - } - public void setThrowOnZeroBytes(boolean throwOnZeroBytes) { this.throwOnZeroBytes = throwOnZeroBytes; } @@ -253,8 +175,7 @@ public class AutoDetectParserConfig implements Serializable { maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + - ", skipContainerDocumentDigest=" + skipContainerDocumentDigest + + contentHandlerDecoratorFactory + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index c4587c1fd5..78c8eb2e9a 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -222,6 +222,33 @@ public class ParseContext implements Serializable { return context.isEmpty() && jsonConfigs.isEmpty(); } + /** + * Copies all entries from the source ParseContext into this one. + * Existing entries in this context are overwritten by source entries. + * <p> + * This copies both typed objects (from context map) and JSON configs. + * + * @param source the ParseContext to copy from + * @since Apache Tika 4.0 + */ + public void copyFrom(ParseContext source) { + if (source == null) { + return; + } + // Copy typed objects + context.putAll(source.context); + // Copy JSON configs + jsonConfigs.putAll(source.jsonConfigs); + // Copy resolved configs (if any) + if (source.resolvedConfigs != null && !source.resolvedConfigs.isEmpty()) { + if (resolvedConfigs == null) { + resolvedConfigs = new HashMap<>(); + } + resolvedConfigs.putAll(source.resolvedConfigs); + } + } + + /** * Returns the internal context map for serialization purposes. diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java index a8a1894586..d62e38e843 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java @@ -32,15 +32,18 @@ import org.apache.tika.digest.DigesterFactory; * BouncyCastle supports additional algorithms beyond the standard Java ones, * such as SHA3-256, SHA3-384, SHA3-512. * <p> - * Example JSON configuration: + * Example JSON configuration (in other-configs section): * <pre> * { - * "digesterFactory": { - * "bouncy-castle-digester-factory": { - * "digests": [ - * { "algorithm": "MD5" }, - * { "algorithm": "SHA3_256", "encoding": "BASE32" } - * ] + * "other-configs": { + * "digester-factory": { + * "bouncy-castle-digester-factory": { + * "digests": [ + * { "algorithm": "MD5" }, + * { "algorithm": "SHA3_256", "encoding": "BASE32" } + * ], + * "skipContainerDocumentDigest": false + * } * } * } * } @@ -50,6 +53,7 @@ import org.apache.tika.digest.DigesterFactory; public class BouncyCastleDigesterFactory implements DigesterFactory { private List<DigestDef> digests = new ArrayList<>(); + private boolean skipContainerDocumentDigest = false; public BouncyCastleDigesterFactory() { digests.add(new DigestDef(DigestDef.Algorithm.MD5)); @@ -60,6 +64,15 @@ public class BouncyCastleDigesterFactory implements DigesterFactory { return new BouncyCastleDigester(digests); } + @Override + public boolean isSkipContainerDocumentDigest() { + return skipContainerDocumentDigest; + } + + public void setSkipContainerDocumentDigest(boolean skipContainerDocumentDigest) { + this.skipContainerDocumentDigest = skipContainerDocumentDigest; + } + public List<DigestDef> getDigests() { return digests; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java index b141c7340e..5c0c81a54d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java @@ -29,15 +29,18 @@ import org.apache.tika.digest.DigesterFactory; * <p> * Default: MD5 with HEX encoding. * <p> - * Example JSON configuration: + * Example JSON configuration (in other-configs section): * <pre> * { - * "digesterFactory": { - * "commons-digester": { - * "digests": [ - * { "algorithm": "MD5" }, - * { "algorithm": "SHA256", "encoding": "BASE32" } - * ] + * "other-configs": { + * "digester-factory": { + * "commons-digester-factory": { + * "digests": [ + * { "algorithm": "MD5" }, + * { "algorithm": "SHA256", "encoding": "BASE32" } + * ], + * "skipContainerDocumentDigest": false + * } * } * } * } @@ -47,6 +50,7 @@ import org.apache.tika.digest.DigesterFactory; public class CommonsDigesterFactory implements DigesterFactory { private List<DigestDef> digests = new ArrayList<>(); + private boolean skipContainerDocumentDigest = false; public CommonsDigesterFactory() { digests.add(new DigestDef(DigestDef.Algorithm.MD5)); @@ -57,6 +61,15 @@ public class CommonsDigesterFactory implements DigesterFactory { return new CommonsDigester(digests); } + @Override + public boolean isSkipContainerDocumentDigest() { + return skipContainerDocumentDigest; + } + + public void setSkipContainerDocumentDigest(boolean skipContainerDocumentDigest) { + this.skipContainerDocumentDigest = skipContainerDocumentDigest; + } + public List<DigestDef> getDigests() { return digests; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index ebce85add9..2801642c7c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -96,8 +97,10 @@ public class AutoDetectParserConfigTest extends TikaTest { public void testDigests() throws Exception { //test to make sure that the decorator is only applied once for //legacy (e.g. not RecursiveParserWrapperHandler) parsing - Parser p = TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p, context); // SHA256 with BASE32 encoding includes encoding in the key assertEquals("SO67W5OGGMOFPMFQTHTNL5YU5EQXWPMNEPU7HKOZX2ULHRQICRZA====", metadataList.get(0).get("X-TIKA:digest:SHA256:BASE32")); @@ -115,8 +118,10 @@ public class AutoDetectParserConfigTest extends TikaTest { public void testDigestsSkipContainer() throws Exception { //test to make sure that the decorator is only applied once for //legacy (e.g. not RecursiveParserWrapperHandler) parsing - Parser p = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json").loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p, context); // SHA256 with BASE32 encoding includes encoding in the key assertNull(metadataList.get(0).get("X-TIKA:digest:SHA256:BASE32")); assertNull(metadataList.get(0).get("X-TIKA:digest:MD5")); @@ -130,8 +135,10 @@ public class AutoDetectParserConfigTest extends TikaTest { @Test public void testDigestsEmptyParser() throws Exception { //TIKA-3939 -- ensure that digesting happens even with EmptyParser - Parser p = TikaLoaderHelper.getLoader("tika-config-digests-pdf-only.json").loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-pdf-only.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p, context); assertEquals(1, metadataList.size()); assertEquals("4ef0d3bdb12ba603f4caf7d2e2c6112e", metadataList.get(0).get("X-TIKA:digest:MD5")); @@ -143,8 +150,10 @@ public class AutoDetectParserConfigTest extends TikaTest { public void testContainerZeroBytes() throws Exception { Path tmp = Files.createTempFile("tika-test", ""); try { - Parser p = TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata(tmp, p, context, true); assertEquals("d41d8cd98f00b204e9800998ecf8427e", metadataList.get(0).get("X-TIKA:digest:MD5")); assertEquals("0", metadataList.get(0).get(Metadata.CONTENT_LENGTH)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index e54ef93a79..01d28b5188 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -567,14 +567,16 @@ public class AutoDetectParserTest extends TikaTest { //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb //exception AND that the sha for the embedded OLE doc is not the sha for a zero-byte file String expectedSha = "bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78"; - AutoDetectParser autoDetectParser = (AutoDetectParser) TikaLoaderHelper.getLoader("tika-4533.json").loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-4533.json"); + AutoDetectParser autoDetectParser = (AutoDetectParser) loader.loadAutoDetectParser(); + ParseContext parseContext = loader.loadParseContext(); //this models what happens in tika-pipes if (autoDetectParser.getAutoDetectParserConfig() .getEmbeddedDocumentExtractorFactory() == null) { autoDetectParser.getAutoDetectParserConfig() .setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory()); } - List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new ParseContext()); + List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, parseContext); assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256")); assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 9b054d4ad4..efa124e8f0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -36,6 +36,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -437,13 +438,15 @@ public class RecursiveParserWrapperTest extends TikaTest { ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, boolean digest) throws Exception { - ParseContext context = new ParseContext(); + ParseContext context; Parser wrapped; if (digest) { - wrapped = TikaLoaderHelper.getLoader("tika-config-md5-digest.json") - .loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-md5-digest.json"); + wrapped = loader.loadAutoDetectParser(); + context = loader.loadParseContext(); } else { wrapped = AUTO_DETECT_PARSER; + context = new ParseContext(); } RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java index bc8174e918..e5fa61735d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java @@ -26,8 +26,10 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; /** @@ -60,10 +62,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testCommonsDigesterBasic() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json") - .loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); Metadata m = new Metadata(); - getXML("test_recursive_embedded.docx", p, m); + getXML("test_recursive_embedded.docx", p, m, context); assertEquals(EXPECTED_MD2, m.get(P + "MD2"), "MD2 digest should match"); assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should match"); @@ -75,9 +78,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testCommonsDigesterWithBase32() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); Metadata m = new Metadata(); - getXML("test_recursive_embedded.docx", p, m); + getXML("test_recursive_embedded.docx", p, m, context); // SHA256 with BASE32 encoding - just verify it exists with non-default key assertNotNull(m.get(P + "SHA256:BASE32"), @@ -89,9 +94,10 @@ public class DigestConfigTest extends TikaTest { @Test public void testCommonsDigesterLengthsCalculated() throws Exception { // This tests that TIKA-4016 added lengths - Parser p = TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json") - .loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p, context); for (Metadata m : metadataList) { assertNotNull(m.get(Metadata.CONTENT_LENGTH)); } @@ -99,9 +105,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testCommonsDigesterSkipContainer() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json") - .loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p); + // Tests skipContainerDocumentDigest on the factory (configured in other-configs) + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p, context); // Container should NOT have digest assertNull(metadataList.get(0).get(P + "MD5"), @@ -118,10 +126,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testBouncyCastleDigesterBasic() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json") - .loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); Metadata m = new Metadata(); - getXML("test_recursive_embedded.docx", p, m); + getXML("test_recursive_embedded.docx", p, m, context); assertEquals(EXPECTED_MD2, m.get(P + "MD2"), "MD2 digest should match"); assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should match"); @@ -133,10 +142,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testBouncyCastleDigesterMultipleAlgorithms() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-bc-digests-multiple.json") - .loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-bc-digests-multiple.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); Metadata m = new Metadata(); - getXML("test_recursive_embedded.docx", p, m); + getXML("test_recursive_embedded.docx", p, m, context); assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should match"); assertEquals(EXPECTED_SHA256, m.get(P + "SHA256"), "SHA256 digest should match"); @@ -150,10 +160,11 @@ public class DigestConfigTest extends TikaTest { @Test public void testBouncyCastleDigesterBase32Encoding() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-bc-digests-base32.json") - .loadAutoDetectParser(); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-bc-digests-base32.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); Metadata m = new Metadata(); - getXML("test_recursive_embedded.docx", p, m); + getXML("test_recursive_embedded.docx", p, m, context); // Non-default encoding includes encoding in the key assertEquals(EXPECTED_SHA1_BASE32, m.get(P + "SHA1:BASE32"), @@ -162,9 +173,10 @@ public class DigestConfigTest extends TikaTest { @Test public void testBouncyCastleDigesterLengthsCalculated() throws Exception { - Parser p = TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json") - .loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p); + TikaLoader loader = TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json"); + Parser p = loader.loadAutoDetectParser(); + ParseContext context = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p, context); for (Metadata m : metadataList) { assertNotNull(m.get(Metadata.CONTENT_LENGTH)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java index a211165f56..52904b6589 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java @@ -25,18 +25,17 @@ import java.util.List; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; -import org.apache.tika.digest.DigestDef; +import org.apache.tika.digest.DigesterFactory; import org.apache.tika.digest.SkipContainerDocumentDigest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.parser.digestutils.CommonsDigesterFactory; /** * Tests for SkipContainerDocumentDigest functionality with MockParser and embedded documents. + * DigesterFactory is now configured via ParseContext (via other-configs in JSON). */ public class SkipContainerDocumentDigestTest extends TikaTest { @@ -46,14 +45,16 @@ public class SkipContainerDocumentDigestTest extends TikaTest { @Test public void testDigestContainerAndEmbedded() throws Exception { // skipContainerDocumentDigest = false means digest everything - AutoDetectParserConfig config = new AutoDetectParserConfig(); - config.digester(new CommonsDigester(DigestDef.Algorithm.MD5)); - config.setSkipContainerDocumentDigest(false); + CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setSkipContainerDocumentDigest(false); AutoDetectParser parser = new AutoDetectParser(); - parser.setAutoDetectParserConfig(config); - List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", parser); + ParseContext context = new ParseContext(); + context.set(DigesterFactory.class, factory); + + List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", + parser, new Metadata(), context, false); // Should have container + embedded assertEquals(2, metadataList.size()); @@ -70,14 +71,16 @@ public class SkipContainerDocumentDigestTest extends TikaTest { @Test public void testSkipContainerDigestOnly() throws Exception { // skipContainerDocumentDigest = true means skip container, digest only embedded - AutoDetectParserConfig config = new AutoDetectParserConfig(); - config.digester(new CommonsDigester(DigestDef.Algorithm.MD5)); - config.setSkipContainerDocumentDigest(true); + CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setSkipContainerDocumentDigest(true); AutoDetectParser parser = new AutoDetectParser(); - parser.setAutoDetectParserConfig(config); - List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", parser); + ParseContext context = new ParseContext(); + context.set(DigesterFactory.class, factory); + + List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", + parser, new Metadata(), context, false); // Should have container + embedded assertEquals(2, metadataList.size()); @@ -94,15 +97,14 @@ public class SkipContainerDocumentDigestTest extends TikaTest { @Test public void testSkipContainerDocumentDigestMarkerInParseContext() throws Exception { // Test that the SkipContainerDocumentDigest marker in ParseContext works - AutoDetectParserConfig config = new AutoDetectParserConfig(); - config.digester(new CommonsDigester(DigestDef.Algorithm.MD5)); - config.setSkipContainerDocumentDigest(false); // Config says digest all + CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setSkipContainerDocumentDigest(false); // Factory says digest all AutoDetectParser parser = new AutoDetectParser(); - parser.setAutoDetectParserConfig(config); - // Set the marker in ParseContext to override config + // Set both factory and the marker in ParseContext - marker overrides factory ParseContext context = new ParseContext(); + context.set(DigesterFactory.class, factory); context.set(SkipContainerDocumentDigest.class, SkipContainerDocumentDigest.INSTANCE); List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", @@ -111,7 +113,7 @@ public class SkipContainerDocumentDigestTest extends TikaTest { // Should have container + embedded assertEquals(2, metadataList.size()); - // Container should NOT have digest because ParseContext marker overrides config + // Container should NOT have digest because ParseContext marker overrides factory assertNull(metadataList.get(0).get(DIGEST_KEY), "Container document should NOT have digest when ParseContext marker is set"); @@ -122,12 +124,8 @@ public class SkipContainerDocumentDigestTest extends TikaTest { @Test public void testNoDigesterConfigured() throws Exception { - // When no digester is configured, no digests should be computed - AutoDetectParserConfig config = new AutoDetectParserConfig(); - // Don't set any digester - + // When no digester is configured in ParseContext, no digests should be computed AutoDetectParser parser = new AutoDetectParser(); - parser.setAutoDetectParserConfig(config); List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", parser); @@ -142,20 +140,52 @@ public class SkipContainerDocumentDigestTest extends TikaTest { } @Test - public void testDigestWithFactory() throws Exception { - // Test using the factory pattern + public void testDigestWithFactoryInParseContext() throws Exception { + // Test that DigesterFactory in ParseContext is used CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setSkipContainerDocumentDigest(false); + + AutoDetectParser parser = new AutoDetectParser(); - AutoDetectParserConfig config = new AutoDetectParserConfig(); - config.setDigesterFactory(factory); - config.setSkipContainerDocumentDigest(false); + ParseContext context = new ParseContext(); + context.set(DigesterFactory.class, factory); + + List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", + parser, new Metadata(), context, false); + + // Should have container + embedded + assertEquals(2, metadataList.size()); + + // Both should have digest + assertNotNull(metadataList.get(0).get(DIGEST_KEY), + "Container document should have digest when ParseContext provides factory"); + assertNotNull(metadataList.get(1).get(DIGEST_KEY), + "Embedded document should have digest when ParseContext provides factory"); + } + + @Test + public void testSkipContainerOnFactory() throws Exception { + // Test skipContainerDocumentDigest configured on the factory + CommonsDigesterFactory factory = new CommonsDigesterFactory(); + factory.setSkipContainerDocumentDigest(true); AutoDetectParser parser = new AutoDetectParser(); - parser.setAutoDetectParserConfig(config); - List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", parser); + ParseContext context = new ParseContext(); + context.set(DigesterFactory.class, factory); + + List<Metadata> metadataList = getRecursiveMetadata("mock_embedded_for_digest.xml", + parser, new Metadata(), context, false); // Should have container + embedded assertEquals(2, metadataList.size()); + + // Container should NOT have digest because factory says to skip + assertNull(metadataList.get(0).get(DIGEST_KEY), + "Container document should NOT have digest when factory.skipContainerDocumentDigest=true"); + + // Embedded should have digest + assertNotNull(metadataList.get(1).get(DIGEST_KEY), + "Embedded document should have digest"); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index dfe1c591ac..8ff5ccbb27 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -148,8 +148,10 @@ public class OOXMLParserTest extends TikaTest { @Test public void testDigestTranslator() throws Exception { - Parser parser = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")).loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser); + TikaLoader loader = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")); + Parser parser = loader.loadAutoDetectParser(); + ParseContext parseContext = loader.loadParseContext(); + List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser, parseContext); assertEquals(4, metadataList.size()); for (Metadata m : metadataList) { assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json index 12b49d6267..76416f19d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json @@ -3,8 +3,10 @@ "maximumCompressionRatio": 100, "maximumDepth": 100, "maximumPackageEntryDepth": 100, - "throwOnZeroBytes": false, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "SHA256" } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json index 5ac209517f..f9e04fe037 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json @@ -1,13 +1,15 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json index 53bfd01732..8d4a9db55f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json @@ -1,7 +1,10 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "MD2" }, @@ -12,7 +15,6 @@ { "algorithm": "SHA512" } ] } - }, - "throwOnZeroBytes": false + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json index b2e23ad974..d8dcaba9a3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json @@ -1,7 +1,10 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "bouncy-castle-digester-factory": { "digests": [ { "algorithm": "MD5" }, @@ -11,7 +14,6 @@ { "algorithm": "SHA3_512" } ] } - }, - "throwOnZeroBytes": false + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json index c37e6965f2..7256297b30 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json @@ -1,7 +1,10 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD2" }, @@ -12,7 +15,6 @@ { "algorithm": "SHA512" } ] } - }, - "throwOnZeroBytes": false + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index 60825fe974..4dc5242e60 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -10,14 +10,16 @@ ], "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } ] } - }, - "throwOnZeroBytes": false + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json index 8ed562166a..c6676b29be 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json @@ -1,15 +1,17 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": true, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } - ] + ], + "skipContainerDocumentDigest": true } - }, - "throwOnZeroBytes": false + } } -} \ No newline at end of file +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json index 50bbd90b99..360b4f5170 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json @@ -1,14 +1,16 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } ] } - }, - "throwOnZeroBytes": false + } } -} \ No newline at end of file +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json index a13a80c7db..3aa9e04375 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json @@ -1,6 +1,6 @@ { - "auto-detect-parser": { - "digesterFactory": { + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index 6a466c1385..7c22d00ef8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@ -35,10 +35,11 @@ } }, "contentHandlerDecoratorFactory": "upcasing-content-handler-decorator-factory", - "skipContainerDocumentDigest": false, - "digesterFactory": { - "commons-digester-factory": {} - }, "throwOnZeroBytes": true + }, + "other-configs": { + "digester-factory": { + "commons-digester-factory": {} + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index 3ca9aa461a..bb7acf39e9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -1,15 +1,6 @@ { "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": true, - "digesterFactory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "SHA256", "encoding": "BASE32" }, - { "algorithm": "MD5" } - ] - } - }, "metadataWriteFilterFactory": { "standard-write-filter-factory": { "includeFields": [ @@ -19,6 +10,16 @@ } }, "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "SHA256", "encoding": "BASE32" }, + { "algorithm": "MD5" } + ], + "skipContainerDocumentDigest": true + } + } } } - diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java index 457fe11168..a11014478c 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java @@ -62,12 +62,11 @@ class EmitHandler { this.directEmitThresholdBytes = directEmitThresholdBytes; } - public PipesResult emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { + public PipesResult emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData, ParseContext parseContext) { long start = System.currentTimeMillis(); String stack = getContainerStacktrace(t, parseData.getMetadataList()); //we need to apply the metadata filter after we pull out the stacktrace - filterMetadata(t, parseData); - ParseContext parseContext = t.getParseContext(); + filterMetadata(parseData, parseContext); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); if (StringUtils.isBlank(stack) || @@ -200,8 +199,8 @@ class EmitHandler { } } - private void filterMetadata(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { - MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); + private void filterMetadata(MetadataListAndEmbeddedBytes parseData, ParseContext parseContext) { + MetadataFilter filter = parseContext.get(MetadataFilter.class); if (filter == null) { filter = defaultMetadataFilter; } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java index 98055b639b..c14ee24656 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java @@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.fetcher.Fetcher; @@ -40,14 +41,14 @@ class FetchHandler { this.fetcherManager = fetcherManager; } - public TisOrResult fetch(FetchEmitTuple fetchEmitTuple, Metadata metadata) { + public TisOrResult fetch(FetchEmitTuple fetchEmitTuple, Metadata metadata, ParseContext parseContext) { FetcherOrResult fetcherResult = getFetcher(fetchEmitTuple); if (fetcherResult.pipesResult != null) { return new TisOrResult(null, fetcherResult.pipesResult); } try { TikaInputStream tis = fetcherResult.fetcher.fetch( - fetchEmitTuple.getFetchKey().getFetchKey(), metadata, fetchEmitTuple.getParseContext()); + fetchEmitTuple.getFetchKey().getFetchKey(), metadata, parseContext); return new TisOrResult(tis, null); } catch (IOException | TikaException e) { return new TisOrResult(null, new PipesResult(PipesResult.RESULT_STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e))); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index b6d31d0f00..bbcb21b4e0 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -31,6 +31,7 @@ import org.xml.sax.SAXException; import org.apache.tika.detect.Detector; import org.apache.tika.digest.Digester; +import org.apache.tika.digest.DigesterFactory; import org.apache.tika.digest.SkipContainerDocumentDigest; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaConfigException; @@ -58,7 +59,6 @@ class ParseHandler { private static final Logger LOG = LoggerFactory.getLogger(ParseHandler.class); private final Detector detector; - private final Digester digester; private final ArrayBlockingQueue<Metadata> intermediateResult; private final CountDownLatch countDownLatch; private final AutoDetectParser autoDetectParser; @@ -67,12 +67,11 @@ class ParseHandler { private final ParseMode defaultParseMode; - ParseHandler(Detector detector, Digester digester, ArrayBlockingQueue<Metadata> intermediateResult, + ParseHandler(Detector detector, ArrayBlockingQueue<Metadata> intermediateResult, CountDownLatch countDownLatch, AutoDetectParser autoDetectParser, RecursiveParserWrapper recursiveParserWrapper, ContentHandlerFactory defaultContentHandlerFactory, ParseMode defaultParseMode) { this.detector = detector; - this.digester = digester; this.intermediateResult = intermediateResult; this.countDownLatch = countDownLatch; this.autoDetectParser = autoDetectParser; @@ -124,8 +123,11 @@ class ParseHandler { private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, ParseContext parseContext) { - if (digester != null) { + // Get DigesterFactory from ParseContext (configured via other-configs) + DigesterFactory digesterFactory = parseContext.get(DigesterFactory.class); + if (digesterFactory != null && !digesterFactory.isSkipContainerDocumentDigest()) { try { + Digester digester = digesterFactory.build(); digester.digest(tis, metadata, parseContext); // Mark that we've already digested the container document so AutoDetectParser // won't re-digest it during parsing diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index 6e52ceb4d8..a91e66823a 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -53,13 +53,13 @@ import org.xml.sax.SAXException; import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.Detector; -import org.apache.tika.digest.Digester; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.PipesResult; @@ -131,7 +131,6 @@ public class PipesServer implements AutoCloseable { return (byte) (ordinal() + 1); } } - private Digester digester; private Detector detector; @@ -295,14 +294,16 @@ public class PipesServer implements AutoCloseable { CountDownLatch countDownLatch = new CountDownLatch(1); FetchEmitTuple fetchEmitTuple = readFetchEmitTuple(); + // Create merged ParseContext: defaults from tika-config + request overrides + ParseContext mergedContext = createMergedParseContext(fetchEmitTuple.getParseContext()); // Resolve friendly-named configs in ParseContext to actual objects - ParseContextUtils.resolveAll(fetchEmitTuple.getParseContext(), getClass().getClassLoader()); + ParseContextUtils.resolveAll(mergedContext, getClass().getClassLoader()); - PipesWorker pipesWorker = getPipesWorker(intermediateResult, fetchEmitTuple, countDownLatch); + PipesWorker pipesWorker = getPipesWorker(intermediateResult, fetchEmitTuple, mergedContext, countDownLatch); executorCompletionService.submit(pipesWorker); //set progress counter try { - loopUntilDone(fetchEmitTuple, executorCompletionService, intermediateResult, countDownLatch); + loopUntilDone(fetchEmitTuple, mergedContext, executorCompletionService, intermediateResult, countDownLatch); } catch (Throwable t) { LOG.error("Serious problem: {}", HexFormat.of().formatHex(new byte[]{(byte)request}), t); } @@ -334,21 +335,23 @@ public class PipesServer implements AutoCloseable { } } - private PipesWorker getPipesWorker(ArrayBlockingQueue<Metadata> intermediateResult, FetchEmitTuple fetchEmitTuple, CountDownLatch countDownLatch) { + private PipesWorker getPipesWorker(ArrayBlockingQueue<Metadata> intermediateResult, FetchEmitTuple fetchEmitTuple, + ParseContext mergedContext, CountDownLatch countDownLatch) { FetchHandler fetchHandler = new FetchHandler(fetcherManager); - ParseHandler parseHandler = new ParseHandler(detector, digester, intermediateResult, countDownLatch, autoDetectParser, + ParseHandler parseHandler = new ParseHandler(detector, intermediateResult, countDownLatch, autoDetectParser, rMetaParser, defaultContentHandlerFactory, pipesConfig.getParseMode()); Long thresholdBytes = pipesConfig.getEmitStrategy().getThresholdBytes(); long threshold = (thresholdBytes != null) ? thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES; EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold); - PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple, autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler); + PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple, mergedContext, autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler); return pipesWorker; } - private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ExecutorCompletionService<PipesResult> executorCompletionService, + private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext mergedContext, + ExecutorCompletionService<PipesResult> executorCompletionService, ArrayBlockingQueue<Metadata> intermediateResult, CountDownLatch countDownLatch) throws InterruptedException, IOException { Instant start = Instant.now(); - long timeoutMillis = PipesClient.getTimeoutMillis(pipesConfig, fetchEmitTuple.getParseContext()); + long timeoutMillis = PipesClient.getTimeoutMillis(pipesConfig, mergedContext); long mockProgressCounter = 0; boolean wroteIntermediateResult = false; @@ -463,21 +466,6 @@ public class PipesServer implements AutoCloseable { this.fetcherManager = FetcherManager.load(tikaPluginManager, tikaJsonConfig, true, configStore); this.emitterManager = EmitterManager.load(tikaPluginManager, tikaJsonConfig, true, configStore); this.autoDetectParser = (AutoDetectParser) tikaLoader.loadAutoDetectParser(); - // Get the digester for pre-parse digesting of container documents. - // If user configured skipContainerDocumentDigest=false (the default), PipesServer - // digests the container document before parsing to ensure we have the digest even - // if parsing times out. The SkipContainerDocumentDigest marker is then added to - // ParseContext to prevent AutoDetectParser from re-digesting the container. - // If user configured skipContainerDocumentDigest=true, we don't digest containers at all. - boolean skipContainerDigest = autoDetectParser.getAutoDetectParserConfig() - .isSkipContainerDocumentDigest(); - if (!skipContainerDigest) { - // User wants container documents digested - we'll do it in ParseHandler before parse - this.digester = autoDetectParser.getAutoDetectParserConfig().digester(); - } else { - // User doesn't want container documents digested - this.digester = null; - } // If the user hasn't configured an embedded document extractor, set up the // RUnpackExtractorFactory @@ -488,6 +476,23 @@ public class PipesServer implements AutoCloseable { this.rMetaParser = new RecursiveParserWrapper(autoDetectParser); } + /** + * Creates a merged ParseContext with defaults from tika-config overlaid with request values. + * Request values take precedence over defaults. + * <p> + * Creates a fresh context each time to avoid shared state between requests. + * + * @param requestContext the ParseContext from FetchEmitTuple + * @return a new ParseContext with defaults + request overrides + */ + private ParseContext createMergedParseContext(ParseContext requestContext) throws TikaConfigException { + // Create fresh context with defaults from tika-config (e.g., DigesterFactory) + ParseContext mergedContext = tikaLoader.loadParseContext(); + // Overlay request's values (request takes precedence) + mergedContext.copyFrom(requestContext); + return mergedContext; + } + private ConfigStore createConfigStore(PipesConfig pipesConfig, TikaPluginManager tikaPluginManager) throws TikaException { String configStoreType = pipesConfig.getConfigStoreType(); String configStoreParams = pipesConfig.getConfigStoreParams(); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index d8315cfd9f..57733d38d5 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -51,15 +51,18 @@ class PipesWorker implements Callable<PipesResult> { private static final Logger LOG = LoggerFactory.getLogger(PipesWorker.class); private final FetchEmitTuple fetchEmitTuple; + private final ParseContext parseContext; private final AutoDetectParser autoDetectParser; private final EmitterManager emitterManager; private final FetchHandler fetchHandler; private final ParseHandler parseHandler; private final EmitHandler emitHandler; - public PipesWorker(FetchEmitTuple fetchEmitTuple, AutoDetectParser autoDetectParser, EmitterManager emitterManager, FetchHandler fetchHandler, ParseHandler parseHandler, + public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext parseContext, AutoDetectParser autoDetectParser, + EmitterManager emitterManager, FetchHandler fetchHandler, ParseHandler parseHandler, EmitHandler emitHandler) { this.fetchEmitTuple = fetchEmitTuple; + this.parseContext = parseContext; this.autoDetectParser = autoDetectParser; this.emitterManager = emitterManager; this.fetchHandler = fetchHandler; @@ -83,7 +86,7 @@ class PipesWorker implements Callable<PipesResult> { if (parseData == null || metadataIsEmpty(parseData.getMetadataList())) { return PipesResults.EMPTY_OUTPUT; } - return emitHandler.emitParseData(fetchEmitTuple, parseData); + return emitHandler.emitParseData(fetchEmitTuple, parseData, parseContext); } finally { if (parseData != null && parseData.hasEmbeddedDocumentByteStore() && parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) { @@ -107,21 +110,21 @@ class PipesWorker implements Callable<PipesResult> { //we want to isolate and not touch the metadata sent into the fetchEmitTuple //so that we can inject it after the filter at the very end Metadata metadata = new Metadata(); - FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata); + FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata, parseContext); if (tisOrResult.pipesResult() != null) { return new ParseDataOrPipesResult(null, tisOrResult.pipesResult()); } - ParseContext parseContext = null; + ParseContext localContext = null; try { - parseContext = setupParseContext(fetchEmitTuple); + localContext = setupParseContext(); } catch (IOException e) { LOG.warn("fetcher initialization exception id={}", fetchEmitTuple.getId(), e); return new ParseDataOrPipesResult(null, new PipesResult(PipesResult.RESULT_STATUS.FETCHER_INITIALIZATION_EXCEPTION, ExceptionUtils.getStackTrace(e))); } try (TikaInputStream tis = tisOrResult.tis()) { - return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, parseContext); + return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, localContext); } catch (SecurityException e) { LOG.error("security exception id={}", fetchEmitTuple.getId(), e); throw e; @@ -134,8 +137,7 @@ class PipesWorker implements Callable<PipesResult> { - private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) throws TikaException, IOException { - ParseContext parseContext = fetchEmitTuple.getParseContext(); + private ParseContext setupParseContext() throws TikaException, IOException { // ContentHandlerFactory and ParseMode are retrieved from ParseContext in ParseHandler. // They are set in ParseContext from PipesConfig loaded via TikaLoader at startup. EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json index 755c345dfa..98573b46fb 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json @@ -46,11 +46,12 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": false, - "digesterFactory": { - "mock-digester-factory": {} - }, "throwOnZeroBytes": false }, + "other-configs": { + "digester-factory": { + "mock-digester-factory": {} + } + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json index 2e0748f854..5308be9a1c 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json @@ -45,11 +45,12 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": false, - "digesterFactory": { - "mock-digester-factory": {} - }, "throwOnZeroBytes": false }, + "other-configs": { + "digester-factory": { + "mock-digester-factory": {} + } + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index 07a78edf3c..f8d5d3464b 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@ -46,10 +46,6 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": false, - "digesterFactory": { - "mock-digester-factory": {} - }, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": false, @@ -58,5 +54,10 @@ }, "throwOnZeroBytes": false }, + "other-configs": { + "digester-factory": { + "mock-digester-factory": {} + } + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json index 6498c15a7a..c9189c2ae2 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json @@ -41,11 +41,12 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "skipContainerDocumentDigest": false, - "digesterFactory": { - "mock-digester-factory": {} - }, "throwOnZeroBytes": false }, + "other-configs": { + "digester-factory": { + "mock-digester-factory": {} + } + }, "plugin-roots": "PLUGINS_PATHS" } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index b527532e5b..389f33697d 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -36,6 +36,7 @@ import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.digest.DigesterFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; @@ -47,6 +48,7 @@ import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; @@ -376,6 +378,39 @@ public class TikaLoader { return autoDetectParser; } + /** + * Loads and returns a ParseContext populated with components from the "other-configs" section. + * <p> + * This method loads components that should be passed via ParseContext, such as: + * <ul> + * <li>DigesterFactory (from "digester-factory")</li> + * </ul> + * <p> + * Use this method when you need a pre-configured ParseContext for parsing operations. + * + * <p>Example usage: + * <pre> + * TikaLoader loader = TikaLoader.load(configPath); + * Parser parser = loader.loadAutoDetectParser(); + * ParseContext context = loader.loadParseContext(); + * parser.parse(stream, handler, metadata, context); + * </pre> + * + * @return a ParseContext populated with configured components + * @throws TikaConfigException if loading fails + */ + public ParseContext loadParseContext() throws TikaConfigException { + ParseContext context = new ParseContext(); + + // Load DigesterFactory from other-configs if present + DigesterFactory digesterFactory = configs().load("digester-factory", DigesterFactory.class); + if (digesterFactory != null) { + context.set(DigesterFactory.class, digesterFactory); + } + + return context; + } + /** * Returns a ConfigLoader for loading simple configuration objects. * <p> diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index d73c002546..0cb335676c 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@ -79,14 +79,16 @@ public abstract class CXFTestBase { { "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" } ] } - }, - "throwOnZeroBytes": false + } } } """; diff --git a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json index 7c301943d2..434b61e626 100644 --- a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json @@ -46,15 +46,17 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } }, "plugin-roots": "PLUGINS_PATHS" -} \ No newline at end of file +} diff --git a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json index bcae4fb7e6..06510b1a1e 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json @@ -48,15 +48,17 @@ }, "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } }, "plugin-roots": "PLUGINS_PATHS" -} \ No newline at end of file +} diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json index fbe51d01c4..dc25f3ae0e 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json @@ -11,15 +11,17 @@ ], "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } }, "fetchers": { "file-system-fetcher": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json index 930334b088..665442b733 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json @@ -16,15 +16,17 @@ ], "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } }, "fetchers": { "file-system-fetcher": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json index 3a4d88fb69..51e7806e81 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json @@ -16,15 +16,17 @@ ], "auto-detect-parser": { "outputThreshold": 1000000, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } ] } - }, - "throwOnZeroBytes": false + } }, "fetchers": { "file-system-fetcher": {
