This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bd1513677e TIKA-4635 -- refactor DigesterFactory to be standalone
(#2555)
bd1513677e is described below
commit bd1513677e51cd0bcfc815fb238bf83555d7b939
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jan 28 10:34:08 2026 -0500
TIKA-4635 -- refactor DigesterFactory to be standalone (#2555)
---
.../ROOT/pages/configuration/digesters.adoc | 192 +++++++++++++++++++++
docs/modules/ROOT/pages/configuration/index.adoc | 4 +
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 18 +-
.../java/org/apache/tika/digest/DigestDef.java | 2 +-
.../java/org/apache/tika/digest/DigestHelper.java | 41 +++--
.../org/apache/tika/digest/DigesterFactory.java | 39 +++--
.../org/apache/tika/parser/AutoDetectParser.java | 6 +-
.../apache/tika/parser/AutoDetectParserConfig.java | 81 +--------
.../java/org/apache/tika/parser/ParseContext.java | 26 +++
.../parser/digestutils/BouncyCastleDigester.java | 4 +-
.../digestutils/BouncyCastleDigesterFactory.java | 27 ++-
.../tika/parser/digestutils/CommonsDigester.java | 4 +-
.../parser/digestutils/CommonsDigesterFactory.java | 27 ++-
.../tika/parser/AutoDetectParserConfigTest.java | 24 ++-
.../apache/tika/parser/AutoDetectParserTest.java | 6 +-
.../tika/parser/RecursiveParserWrapperTest.java | 9 +-
.../tika/parser/digest/DigestConfigTest.java | 58 ++++---
.../digest/SkipContainerDocumentDigestTest.java | 92 ++++++----
.../parser/microsoft/ooxml/OOXMLParserTest.java | 6 +-
.../src/test/resources/configs/tika-4533.json | 6 +-
.../configs/tika-config-bc-digests-base32.json | 8 +-
.../configs/tika-config-bc-digests-basic.json | 8 +-
.../configs/tika-config-bc-digests-multiple.json | 8 +-
.../configs/tika-config-commons-digests-basic.json | 8 +-
.../configs/tika-config-digests-pdf-only.json | 8 +-
.../tika-config-digests-skip-container.json | 14 +-
.../resources/configs/tika-config-digests.json | 10 +-
.../resources/configs/tika-config-md5-digest.json | 4 +-
...a-config-upcasing-custom-handler-decorator.json | 9 +-
.../configs/tika-config-write-filter.json | 13 +-
.../apache/tika/pipes/core/server/EmitHandler.java | 9 +-
.../tika/pipes/core/server/FetchHandler.java | 5 +-
.../tika/pipes/core/server/ParseHandler.java | 10 +-
.../apache/tika/pipes/core/server/PipesServer.java | 57 +++---
.../apache/tika/pipes/core/server/PipesWorker.java | 21 ++-
.../test/resources/configs/tika-config-basic.json | 9 +-
.../resources/configs/tika-config-passback.json | 9 +-
.../resources/configs/tika-config-truncate.json | 9 +-
.../resources/configs/tika-config-uppercasing.json | 9 +-
.../configs/tika-config-write-limiter.json | 1 -
.../org/apache/tika/config/loader/TikaLoader.java | 43 +++++
.../server/core/resource/MetadataResource.java | 2 +
.../tika/server/core/resource/TikaResource.java | 15 +-
.../org/apache/tika/server/core/CXFTestBase.java | 8 +-
.../resources/configs/cxf-test-base-template.json | 10 +-
.../resources/configs/cxf-test-base-template.json | 10 +-
.../configs/tika-config-for-server-tests.json | 8 +-
.../tika-config-langdetect-opennlp-filter.json | 8 +-
.../tika-config-langdetect-optimaize-filter.json | 8 +-
49 files changed, 681 insertions(+), 332 deletions(-)
diff --git a/docs/modules/ROOT/pages/configuration/digesters.adoc
b/docs/modules/ROOT/pages/configuration/digesters.adoc
new file mode 100644
index 0000000000..f09deb8446
--- /dev/null
+++ b/docs/modules/ROOT/pages/configuration/digesters.adoc
@@ -0,0 +1,192 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Configuring Digesters
+
+Tika can compute cryptographic digests (hashes) of documents during parsing.
This is useful for
+document deduplication, integrity verification, and forensic analysis.
+
+== Overview
+
+Digesters compute hash values of document content and store them in metadata.
Each digest value
+is stored under a key such as `X-TIKA:digest:SHA256` (for the default HEX encoding) or
`X-TIKA:digest:SHA256:BASE32`
+(when a non-default encoding such as BASE32 is configured).
+
+Tika provides two digester implementations:
+
+* **CommonsDigesterFactory** - Uses Apache Commons Codec. Supports MD2, MD5,
SHA1, SHA256, SHA384, SHA512.
+* **BouncyCastleDigesterFactory** - Uses BouncyCastle provider. Supports all
Commons algorithms plus SHA3-256, SHA3-384, SHA3-512.
+
+== JSON Configuration
+
+Configure digesters in the `other-configs.digester-factory` section of your
tika-config.json.
+
+=== Basic Example with CommonsDigester
+
+This example configures multiple digest algorithms:
+
+.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json[tika-config-commons-digests-basic.json]
+[source,json]
+----
+{
+ "other-configs": {
+ "digester-factory": {
+ "commons-digester-factory": {
+ "digests": [
+ { "algorithm": "MD5" },
+ { "algorithm": "SHA256" },
+ { "algorithm": "SHA512" }
+ ]
+ }
+ }
+ }
+}
+----
+
+=== Using BouncyCastle for SHA3 Algorithms
+
+For SHA3 algorithms, use the BouncyCastle digester:
+
+.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json[tika-config-bc-digests-multiple.json]
+[source,json]
+----
+{
+ "other-configs": {
+ "digester-factory": {
+ "bouncy-castle-digester-factory": {
+ "digests": [
+ { "algorithm": "MD5" },
+ { "algorithm": "SHA256" },
+ { "algorithm": "SHA3_512" }
+ ]
+ }
+ }
+ }
+}
+----
+
+=== Custom Encoding
+
+By default, digest values are encoded as lowercase hexadecimal. You can
specify BASE32 or BASE64 encoding:
+
+.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json[tika-config-digests.json]
+[source,json]
+----
+{
+ "other-configs": {
+ "digester-factory": {
+ "commons-digester-factory": {
+ "digests": [
+ { "algorithm": "SHA256", "encoding": "BASE32" },
+ { "algorithm": "MD5" }
+ ]
+ }
+ }
+ }
+}
+----
+
+For non-default encodings, the encoding name is appended to the metadata key, for example
`X-TIKA:digest:SHA256:BASE32`.
+
+=== Skip Container Document Digest
+
+When processing documents with embedded content (e.g., a ZIP file with PDFs
inside), you may
+want to digest only the embedded documents, not the container. Set
`skipContainerDocumentDigest`
+to `true`:
+
+.link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json[tika-config-digests-skip-container.json]
+[source,json]
+----
+{
+ "other-configs": {
+ "digester-factory": {
+ "commons-digester-factory": {
+ "digests": [
+ { "algorithm": "MD5" }
+ ],
+ "skipContainerDocumentDigest": true
+ }
+ }
+ }
+}
+----
+
+== Supported Algorithms
+
+[cols="1,1,1"]
+|===
+|Algorithm |CommonsDigester |BouncyCastleDigester
+
+|MD2 |Yes |Yes
+|MD5 |Yes |Yes
+|SHA1 |Yes |Yes
+|SHA256 |Yes |Yes
+|SHA384 |Yes |Yes
+|SHA512 |Yes |Yes
+|SHA3_256 |No |Yes
+|SHA3_384 |No |Yes
+|SHA3_512 |No |Yes
+|===
+
+== Supported Encodings
+
+* **HEX** (default) - Lowercase hexadecimal
+* **BASE32** - RFC 4648 Base32
+* **BASE64** - RFC 4648 Base64
+
+== Programmatic Configuration
+
+You can also configure digesters programmatically via `ParseContext`:
+
+[source,java]
+----
+// See: CommonsDigesterFactory.java
+CommonsDigesterFactory factory = new CommonsDigesterFactory();
+factory.setDigests(Arrays.asList(
+ new DigestDef(DigestDef.Algorithm.SHA256),
+ new DigestDef(DigestDef.Algorithm.MD5, DigestDef.Encoding.BASE32)
+));
+factory.setSkipContainerDocumentDigest(true);
+
+ParseContext context = new ParseContext();
+context.set(DigesterFactory.class, factory);
+
+// Use with AutoDetectParser
+AutoDetectParser parser = new AutoDetectParser();
+parser.parse(inputStream, handler, metadata, context);
+----
+
+See
link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java[CommonsDigesterFactory.java]
and
+link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java[BouncyCastleDigesterFactory.java]
for implementation details.
+
+== Command Line Usage
+
+When using the Tika CLI (`tika-app`), you can enable digesting with the
`--digest` flag:
+
+[source,bash]
+----
+java -jar tika-app.jar --digest=SHA256 document.pdf
+----
+
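+This computes a SHA256 digest of the document. The digest value appears in the
metadata output.
+The algorithm name is case-insensitive (it is upper-cased before lookup), and the CLI
+always uses the Commons digester with the default HEX encoding.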
+
+== Related Classes
+
+*
link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java[DigesterFactory]
- Factory interface
+*
link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java[DigestDef]
- Algorithm and encoding definition
+*
link:https://github.com/apache/tika/blob/main/tika-core/src/main/java/org/apache/tika/digest/Digester.java[Digester]
- Digester interface
+*
link:https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java[DigestConfigTest]
- Test examples
diff --git a/docs/modules/ROOT/pages/configuration/index.adoc
b/docs/modules/ROOT/pages/configuration/index.adoc
index 6ef39a6a59..393aa6e63c 100644
--- a/docs/modules/ROOT/pages/configuration/index.adoc
+++ b/docs/modules/ROOT/pages/configuration/index.adoc
@@ -34,6 +34,10 @@ xref:migration-to-4x/index.adoc[Migration Guide] for details
on converting to JS
* xref:configuration/parsers/pdf-parser.adoc[PDFParser] - PDF parsing options
* xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] -
OCR options for image-based text extraction
+=== Other Configuration
+
+* xref:configuration/digesters.adoc[Digesters] - Computing cryptographic
hashes of documents
+
// Add links to specific topics as they are created
// * xref:json-config.adoc[JSON Configuration Reference]
// * xref:detectors.adoc[Configuring Detectors]
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 35604e2c64..4e64c9db79 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -40,6 +40,7 @@ import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@@ -72,7 +73,7 @@ import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.digest.DigestDef;
-import org.apache.tika.digest.Digester;
+import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -96,7 +97,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.digestutils.CommonsDigester;
+import org.apache.tika.parser.digestutils.CommonsDigesterFactory;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -202,7 +203,7 @@ public class TikaCLI {
* Password for opening encrypted documents, or <code>null</code>.
*/
private String password = System.getenv("TIKA_PASSWORD");
- private Digester digester = null;
+ private DigesterFactory digesterFactory = null;
private boolean pipeMode = true;
private boolean prettyPrint;
private final OutputType XML = new OutputType() {
@@ -434,7 +435,9 @@ public class TikaCLI {
} else if (arg.startsWith("--digest=")) {
String algorithmName =
arg.substring("--digest=".length()).toUpperCase(Locale.ROOT);
DigestDef.Algorithm algorithm =
DigestDef.Algorithm.valueOf(algorithmName);
- digester = new CommonsDigester(algorithm);
+ CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setDigests(Collections.singletonList(new
DigestDef(algorithm)));
+ digesterFactory = factory;
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
@@ -731,9 +734,10 @@ public class TikaCLI {
parser = new NetworkParser(networkURI);
} else {
parser = tikaLoader.loadAutoDetectParser();
- if (digester != null && parser instanceof AutoDetectParser) {
- ((AutoDetectParser)
parser).getAutoDetectParserConfig().digester(digester);
- }
+ }
+ // Set DigesterFactory in ParseContext if configured via --digest=
+ if (digesterFactory != null) {
+ context.set(DigesterFactory.class, digesterFactory);
}
detector = tikaLoader.loadDetectors();
context.set(Parser.class, parser);
diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java
b/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java
index fa1f0398d7..8d75814237 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/DigestDef.java
@@ -119,7 +119,7 @@ public class DigestDef {
*
* @return the metadata key
*/
- public String getMetadataKey() {
+ public String metadataKey() {
StringBuilder sb = new StringBuilder();
sb.append(TikaCoreProperties.TIKA_META_PREFIX);
sb.append("digest");
diff --git a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java
b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java
index d2bb3aee43..a06d8393cc 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/DigestHelper.java
@@ -31,7 +31,19 @@ import org.apache.tika.parser.ParseContext;
/**
* Utility class for computing digests on streams.
- * This follows the same pattern as AutoDetectParser's maybeSpool() method.
+ * <p>
+ * The DigesterFactory is retrieved from ParseContext. Configure it via
+ * the "other-configs" section in tika-config.json:
+ * <pre>
+ * "other-configs": {
+ * "digester-factory": {
+ * "commons-digester-factory": {
+ * "digests": [{ "algorithm": "SHA256" }],
+ * "skipContainerDocumentDigest": true
+ * }
+ * }
+ * }
+ * </pre>
*/
public class DigestHelper {
@@ -39,32 +51,37 @@ public class DigestHelper {
new DefaultEmbeddedStreamTranslator();
/**
- * Computes digests on the stream if configured.
+ * Computes digests on the stream if a DigesterFactory is configured in
ParseContext.
+ * <p>
* This is called directly from AutoDetectParser.parse() before type
detection.
*
- * @param tis the TikaInputStream to digest
- * @param digester the digester to use (may be null)
- * @param skipContainerDocumentDigest if true, skip digesting for
top-level documents (depth 0)
- * @param metadata metadata to read embedded depth
from and write digests to
- * @param context parse context (may contain
SkipContainerDocumentDigest marker)
+ * @param tis the TikaInputStream to digest
+ * @param metadata metadata to read depth from and write digests to
+ * @param context parse context (should contain DigesterFactory, may
contain SkipContainerDocumentDigest marker)
* @throws IOException if an I/O error occurs
*/
public static void maybeDigest(TikaInputStream tis,
- Digester digester,
- boolean skipContainerDocumentDigest,
Metadata metadata,
ParseContext context) throws IOException {
- if (digester == null) {
+ DigesterFactory digesterFactory = context.get(DigesterFactory.class);
+
+ if (digesterFactory == null) {
return;
}
- // Check both the config setting and the ParseContext marker
- if (skipContainerDocumentDigest ||
SkipContainerDocumentDigest.shouldSkip(context)) {
+
+ // Get skip setting from factory or ParseContext marker
+ boolean skipContainer = digesterFactory.isSkipContainerDocumentDigest()
+ || SkipContainerDocumentDigest.shouldSkip(context);
+
+ if (skipContainer) {
Integer depth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
if (depth == null || depth == 0) {
return;
}
}
+ Digester digester = digesterFactory.build();
+
// Handle embedded stream translation if needed (e.g., for OLE2
objects in TikaInputStream's open container)
if (EMBEDDED_STREAM_TRANSLATOR.shouldTranslate(tis, metadata)) {
try (TemporaryResources tmp = new TemporaryResources()) {
diff --git
a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
index 1b9215d226..0c35d33c01 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
@@ -21,26 +21,31 @@ package org.apache.tika.digest;
* Implementations should be annotated with {@code @TikaComponent} and
* provide bean properties for configuration (e.g., digests).
* <p>
- * This is used in {@link org.apache.tika.parser.AutoDetectParserConfig} to
- * configure digesting in the AutoDetectParser.
+ * Configure this factory in the "other-configs" section of tika-config.json.
+ * The factory is loaded into the ParseContext and used by AutoDetectParser
+ * during parsing to compute digests.
* <p>
* Example JSON configuration:
* <pre>
- * "auto-detect-parser": {
- * "digesterFactory": {
- * "commons-digester-factory": {
- * "digests": [
- * { "algorithm": "MD5" },
- * { "algorithm": "SHA256", "encoding": "BASE32" }
- * ]
+ * {
+ * "other-configs": {
+ * "digester-factory": {
+ * "commons-digester-factory": {
+ * "digests": [
+ * { "algorithm": "MD5" },
+ * { "algorithm": "SHA256", "encoding": "BASE32" }
+ * ],
+ * "skipContainerDocumentDigest": true
+ * }
* }
* }
* }
* </pre>
+ * <p>
+ * When using TikaLoader, call {@code loader.loadParseContext()} to get a
+ * ParseContext with the DigesterFactory already set.
*
* @see DigestDef
- * @see DigestAlgorithm
- * @see DigestEncoding
*/
public interface DigesterFactory {
/**
@@ -49,4 +54,16 @@ public interface DigesterFactory {
* @return a new Digester instance
*/
Digester build();
+
+ /**
+ * Returns whether to skip digesting for container (top-level) documents.
+ * When true, only embedded documents (depth &gt; 0) will be digested.
+ * <p>
+ * Default implementation returns false (digest everything).
+ *
+ * @return true if container documents should be skipped, false otherwise
+ */
+ default boolean isSkipContainerDocumentDigest() {
+ return false;
+ }
}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 351aa49a65..2d8e7ca21d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -147,10 +147,8 @@ public class AutoDetectParser extends CompositeParser {
public void parse(TikaInputStream tis, ContentHandler handler, Metadata
metadata,
ParseContext context) throws IOException, SAXException,
TikaException {
// Compute digests before type detection if configured
- DigestHelper.maybeDigest(tis,
- autoDetectParserConfig.digester(),
- autoDetectParserConfig.isSkipContainerDocumentDigest(),
- metadata, context);
+ // DigesterFactory is retrieved from ParseContext (configured via
other-configs)
+ DigestHelper.maybeDigest(tis, metadata, context);
// Automatically detect the MIME type of the document
MediaType type = detector.detect(tis, metadata, context);
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 4fa1d3c083..ebf359ff1c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -21,8 +21,6 @@ import java.io.Serializable;
import org.xml.sax.ContentHandler;
import org.apache.tika.config.TikaComponent;
-import org.apache.tika.digest.Digester;
-import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
@@ -76,17 +74,6 @@ public class AutoDetectParserConfig implements Serializable {
private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory =
NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
- private DigesterFactory digesterFactory = null;
-
- // Lazily built digester from the factory
- private transient Digester digester = null;
-
- /**
- * If true, skip digesting for container (top-level) documents.
- * Only embedded documents will be digested.
- */
- private boolean skipContainerDocumentDigest = false;
-
private boolean throwOnZeroBytes = true;
/**
@@ -160,71 +147,6 @@ public class AutoDetectParserConfig implements
Serializable {
return contentHandlerDecoratorFactory;
}
- /**
- * Sets the digester factory.
- * This is the preferred method for configuring digesting via JSON
serialization.
- *
- * @param digesterFactory the digester factory
- */
- public void setDigesterFactory(DigesterFactory digesterFactory) {
- this.digesterFactory = digesterFactory;
- }
-
- /**
- * Gets the digester factory.
- *
- * @return the digester factory, or null if not configured
- */
- public DigesterFactory getDigesterFactory() {
- return digesterFactory;
- }
-
- /**
- * Returns the Digester, lazily building it from the factory if needed.
- * <p>
- * Note: This method is intentionally not named getDigester() to avoid
- * Jackson treating it as a bean property during serialization.
- *
- * @return the Digester, or null if no factory is configured
- */
- public Digester digester() {
- if (digester == null && digesterFactory != null) {
- digester = digesterFactory.build();
- }
- return digester;
- }
-
- /**
- * Sets the digester directly. This is useful for programmatic
configuration
- * (e.g., from command-line arguments) when you don't have a
DigesterFactory.
- * <p>
- * Note: This method is intentionally not named setDigester() to avoid
- * Jackson treating it as a bean property during deserialization.
- *
- * @param digester the digester to use
- */
- public void digester(Digester digester) {
- this.digester = digester;
- }
-
- /**
- * Returns whether to skip digesting for container (top-level) documents.
- *
- * @return true if container documents should be skipped, false otherwise
- */
- public boolean isSkipContainerDocumentDigest() {
- return skipContainerDocumentDigest;
- }
-
- /**
- * Sets whether to skip digesting for container (top-level) documents.
- *
- * @param skipContainerDocumentDigest if true, only embedded documents
will be digested
- */
- public void setSkipContainerDocumentDigest(boolean
skipContainerDocumentDigest) {
- this.skipContainerDocumentDigest = skipContainerDocumentDigest;
- }
-
public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
this.throwOnZeroBytes = throwOnZeroBytes;
}
@@ -240,8 +162,7 @@ public class AutoDetectParserConfig implements Serializable
{
", maximumDepth=" + maximumDepth + ",
maximumPackageEntryDepth=" +
maximumPackageEntryDepth + ",
embeddedDocumentExtractorFactory=" +
embeddedDocumentExtractorFactory + ",
contentHandlerDecoratorFactory=" +
- contentHandlerDecoratorFactory + ", digesterFactory=" +
digesterFactory +
- ", skipContainerDocumentDigest=" + skipContainerDocumentDigest
+
+ contentHandlerDecoratorFactory +
", throwOnZeroBytes=" + throwOnZeroBytes + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index ae1ecc3bbb..f5338594ff 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -224,6 +224,32 @@ public class ParseContext implements Serializable {
return context.isEmpty() && jsonConfigs.isEmpty();
}
+ /**
+ * Copies all entries from the source ParseContext into this one.
+ * Entries in this context that share a key with a source entry are overwritten.
+ * <p>
+ * This copies typed objects (from the context map), JSON configs, and any resolved configs.
+ *
+ * @param source the ParseContext to copy from
+ * @since Apache Tika 4.0
+ */
+ public void copyFrom(ParseContext source) {
+ if (source == null) {
+ return;
+ }
+ // Copy typed objects
+ context.putAll(source.context);
+ // Copy JSON configs
+ jsonConfigs.putAll(source.jsonConfigs);
+ // Copy resolved configs (if any)
+ if (source.resolvedConfigs != null &&
!source.resolvedConfigs.isEmpty()) {
+ if (resolvedConfigs == null) {
+ resolvedConfigs = new HashMap<>();
+ }
+ resolvedConfigs.putAll(source.resolvedConfigs);
+ }
+ }
+
/**
* Creates a new Metadata object with any configured limits applied.
* <p>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
index 68a7280f83..ec0cdbad50 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
@@ -67,7 +67,7 @@ public class BouncyCastleDigester extends CompositeDigester {
Encoder encoder = getEncoder(def.getEncoding());
digesters[i++] = new BCInputStreamDigester(
def.getAlgorithm().getJavaName(),
- def.getMetadataKey(),
+ def.metadataKey(),
encoder);
}
return digesters;
@@ -81,7 +81,7 @@ public class BouncyCastleDigester extends CompositeDigester {
DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
digesters[i++] = new BCInputStreamDigester(
algorithm.getJavaName(),
- def.getMetadataKey(),
+ def.metadataKey(),
encoder);
}
return digesters;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
index a8a1894586..d62e38e843 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
@@ -32,15 +32,18 @@ import org.apache.tika.digest.DigesterFactory;
* BouncyCastle supports additional algorithms beyond the standard Java ones,
* such as SHA3-256, SHA3-384, SHA3-512.
* <p>
- * Example JSON configuration:
+ * Example JSON configuration (in other-configs section):
* <pre>
* {
- * "digesterFactory": {
- * "bouncy-castle-digester-factory": {
- * "digests": [
- * { "algorithm": "MD5" },
- * { "algorithm": "SHA3_256", "encoding": "BASE32" }
- * ]
+ * "other-configs": {
+ * "digester-factory": {
+ * "bouncy-castle-digester-factory": {
+ * "digests": [
+ * { "algorithm": "MD5" },
+ * { "algorithm": "SHA3_256", "encoding": "BASE32" }
+ * ],
+ * "skipContainerDocumentDigest": false
+ * }
* }
* }
* }
@@ -50,6 +53,7 @@ import org.apache.tika.digest.DigesterFactory;
public class BouncyCastleDigesterFactory implements DigesterFactory {
private List<DigestDef> digests = new ArrayList<>();
+ private boolean skipContainerDocumentDigest = false;
public BouncyCastleDigesterFactory() {
digests.add(new DigestDef(DigestDef.Algorithm.MD5));
@@ -60,6 +64,15 @@ public class BouncyCastleDigesterFactory implements
DigesterFactory {
return new BouncyCastleDigester(digests);
}
+ @Override
+ public boolean isSkipContainerDocumentDigest() {
+ return skipContainerDocumentDigest;
+ }
+
+ public void setSkipContainerDocumentDigest(boolean
skipContainerDocumentDigest) {
+ this.skipContainerDocumentDigest = skipContainerDocumentDigest;
+ }
+
public List<DigestDef> getDigests() {
return digests;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
index 0f5185b0f5..e3f1424f35 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
@@ -60,7 +60,7 @@ public class CommonsDigester extends CompositeDigester {
Encoder encoder = getEncoder(def.getEncoding());
digesters[i++] = new InputStreamDigester(
def.getAlgorithm().getJavaName(),
- def.getMetadataKey(),
+ def.metadataKey(),
encoder);
}
return digesters;
@@ -75,7 +75,7 @@ public class CommonsDigester extends CompositeDigester {
DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
digesters[i++] = new InputStreamDigester(
algorithm.getJavaName(),
- def.getMetadataKey(),
+ def.metadataKey(),
encoder);
}
return digesters;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index b141c7340e..5c0c81a54d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -29,15 +29,18 @@ import org.apache.tika.digest.DigesterFactory;
* <p>
* Default: MD5 with HEX encoding.
* <p>
- * Example JSON configuration:
+ * Example JSON configuration (in other-configs section):
* <pre>
* {
- * "digesterFactory": {
- * "commons-digester": {
- * "digests": [
- * { "algorithm": "MD5" },
- * { "algorithm": "SHA256", "encoding": "BASE32" }
- * ]
+ * "other-configs": {
+ * "digester-factory": {
+ * "commons-digester-factory": {
+ * "digests": [
+ * { "algorithm": "MD5" },
+ * { "algorithm": "SHA256", "encoding": "BASE32" }
+ * ],
+ * "skipContainerDocumentDigest": false
+ * }
* }
* }
* }
@@ -47,6 +50,7 @@ import org.apache.tika.digest.DigesterFactory;
public class CommonsDigesterFactory implements DigesterFactory {
private List<DigestDef> digests = new ArrayList<>();
+ private boolean skipContainerDocumentDigest = false;
public CommonsDigesterFactory() {
digests.add(new DigestDef(DigestDef.Algorithm.MD5));
@@ -57,6 +61,15 @@ public class CommonsDigesterFactory implements DigesterFactory {
return new CommonsDigester(digests);
}
+ @Override
+ public boolean isSkipContainerDocumentDigest() {
+ return skipContainerDocumentDigest;
+ }
+
+ public void setSkipContainerDocumentDigest(boolean skipContainerDocumentDigest) {
+ this.skipContainerDocumentDigest = skipContainerDocumentDigest;
+ }
+
public List<DigestDef> getDigests() {
return digests;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 471bc6b377..babd30e79e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -103,8 +103,10 @@ public class AutoDetectParserConfigTest extends TikaTest {
public void testDigests() throws Exception {
//test to make sure that the decorator is only applied once for
//legacy (e.g. not RecursiveParserWrapperHandler) parsing
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p, context);
// SHA256 with BASE32 encoding includes encoding in the key
assertEquals("SO67W5OGGMOFPMFQTHTNL5YU5EQXWPMNEPU7HKOZX2ULHRQICRZA====",
metadataList.get(0).get("X-TIKA:digest:SHA256:BASE32"));
@@ -122,8 +124,10 @@ public class AutoDetectParserConfigTest extends TikaTest {
public void testDigestsSkipContainer() throws Exception {
//test to make sure that the decorator is only applied once for
//legacy (e.g. not RecursiveParserWrapperHandler) parsing
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json").loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p, context);
// SHA256 with BASE32 encoding includes encoding in the key
assertNull(metadataList.get(0).get("X-TIKA:digest:SHA256:BASE32"));
assertNull(metadataList.get(0).get("X-TIKA:digest:MD5"));
@@ -137,8 +141,10 @@ public class AutoDetectParserConfigTest extends TikaTest {
@Test
public void testDigestsEmptyParser() throws Exception {
//TIKA-3939 -- ensure that digesting happens even with EmptyParser
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests-pdf-only.json").loadAutoDetectParser();
- List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests-pdf-only.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p,
context);
assertEquals(1, metadataList.size());
assertEquals("4ef0d3bdb12ba603f4caf7d2e2c6112e",
metadataList.get(0).get("X-TIKA:digest:MD5"));
@@ -150,8 +156,10 @@ public class AutoDetectParserConfigTest extends TikaTest {
public void testContainerZeroBytes() throws Exception {
Path tmp = Files.createTempFile("tika-test", "");
try {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser();
- List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList = getRecursiveMetadata(tmp, p,
context, true);
assertEquals("d41d8cd98f00b204e9800998ecf8427e",
metadataList.get(0).get("X-TIKA:digest:MD5"));
assertEquals("0",
metadataList.get(0).get(Metadata.CONTENT_LENGTH));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index e54ef93a79..01d28b5188 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -567,14 +567,16 @@ public class AutoDetectParserTest extends TikaTest {
//TIKA-4533 -- this tests both that a very large embedded OLE doc
doesn't cause a zip bomb
//exception AND that the sha for the embedded OLE doc is not the sha
for a zero-byte file
String expectedSha =
"bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78";
- AutoDetectParser autoDetectParser = (AutoDetectParser)
TikaLoaderHelper.getLoader("tika-4533.json").loadAutoDetectParser();
+ TikaLoader loader = TikaLoaderHelper.getLoader("tika-4533.json");
+ AutoDetectParser autoDetectParser = (AutoDetectParser)
loader.loadAutoDetectParser();
+ ParseContext parseContext = loader.loadParseContext();
//this models what happens in tika-pipes
if (autoDetectParser.getAutoDetectParserConfig()
.getEmbeddedDocumentExtractorFactory() == null) {
autoDetectParser.getAutoDetectParserConfig()
.setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory());
}
- List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new
ParseContext());
+ List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, parseContext);
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256"));
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 9b054d4ad4..efa124e8f0 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -36,6 +36,7 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -437,13 +438,15 @@ public class RecursiveParserWrapperTest extends TikaTest {
ContentHandlerFactory
contentHandlerFactory,
boolean catchEmbeddedExceptions,
boolean digest) throws Exception {
- ParseContext context = new ParseContext();
+ ParseContext context;
Parser wrapped;
if (digest) {
- wrapped = TikaLoaderHelper.getLoader("tika-config-md5-digest.json")
- .loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-md5-digest.json");
+ wrapped = loader.loadAutoDetectParser();
+ context = loader.loadParseContext();
} else {
wrapped = AUTO_DETECT_PARSER;
+ context = new ParseContext();
}
RecursiveParserWrapper wrapper =
new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java
index bc8174e918..e5fa61735d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/DigestConfigTest.java
@@ -26,8 +26,10 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
/**
@@ -60,10 +62,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testCommonsDigesterBasic() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json")
- .loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
Metadata m = new Metadata();
- getXML("test_recursive_embedded.docx", p, m);
+ getXML("test_recursive_embedded.docx", p, m, context);
assertEquals(EXPECTED_MD2, m.get(P + "MD2"), "MD2 digest should
match");
assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should
match");
@@ -75,9 +78,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testCommonsDigesterWithBase32() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests.json").loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
Metadata m = new Metadata();
- getXML("test_recursive_embedded.docx", p, m);
+ getXML("test_recursive_embedded.docx", p, m, context);
// SHA256 with BASE32 encoding - just verify it exists with
non-default key
assertNotNull(m.get(P + "SHA256:BASE32"),
@@ -89,9 +94,10 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testCommonsDigesterLengthsCalculated() throws Exception {
// This tests that TIKA-4016 added lengths
- Parser p =
TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json")
- .loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-commons-digests-basic.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p, context);
for (Metadata m : metadataList) {
assertNotNull(m.get(Metadata.CONTENT_LENGTH));
}
@@ -99,9 +105,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testCommonsDigesterSkipContainer() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json")
- .loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p);
+ // Tests skipContainerDocumentDigest on the factory (configured in other-configs)
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-digests-skip-container.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p, context);
// Container should NOT have digest
assertNull(metadataList.get(0).get(P + "MD5"),
@@ -118,10 +126,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testBouncyCastleDigesterBasic() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json")
- .loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
Metadata m = new Metadata();
- getXML("test_recursive_embedded.docx", p, m);
+ getXML("test_recursive_embedded.docx", p, m, context);
assertEquals(EXPECTED_MD2, m.get(P + "MD2"), "MD2 digest should
match");
assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should
match");
@@ -133,10 +142,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testBouncyCastleDigesterMultipleAlgorithms() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-bc-digests-multiple.json")
- .loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-bc-digests-multiple.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
Metadata m = new Metadata();
- getXML("test_recursive_embedded.docx", p, m);
+ getXML("test_recursive_embedded.docx", p, m, context);
assertEquals(EXPECTED_MD5, m.get(P + "MD5"), "MD5 digest should
match");
assertEquals(EXPECTED_SHA256, m.get(P + "SHA256"), "SHA256 digest
should match");
@@ -150,10 +160,11 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testBouncyCastleDigesterBase32Encoding() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-bc-digests-base32.json")
- .loadAutoDetectParser();
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-bc-digests-base32.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
Metadata m = new Metadata();
- getXML("test_recursive_embedded.docx", p, m);
+ getXML("test_recursive_embedded.docx", p, m, context);
// Non-default encoding includes encoding in the key
assertEquals(EXPECTED_SHA1_BASE32, m.get(P + "SHA1:BASE32"),
@@ -162,9 +173,10 @@ public class DigestConfigTest extends TikaTest {
@Test
public void testBouncyCastleDigesterLengthsCalculated() throws Exception {
- Parser p =
TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json")
- .loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p);
+ TikaLoader loader =
TikaLoaderHelper.getLoader("tika-config-bc-digests-basic.json");
+ Parser p = loader.loadAutoDetectParser();
+ ParseContext context = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("test_recursive_embedded.docx", p, context);
for (Metadata m : metadataList) {
assertNotNull(m.get(Metadata.CONTENT_LENGTH));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
index a211165f56..52904b6589 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
@@ -25,18 +25,17 @@ import java.util.List;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
-import org.apache.tika.digest.DigestDef;
+import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.digest.SkipContainerDocumentDigest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.parser.digestutils.CommonsDigesterFactory;
/**
* Tests for SkipContainerDocumentDigest functionality with MockParser and
embedded documents.
+ * DigesterFactory is now configured via ParseContext (via other-configs in JSON).
*/
public class SkipContainerDocumentDigestTest extends TikaTest {
@@ -46,14 +45,16 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
@Test
public void testDigestContainerAndEmbedded() throws Exception {
// skipContainerDocumentDigest = false means digest everything
- AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
- config.setSkipContainerDocumentDigest(false);
+ CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setSkipContainerDocumentDigest(false);
AutoDetectParser parser = new AutoDetectParser();
- parser.setAutoDetectParserConfig(config);
- List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml", parser);
+ ParseContext context = new ParseContext();
+ context.set(DigesterFactory.class, factory);
+
+ List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml",
+ parser, new Metadata(), context, false);
// Should have container + embedded
assertEquals(2, metadataList.size());
@@ -70,14 +71,16 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
@Test
public void testSkipContainerDigestOnly() throws Exception {
// skipContainerDocumentDigest = true means skip container, digest
only embedded
- AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
- config.setSkipContainerDocumentDigest(true);
+ CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setSkipContainerDocumentDigest(true);
AutoDetectParser parser = new AutoDetectParser();
- parser.setAutoDetectParserConfig(config);
- List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml", parser);
+ ParseContext context = new ParseContext();
+ context.set(DigesterFactory.class, factory);
+
+ List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml",
+ parser, new Metadata(), context, false);
// Should have container + embedded
assertEquals(2, metadataList.size());
@@ -94,15 +97,14 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
@Test
public void testSkipContainerDocumentDigestMarkerInParseContext() throws
Exception {
// Test that the SkipContainerDocumentDigest marker in ParseContext
works
- AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
- config.setSkipContainerDocumentDigest(false); // Config says digest all
+ CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setSkipContainerDocumentDigest(false); // Factory says digest all
AutoDetectParser parser = new AutoDetectParser();
- parser.setAutoDetectParserConfig(config);
- // Set the marker in ParseContext to override config
+ // Set both factory and the marker in ParseContext - marker overrides factory
ParseContext context = new ParseContext();
+ context.set(DigesterFactory.class, factory);
context.set(SkipContainerDocumentDigest.class,
SkipContainerDocumentDigest.INSTANCE);
List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml",
@@ -111,7 +113,7 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
// Should have container + embedded
assertEquals(2, metadataList.size());
- // Container should NOT have digest because ParseContext marker
overrides config
+ // Container should NOT have digest because ParseContext marker overrides factory
assertNull(metadataList.get(0).get(DIGEST_KEY),
"Container document should NOT have digest when ParseContext
marker is set");
@@ -122,12 +124,8 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
@Test
public void testNoDigesterConfigured() throws Exception {
- // When no digester is configured, no digests should be computed
- AutoDetectParserConfig config = new AutoDetectParserConfig();
- // Don't set any digester
-
+ // When no digester is configured in ParseContext, no digests should be computed
AutoDetectParser parser = new AutoDetectParser();
- parser.setAutoDetectParserConfig(config);
List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml", parser);
@@ -142,20 +140,52 @@ public class SkipContainerDocumentDigestTest extends TikaTest {
}
@Test
- public void testDigestWithFactory() throws Exception {
- // Test using the factory pattern
+ public void testDigestWithFactoryInParseContext() throws Exception {
+ // Test that DigesterFactory in ParseContext is used
CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setSkipContainerDocumentDigest(false);
+
+ AutoDetectParser parser = new AutoDetectParser();
- AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.setDigesterFactory(factory);
- config.setSkipContainerDocumentDigest(false);
+ ParseContext context = new ParseContext();
+ context.set(DigesterFactory.class, factory);
+
+ List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml",
+ parser, new Metadata(), context, false);
+
+ // Should have container + embedded
+ assertEquals(2, metadataList.size());
+
+ // Both should have digest
+ assertNotNull(metadataList.get(0).get(DIGEST_KEY),
+ "Container document should have digest when ParseContext provides factory");
+ assertNotNull(metadataList.get(1).get(DIGEST_KEY),
+ "Embedded document should have digest when ParseContext provides factory");
+ }
+
+ @Test
+ public void testSkipContainerOnFactory() throws Exception {
+ // Test skipContainerDocumentDigest configured on the factory
+ CommonsDigesterFactory factory = new CommonsDigesterFactory();
+ factory.setSkipContainerDocumentDigest(true);
AutoDetectParser parser = new AutoDetectParser();
- parser.setAutoDetectParserConfig(config);
- List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml", parser);
+ ParseContext context = new ParseContext();
+ context.set(DigesterFactory.class, factory);
+
+ List<Metadata> metadataList =
getRecursiveMetadata("mock_embedded_for_digest.xml",
+ parser, new Metadata(), context, false);
// Should have container + embedded
assertEquals(2, metadataList.size());
+
+ // Container should NOT have digest because factory says to skip
+ assertNull(metadataList.get(0).get(DIGEST_KEY),
+ "Container document should NOT have digest when factory.skipContainerDocumentDigest=true");
+
+ // Embedded should have digest
+ assertNotNull(metadataList.get(1).get(DIGEST_KEY),
+ "Embedded document should have digest");
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index dfe1c591ac..8ff5ccbb27 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -148,8 +148,10 @@ public class OOXMLParserTest extends TikaTest {
@Test
public void testDigestTranslator() throws Exception {
- Parser parser = TikaLoader.load(getConfigPath(OOXMLParserTest.class,
"tika-config-digests.json")).loadAutoDetectParser();
- List<Metadata> metadataList =
getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser);
+ TikaLoader loader =
TikaLoader.load(getConfigPath(OOXMLParserTest.class,
"tika-config-digests.json"));
+ Parser parser = loader.loadAutoDetectParser();
+ ParseContext parseContext = loader.loadParseContext();
+ List<Metadata> metadataList =
getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser, parseContext);
assertEquals(4, metadataList.size());
for (Metadata m : metadataList) {
assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32"));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
index 12b49d6267..76416f19d7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
@@ -3,8 +3,10 @@
"maximumCompressionRatio": 100,
"maximumDepth": 100,
"maximumPackageEntryDepth": 100,
- "throwOnZeroBytes": false,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "SHA256" }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
index 5ac209517f..f9e04fe037 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
@@ -1,13 +1,15 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"bouncy-castle-digester-factory": {
"digests": [
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
index 53bfd01732..8d4a9db55f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
@@ -1,7 +1,10 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"bouncy-castle-digester-factory": {
"digests": [
{ "algorithm": "MD2" },
@@ -12,7 +15,6 @@
{ "algorithm": "SHA512" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
index b2e23ad974..d8dcaba9a3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
@@ -1,7 +1,10 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"bouncy-castle-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
@@ -11,7 +14,6 @@
{ "algorithm": "SHA3_512" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
index c37e6965f2..7256297b30 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
@@ -1,7 +1,10 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD2" },
@@ -12,7 +15,6 @@
{ "algorithm": "SHA512" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index 60825fe974..4dc5242e60 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -10,14 +10,16 @@
],
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
index 8ed562166a..c6676b29be 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
@@ -1,15 +1,17 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": true,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
- ]
+ ],
+ "skipContainerDocumentDigest": true
}
- },
- "throwOnZeroBytes": false
+ }
}
-}
\ No newline at end of file
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
index 50bbd90b99..360b4f5170 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
@@ -1,14 +1,16 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
-}
\ No newline at end of file
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
index a13a80c7db..3aa9e04375 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
@@ -1,6 +1,6 @@
{
- "auto-detect-parser": {
- "digesterFactory": {
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
index c21c7f3f2d..28c5763f0b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
@@ -22,10 +22,11 @@
}
},
"contentHandlerDecoratorFactory":
"upcasing-content-handler-decorator-factory",
- "skipContainerDocumentDigest": false,
- "digesterFactory": {
- "commons-digester-factory": {}
- },
"throwOnZeroBytes": true
+ },
+ "other-configs": {
+ "digester-factory": {
+ "commons-digester-factory": {}
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
index 48314a2ab4..6a1e6a925a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
@@ -1,18 +1,18 @@
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": true,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
- ]
+ ],
+ "skipContainerDocumentDigest": true
}
},
- "throwOnZeroBytes": false
- },
- "other-configs": {
"metadata-write-limiter-factory": {
"standard-metadata-limiter-factory": {
"includeFields": [
@@ -23,4 +23,3 @@
}
}
}
-
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
index 457fe11168..a11014478c 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java
@@ -62,12 +62,11 @@ class EmitHandler {
this.directEmitThresholdBytes = directEmitThresholdBytes;
}
- public PipesResult emitParseData(FetchEmitTuple t,
MetadataListAndEmbeddedBytes parseData) {
+ public PipesResult emitParseData(FetchEmitTuple t,
MetadataListAndEmbeddedBytes parseData, ParseContext parseContext) {
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply the metadata filter after we pull out the stacktrace
- filterMetadata(t, parseData);
- ParseContext parseContext = t.getParseContext();
+ filterMetadata(parseData, parseContext);
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException =
t.getOnParseException();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
parseContext.get(EmbeddedDocumentBytesConfig.class);
if (StringUtils.isBlank(stack) ||
@@ -200,8 +199,8 @@ class EmitHandler {
}
}
- private void filterMetadata(FetchEmitTuple t, MetadataListAndEmbeddedBytes
parseData) {
- MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
+ private void filterMetadata(MetadataListAndEmbeddedBytes parseData,
ParseContext parseContext) {
+ MetadataFilter filter = parseContext.get(MetadataFilter.class);
if (filter == null) {
filter = defaultMetadataFilter;
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java
index 98055b639b..c14ee24656 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/FetchHandler.java
@@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.fetcher.Fetcher;
@@ -40,14 +41,14 @@ class FetchHandler {
this.fetcherManager = fetcherManager;
}
- public TisOrResult fetch(FetchEmitTuple fetchEmitTuple, Metadata metadata)
{
+ public TisOrResult fetch(FetchEmitTuple fetchEmitTuple, Metadata metadata,
ParseContext parseContext) {
FetcherOrResult fetcherResult = getFetcher(fetchEmitTuple);
if (fetcherResult.pipesResult != null) {
return new TisOrResult(null, fetcherResult.pipesResult);
}
try {
TikaInputStream tis = fetcherResult.fetcher.fetch(
- fetchEmitTuple.getFetchKey().getFetchKey(), metadata,
fetchEmitTuple.getParseContext());
+ fetchEmitTuple.getFetchKey().getFetchKey(), metadata,
parseContext);
return new TisOrResult(tis, null);
} catch (IOException | TikaException e) {
return new TisOrResult(null, new
PipesResult(PipesResult.RESULT_STATUS.FETCH_EXCEPTION,
ExceptionUtils.getStackTrace(e)));
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index b6d31d0f00..bbcb21b4e0 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -31,6 +31,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.detect.Detector;
import org.apache.tika.digest.Digester;
+import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.digest.SkipContainerDocumentDigest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
@@ -58,7 +59,6 @@ class ParseHandler {
private static final Logger LOG =
LoggerFactory.getLogger(ParseHandler.class);
private final Detector detector;
- private final Digester digester;
private final ArrayBlockingQueue<Metadata> intermediateResult;
private final CountDownLatch countDownLatch;
private final AutoDetectParser autoDetectParser;
@@ -67,12 +67,11 @@ class ParseHandler {
private final ParseMode defaultParseMode;
- ParseHandler(Detector detector, Digester digester,
ArrayBlockingQueue<Metadata> intermediateResult,
+ ParseHandler(Detector detector, ArrayBlockingQueue<Metadata>
intermediateResult,
CountDownLatch countDownLatch, AutoDetectParser
autoDetectParser,
RecursiveParserWrapper recursiveParserWrapper,
ContentHandlerFactory defaultContentHandlerFactory,
ParseMode defaultParseMode) {
this.detector = detector;
- this.digester = digester;
this.intermediateResult = intermediateResult;
this.countDownLatch = countDownLatch;
this.autoDetectParser = autoDetectParser;
@@ -124,8 +123,11 @@ class ParseHandler {
private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata
metadata,
ParseContext parseContext) {
- if (digester != null) {
+ // Get DigesterFactory from ParseContext (configured via other-configs)
+ DigesterFactory digesterFactory =
parseContext.get(DigesterFactory.class);
+ if (digesterFactory != null &&
!digesterFactory.isSkipContainerDocumentDigest()) {
try {
+ Digester digester = digesterFactory.build();
digester.digest(tis, metadata, parseContext);
// Mark that we've already digested the container document so
AutoDetectParser
// won't re-digest it during parsing
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
index 8cf9308577..5c6e551f50 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
@@ -53,7 +53,6 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.Detector;
-import org.apache.tika.digest.Digester;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.RUnpackExtractorFactory;
@@ -61,6 +60,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.PipesResult;
@@ -132,7 +132,6 @@ public class PipesServer implements AutoCloseable {
return (byte) (ordinal() + 1);
}
}
- private Digester digester;
private Detector detector;
@@ -300,14 +299,16 @@ public class PipesServer implements AutoCloseable {
CountDownLatch countDownLatch = new CountDownLatch(1);
FetchEmitTuple fetchEmitTuple = readFetchEmitTuple();
+ // Create merged ParseContext: defaults from tika-config +
request overrides
+ ParseContext mergedContext =
createMergedParseContext(fetchEmitTuple.getParseContext());
// Resolve friendly-named configs in ParseContext to
actual objects
-
ParseContextUtils.resolveAll(fetchEmitTuple.getParseContext(),
getClass().getClassLoader());
+ ParseContextUtils.resolveAll(mergedContext,
getClass().getClassLoader());
- PipesWorker pipesWorker =
getPipesWorker(intermediateResult, fetchEmitTuple, countDownLatch);
+ PipesWorker pipesWorker =
getPipesWorker(intermediateResult, fetchEmitTuple, mergedContext,
countDownLatch);
executorCompletionService.submit(pipesWorker);
//set progress counter
try {
- loopUntilDone(fetchEmitTuple,
executorCompletionService, intermediateResult, countDownLatch);
+ loopUntilDone(fetchEmitTuple, mergedContext,
executorCompletionService, intermediateResult, countDownLatch);
} catch (Throwable t) {
LOG.error("Serious problem: {}",
HexFormat.of().formatHex(new byte[]{(byte)request}), t);
}
@@ -339,21 +340,23 @@ public class PipesServer implements AutoCloseable {
}
}
- private PipesWorker getPipesWorker(ArrayBlockingQueue<Metadata>
intermediateResult, FetchEmitTuple fetchEmitTuple, CountDownLatch
countDownLatch) {
+ private PipesWorker getPipesWorker(ArrayBlockingQueue<Metadata>
intermediateResult, FetchEmitTuple fetchEmitTuple,
+ ParseContext mergedContext,
CountDownLatch countDownLatch) {
FetchHandler fetchHandler = new FetchHandler(fetcherManager);
- ParseHandler parseHandler = new ParseHandler(detector, digester,
intermediateResult, countDownLatch, autoDetectParser,
+ ParseHandler parseHandler = new ParseHandler(detector,
intermediateResult, countDownLatch, autoDetectParser,
rMetaParser, defaultContentHandlerFactory,
pipesConfig.getParseMode());
Long thresholdBytes =
pipesConfig.getEmitStrategy().getThresholdBytes();
long threshold = (thresholdBytes != null) ? thresholdBytes :
EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES;
EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter,
emitStrategy, emitterManager, threshold);
- PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple,
autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler,
defaultMetadataWriteLimiterFactory);
- return pipesWorker;
+ return new PipesWorker(fetchEmitTuple, mergedContext,
autoDetectParser, emitterManager,
+ fetchHandler, parseHandler, emitHandler,
defaultMetadataWriteLimiterFactory);
}
- private void loopUntilDone(FetchEmitTuple fetchEmitTuple,
ExecutorCompletionService<PipesResult> executorCompletionService,
+ private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext
mergedContext,
+ ExecutorCompletionService<PipesResult>
executorCompletionService,
ArrayBlockingQueue<Metadata>
intermediateResult, CountDownLatch countDownLatch) throws InterruptedException,
IOException {
Instant start = Instant.now();
- long timeoutMillis = PipesClient.getTimeoutMillis(pipesConfig,
fetchEmitTuple.getParseContext());
+ long timeoutMillis = PipesClient.getTimeoutMillis(pipesConfig,
mergedContext);
long mockProgressCounter = 0;
boolean wroteIntermediateResult = false;
@@ -468,21 +471,6 @@ public class PipesServer implements AutoCloseable {
this.fetcherManager = FetcherManager.load(tikaPluginManager,
tikaJsonConfig, true, configStore);
this.emitterManager = EmitterManager.load(tikaPluginManager,
tikaJsonConfig, true, configStore);
this.autoDetectParser = (AutoDetectParser)
tikaLoader.loadAutoDetectParser();
- // Get the digester for pre-parse digesting of container documents.
- // If user configured skipContainerDocumentDigest=false (the default),
PipesServer
- // digests the container document before parsing to ensure we have the
digest even
- // if parsing times out. The SkipContainerDocumentDigest marker is
then added to
- // ParseContext to prevent AutoDetectParser from re-digesting the
container.
- // If user configured skipContainerDocumentDigest=true, we don't
digest containers at all.
- boolean skipContainerDigest =
autoDetectParser.getAutoDetectParserConfig()
- .isSkipContainerDocumentDigest();
- if (!skipContainerDigest) {
- // User wants container documents digested - we'll do it in
ParseHandler before parse
- this.digester =
autoDetectParser.getAutoDetectParserConfig().digester();
- } else {
- // User doesn't want container documents digested
- this.digester = null;
- }
// If the user hasn't configured an embedded document extractor, set
up the
// RUnpackExtractorFactory
@@ -494,6 +482,23 @@ public class PipesServer implements AutoCloseable {
}
+ /**
+ * Creates a merged ParseContext with defaults from tika-config overlaid
with request values.
+ * Request values take precedence over defaults.
+ * <p>
+ * Creates a fresh context each time to avoid shared state between
requests.
+ *
+ * @param requestContext the ParseContext from FetchEmitTuple
+ * @return a new ParseContext with defaults + request overrides
+ */
+ private ParseContext createMergedParseContext(ParseContext requestContext)
throws TikaConfigException {
+ // Create fresh context with defaults from tika-config (e.g.,
DigesterFactory)
+ ParseContext mergedContext = tikaLoader.loadParseContext();
+ // Overlay request's values (request takes precedence)
+ mergedContext.copyFrom(requestContext);
+ return mergedContext;
+ }
+
private ConfigStore createConfigStore(PipesConfig pipesConfig,
TikaPluginManager tikaPluginManager) throws TikaException {
String configStoreType = pipesConfig.getConfigStoreType();
String configStoreParams = pipesConfig.getConfigStoreParams();
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index 18b83192ac..df54ea0042 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -52,6 +52,7 @@ class PipesWorker implements Callable<PipesResult> {
private static final Logger LOG =
LoggerFactory.getLogger(PipesWorker.class);
private final FetchEmitTuple fetchEmitTuple;
+ private final ParseContext parseContext;
private final AutoDetectParser autoDetectParser;
private final EmitterManager emitterManager;
private final FetchHandler fetchHandler;
@@ -59,9 +60,11 @@ class PipesWorker implements Callable<PipesResult> {
private final EmitHandler emitHandler;
private final MetadataWriteLimiterFactory
defaultMetadataWriteLimiterFactory;
- public PipesWorker(FetchEmitTuple fetchEmitTuple, AutoDetectParser
autoDetectParser, EmitterManager emitterManager, FetchHandler fetchHandler,
ParseHandler parseHandler,
+ public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext
parseContext, AutoDetectParser autoDetectParser,
+ EmitterManager emitterManager, FetchHandler
fetchHandler, ParseHandler parseHandler,
EmitHandler emitHandler, MetadataWriteLimiterFactory
defaultMetadataWriteLimiterFactory) {
this.fetchEmitTuple = fetchEmitTuple;
+ this.parseContext = parseContext;
this.autoDetectParser = autoDetectParser;
this.emitterManager = emitterManager;
this.fetchHandler = fetchHandler;
@@ -86,7 +89,7 @@ class PipesWorker implements Callable<PipesResult> {
if (parseData == null ||
metadataIsEmpty(parseData.getMetadataList())) {
return PipesResults.EMPTY_OUTPUT;
}
- return emitHandler.emitParseData(fetchEmitTuple, parseData);
+ return emitHandler.emitParseData(fetchEmitTuple, parseData,
parseContext);
} finally {
if (parseData != null && parseData.hasEmbeddedDocumentByteStore()
&&
parseData.getEmbeddedDocumentBytesHandler() instanceof
Closeable) {
@@ -109,22 +112,23 @@ class PipesWorker implements Callable<PipesResult> {
//start a new metadata object to gather info from the fetch process
//we want to isolate and not touch the metadata sent into the
fetchEmitTuple
//so that we can inject it after the filter at the very end
- ParseContext parseContext = null;
+ ParseContext localContext = null;
try {
- parseContext = setupParseContext(fetchEmitTuple);
+ localContext = setupParseContext();
} catch (IOException e) {
LOG.warn("fetcher initialization exception id={}",
fetchEmitTuple.getId(), e);
return new ParseDataOrPipesResult(null,
new
PipesResult(PipesResult.RESULT_STATUS.FETCHER_INITIALIZATION_EXCEPTION,
ExceptionUtils.getStackTrace(e)));
}
- Metadata metadata = parseContext.newMetadata();
- FetchHandler.TisOrResult tisOrResult =
fetchHandler.fetch(fetchEmitTuple, metadata);
+ // Use newMetadata() to apply any configured write limits
+ Metadata metadata = localContext.newMetadata();
+ FetchHandler.TisOrResult tisOrResult =
fetchHandler.fetch(fetchEmitTuple, metadata, localContext);
if (tisOrResult.pipesResult() != null) {
return new ParseDataOrPipesResult(null, tisOrResult.pipesResult());
}
try (TikaInputStream tis = tisOrResult.tis()) {
- return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata,
parseContext);
+ return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata,
localContext);
} catch (SecurityException e) {
LOG.error("security exception id={}", fetchEmitTuple.getId(), e);
throw e;
@@ -137,8 +141,7 @@ class PipesWorker implements Callable<PipesResult> {
- private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple)
throws TikaException, IOException {
- ParseContext parseContext = fetchEmitTuple.getParseContext();
+ private ParseContext setupParseContext() throws TikaException, IOException
{
// ContentHandlerFactory and ParseMode are retrieved from ParseContext
in ParseHandler.
// They are set in ParseContext from PipesConfig loaded via TikaLoader
at startup.
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
index 755c345dfa..98573b46fb 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
@@ -46,11 +46,12 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": false,
- "digesterFactory": {
- "mock-digester-factory": {}
- },
"throwOnZeroBytes": false
},
+ "other-configs": {
+ "digester-factory": {
+ "mock-digester-factory": {}
+ }
+ },
"plugin-roots": "PLUGINS_PATHS"
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
index 2e0748f854..5308be9a1c 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
@@ -45,11 +45,12 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": false,
- "digesterFactory": {
- "mock-digester-factory": {}
- },
"throwOnZeroBytes": false
},
+ "other-configs": {
+ "digester-factory": {
+ "mock-digester-factory": {}
+ }
+ },
"plugin-roots": "PLUGINS_PATHS"
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
index 07a78edf3c..f8d5d3464b 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
@@ -46,10 +46,6 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": false,
- "digesterFactory": {
- "mock-digester-factory": {}
- },
"embeddedDocumentExtractorFactory": {
"runpack-extractor-factory": {
"writeFileNameToContent": false,
@@ -58,5 +54,10 @@
},
"throwOnZeroBytes": false
},
+ "other-configs": {
+ "digester-factory": {
+ "mock-digester-factory": {}
+ }
+ },
"plugin-roots": "PLUGINS_PATHS"
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
index 6498c15a7a..c9189c2ae2 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
@@ -41,11 +41,12 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": false,
- "digesterFactory": {
- "mock-digester-factory": {}
- },
"throwOnZeroBytes": false
},
+ "other-configs": {
+ "digester-factory": {
+ "mock-digester-factory": {}
+ }
+ },
"plugin-roots": "PLUGINS_PATHS"
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json
index b1c41dc063..c8dfbacc7c 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json
@@ -46,7 +46,6 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "skipContainerDocumentDigest": true,
"throwOnZeroBytes": false
},
"other-configs": {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index b527532e5b..55f6ff0993 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -36,17 +36,20 @@ import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
+import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.renderer.CompositeRenderer;
import org.apache.tika.renderer.Renderer;
@@ -376,6 +379,46 @@ public class TikaLoader {
return autoDetectParser;
}
+ /**
+ * Loads and returns a ParseContext populated with components from the
"other-configs" section.
+ * <p>
+ * This method loads components that should be passed via ParseContext,
such as:
+ * <ul>
+ * <li>DigesterFactory (from "digester-factory")</li>
+ * <li>MetadataWriteLimiterFactory (from
"metadata-write-limiter-factory")</li>
+ * </ul>
+ * <p>
+ * Use this method when you need a pre-configured ParseContext for parsing
operations.
+ *
+ * <p>Example usage:
+ * <pre>
+ * TikaLoader loader = TikaLoader.load(configPath);
+ * Parser parser = loader.loadAutoDetectParser();
+ * ParseContext context = loader.loadParseContext();
+ * parser.parse(stream, handler, metadata, context);
+ * </pre>
+ *
+ * @return a ParseContext populated with configured components
+ * @throws TikaConfigException if loading fails
+ */
+ public ParseContext loadParseContext() throws TikaConfigException {
+ ParseContext context = new ParseContext();
+
+ // Load DigesterFactory from other-configs if present
+ DigesterFactory digesterFactory = configs().load("digester-factory",
DigesterFactory.class);
+ if (digesterFactory != null) {
+ context.set(DigesterFactory.class, digesterFactory);
+ }
+
+ // Load MetadataWriteLimiterFactory from other-configs if present
+ MetadataWriteLimiterFactory metadataWriteLimiterFactory =
configs().load(MetadataWriteLimiterFactory.class);
+ if (metadataWriteLimiterFactory != null) {
+ context.set(MetadataWriteLimiterFactory.class,
metadataWriteLimiterFactory);
+ }
+
+ return context;
+ }
+
/**
* Returns a ConfigLoader for loading simple configuration objects.
* <p>
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
index c84676c14b..7101105ba8 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
@@ -77,6 +77,7 @@ public class MetadataResource {
@Context HttpHeaders httpHeaders,
@Context UriInfo info) throws Exception {
+ // Load default context from config, then overlay with request config
ParseContext context = TikaResource.createParseContext();
Metadata metadata = context.newMetadata();
try (TikaInputStream tis = setupMultipartConfig(attachments, metadata,
context)) {
@@ -171,6 +172,7 @@ public class MetadataResource {
protected Metadata parseMetadata(TikaInputStream tis, Metadata metadata,
MultivaluedMap<String, String> httpHeaders, UriInfo info)
throws IOException, TikaConfigException {
+ // Load default context from config (includes DigesterFactory from
other-configs)
final ParseContext context = TikaResource.createParseContext();
Parser parser = TikaResource.createParser();
fillMetadata(parser, metadata, httpHeaders);
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 968bd83f99..31a5817ad8 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -115,18 +115,19 @@ public class TikaResource {
}
/**
- * Creates a new ParseContext with the default MetadataWriteLimiterFactory
set if configured.
- * This should be used instead of {@code createParseContext()} to ensure
metadata limits
- * are applied when configured.
+ * Creates a new ParseContext with defaults loaded from tika-config.
+ * This loads components from "other-configs" such as DigesterFactory and
MetadataWriteLimiterFactory.
*
* @return a new ParseContext with defaults applied
*/
public static ParseContext createParseContext() {
- ParseContext context = new ParseContext();
- if (DEFAULT_METADATA_WRITE_LIMITER_FACTORY != null) {
- context.set(MetadataWriteLimiterFactory.class,
DEFAULT_METADATA_WRITE_LIMITER_FACTORY);
+ try {
+ return TIKA_LOADER.loadParseContext();
+ } catch (TikaConfigException e) {
+ // Fall back to empty context if loading fails
+ LOG.warn("Failed to load ParseContext from config, using empty
context", e);
+ return new ParseContext();
}
- return context;
}
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index d73c002546..0cb335676c 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -79,14 +79,16 @@ public abstract class CXFTestBase {
{
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" }
]
}
- },
- "throwOnZeroBytes": false
+ }
}
}
""";
diff --git
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
index 7c301943d2..434b61e626 100644
---
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
@@ -46,15 +46,17 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
},
"plugin-roots": "PLUGINS_PATHS"
-}
\ No newline at end of file
+}
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
index bcae4fb7e6..06510b1a1e 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
@@ -48,15 +48,17 @@
},
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
},
"plugin-roots": "PLUGINS_PATHS"
-}
\ No newline at end of file
+}
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
index fbe51d01c4..dc25f3ae0e 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
@@ -11,15 +11,17 @@
],
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
},
"fetchers": {
"file-system-fetcher": {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
index 930334b088..665442b733 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
@@ -16,15 +16,17 @@
],
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
},
"fetchers": {
"file-system-fetcher": {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
index 3a4d88fb69..51e7806e81 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
@@ -16,15 +16,17 @@
],
"auto-detect-parser": {
"outputThreshold": 1000000,
- "digesterFactory": {
+ "throwOnZeroBytes": false
+ },
+ "other-configs": {
+ "digester-factory": {
"commons-digester-factory": {
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
}
- },
- "throwOnZeroBytes": false
+ }
},
"fetchers": {
"file-system-fetcher": {