This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4568
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b6d25ea55721dd24b89b6e51a4476a4894606b2e
Author: tallison <[email protected]>
AuthorDate: Fri Dec 12 11:02:05 2025 -0500

    TIKA-4568 -- deprecate DigestingParser
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  4 +-
 .../org/apache/tika/parser/AutoDetectParser.java   | 47 +++++++++-------------
 .../apache/tika/parser/AutoDetectParserConfig.java | 40 ++++++++++++++++++
 .../org/apache/tika/parser/DigestingParser.java    |  8 ++++
 .../apache/tika/parser/AutoDetectParserTest.java   |  1 +
 .../parser/BouncyCastleDigestingParserTest.java    |  1 +
 .../apache/tika/parser/DigestingParserTest.java    |  1 +
 .../tika/parser/RecursiveParserWrapperTest.java    |  1 +
 .../apache/tika/pipes/core/server/PipesServer.java | 10 +++--
 .../server/core/resource/UnpackerResource.java     |  9 -----
 10 files changed, 80 insertions(+), 42 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index adb708c2dc..8ff46a435e 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -733,8 +733,8 @@ public class TikaCLI {
             parser = new NetworkParser(networkURI);
         } else {
             parser = tikaLoader.loadAutoDetectParser();
-            if (digester != null) {
-                parser = new DigestingParser(parser, digester, false);
+            if (digester != null && parser instanceof AutoDetectParser) {
+                ((AutoDetectParser) 
parser).getAutoDetectParserConfig().digester(digester);
             }
         }
         detector = tikaLoader.loadDetectors();
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index c1668cc303..a7117dc1ed 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.digest.DigestHelper;
 import org.apache.tika.sax.SecureContentHandler;
 
 public class AutoDetectParser extends CompositeParser {
@@ -90,46 +91,32 @@ public class AutoDetectParser extends CompositeParser {
         setAutoDetectParserConfig(AutoDetectParserConfig.DEFAULT);
     }
 
-    public AutoDetectParser(MediaTypeRegistry mediaTypeRegistry, Parser 
parser, Detector detector, AutoDetectParserConfig autoDetectParserConfig) {
+    public AutoDetectParser(MediaTypeRegistry mediaTypeRegistry, Parser 
parser, Detector detector,
+                             AutoDetectParserConfig autoDetectParserConfig) {
         super(mediaTypeRegistry, parser);
-        setFallback(buildFallbackParser(parser, 
autoDetectParserConfig.getDigesterFactory()));
+        setFallback(getFallbackFrom(parser));
         setDetector(detector);
         setAutoDetectParserConfig(autoDetectParserConfig);
     }
 
-    public static Parser build(CompositeParser parser, Detector detector, 
AutoDetectParserConfig autoDetectParserConfig) {
-        return new AutoDetectParser(parser.getMediaTypeRegistry(), 
getParser(parser, autoDetectParserConfig.getDigesterFactory()), detector, 
autoDetectParserConfig);
+    public static Parser build(CompositeParser parser, Detector detector,
+                               AutoDetectParserConfig autoDetectParserConfig) {
+        return new AutoDetectParser(parser.getMediaTypeRegistry(), parser, 
detector,
+                autoDetectParserConfig);
     }
 
     public AutoDetectParser(TikaConfig config) {
-        super(config.getMediaTypeRegistry(), getParser(config.getParser(), 
config.getAutoDetectParserConfig().getDigesterFactory()));
-        setFallback(buildFallbackParser(config.getParser(), 
config.getAutoDetectParserConfig().getDigesterFactory()));
+        super(config.getMediaTypeRegistry(), config.getParser());
+        setFallback(getFallbackFrom(config.getParser()));
         setDetector(config.getDetector());
         setAutoDetectParserConfig(config.getAutoDetectParserConfig());
     }
 
-    private static Parser buildFallbackParser(Parser defaultParser, 
DigestingParser.DigesterFactory digesterFactory) {
-        Parser fallback = null;
-        Parser p = defaultParser;
-        if (p instanceof DefaultParser) {
-            fallback = ((DefaultParser)p).getFallback();
-        } else {
-            fallback = new EmptyParser();
+    private static Parser getFallbackFrom(Parser defaultParser) {
+        if (defaultParser instanceof DefaultParser) {
+            return ((DefaultParser) defaultParser).getFallback();
         }
-
-        if (digesterFactory == null) {
-            return fallback;
-        } else {
-            return new DigestingParser(fallback, digesterFactory.build(), 
digesterFactory.isSkipContainerDocument());
-        }
-
-    }
-
-    private static Parser getParser(Parser defaultParser, 
DigestingParser.DigesterFactory digesterFactory) {
-        if (digesterFactory == null) {
-            return defaultParser;
-        }
-        return new DigestingParser(defaultParser,digesterFactory.build(), 
digesterFactory.isSkipContainerDocument());
+        return new EmptyParser();
     }
 
     /**
@@ -181,6 +168,12 @@ public class AutoDetectParser extends CompositeParser {
             //figure out if we should spool to disk
             maybeSpool(tis, autoDetectParserConfig, metadata);
 
+            // Compute digests before type detection if configured
+            DigestHelper.maybeDigest(tis,
+                    autoDetectParserConfig.digester(),
+                    autoDetectParserConfig.isSkipContainerDocument(),
+                    metadata, context, tmp);
+
             // Automatically detect the MIME type of the document
             MediaType type = detector.detect(tis, metadata);
             //update CONTENT_TYPE as long as it wasn't set by parser override
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index a8bcc17686..76ad80ac17 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -95,6 +95,9 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
 
     private DigestingParser.DigesterFactory digesterFactory = null;
 
+    // Lazily built digester from the factory
+    private transient DigestingParser.Digester digester = null;
+
     private boolean throwOnZeroBytes = true;
 
     /**
@@ -195,6 +198,43 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
         return this.digesterFactory;
     }
 
+    /**
+     * Returns the Digester, lazily building it from the factory if needed.
+     * <p>
+     * Note: This method is intentionally not named getDigester() to avoid
+     * Jackson treating it as a bean property during serialization.
+     *
+     * @return the Digester, or null if no factory is configured
+     */
+    public DigestingParser.Digester digester() {
+        if (digester == null && digesterFactory != null) {
+            digester = digesterFactory.build();
+        }
+        return digester;
+    }
+
+    /**
+     * Sets the digester directly. This is useful for programmatic configuration
+     * (e.g., from command-line arguments) when you don't have a DigesterFactory.
+     * <p>
+     * Note: This method is intentionally not named setDigester() to avoid
+     * Jackson treating it as a bean property during deserialization.
+     *
+     * @param digester the digester to use
+     */
+    public void digester(DigestingParser.Digester digester) {
+        this.digester = digester;
+    }
+
+    /**
+     * Returns whether to skip digesting for container (top-level) documents.
+     *
+     * @return true if container documents should be skipped, false otherwise
+     */
+    public boolean isSkipContainerDocument() {
+        return digesterFactory != null && 
digesterFactory.isSkipContainerDocument();
+    }
+
     public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
         this.throwOnZeroBytes = throwOnZeroBytes;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 054d4234ff..dd1329223d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -34,6 +34,14 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 
+/**
+ * A parser decorator that computes digests of the parsed content.
+ *
+ * @deprecated Since 3.x. Use {@link AutoDetectParserConfig#setDigesterFactory(DigesterFactory)}
+ * to configure digesting. The AutoDetectParser now calls digesting directly in its parse method.
+ * The interfaces {@link Digester}, {@link DigesterFactory}, and {@link Encoder} are still in use.
+ */
+@Deprecated
 public class DigestingParser extends ParserDecorator {
 
     private final EmbeddedStreamTranslator embeddedStreamTranslator = new 
DefaultEmbeddedStreamTranslator();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 044f9c6724..c6677001c8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -563,6 +563,7 @@ public class AutoDetectParserTest extends TikaTest {
         }
     }
 
+    @SuppressWarnings("deprecation")
     @Test
     public void testDigestingOpenContainers() throws Exception {
         //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
index 9971b7e039..1a31426e61 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
@@ -44,6 +44,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.digestutils.BouncyCastleDigester;
 
 
+@SuppressWarnings("deprecation")
 public class BouncyCastleDigestingParserTest extends TikaTest {
 
     private final static String P = TikaCoreProperties.TIKA_META_PREFIX + 
"digest" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index e6267005b7..859703cffe 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -45,6 +45,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.digestutils.CommonsDigester;
 
 
+@SuppressWarnings("deprecation")
 public class DigestingParserTest extends TikaTest {
 
     private final static String P = TikaCoreProperties.TIKA_META_PREFIX + 
"digest" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 816dfff8af..1c5fa4cf60 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -432,6 +432,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
         }
     }
 
+    @SuppressWarnings("deprecation")
     private List<Metadata> getMetadata(Metadata metadata,
                                        ContentHandlerFactory 
contentHandlerFactory,
                                        boolean catchEmbeddedExceptions,
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
index 974afad089..48b8dacac8 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
@@ -459,10 +459,12 @@ public class PipesServer implements AutoCloseable {
         // Always initialize emitters to support runtime overrides via ParseContext
         this.emitterManager = EmitterManager.load(tikaPluginManager, 
tikaJsonConfig);
         this.autoDetectParser = (AutoDetectParser) 
tikaLoader.loadAutoDetectParser();
-        if (autoDetectParser.getAutoDetectParserConfig()
-                .getDigesterFactory() != null) {
-            this.digester = autoDetectParser.getAutoDetectParserConfig()
-                    .getDigesterFactory().build();
+        // Get the digester for pre-parse digesting of container documents.
+        // The AutoDetectParser now handles digesting internally via DigestHelper,
+        // but PipesServer does its own pre-parse digesting for container documents.
+        // Setting skipContainerDocument(true) ensures AutoDetectParser only digests embedded docs.
+        this.digester = 
autoDetectParser.getAutoDetectParserConfig().digester();
+        if (digester != null) {
             //override this value because we'll be digesting before parse
             autoDetectParser.getAutoDetectParserConfig().getDigesterFactory()
                     .setSkipContainerDocument(true);
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index 74918ee9b6..3739ae97ce 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -65,7 +65,6 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -144,10 +143,6 @@ public class UnpackerResource {
         long unpackMaxBytes = DEFAULT_MAX_ATTACHMENT_BYTES;
 
         Parser parser = TikaResource.createParser();
-        if (parser instanceof DigestingParser) {
-            //no need to digest for unwrapping
-            parser = ((DigestingParser) parser).getWrappedParser();
-        }
 
         TikaResource.logRequest(LOG, "/unpack/config", metadata);
         //even though we aren't currently parsing embedded documents,
@@ -205,10 +200,6 @@ public class UnpackerResource {
             }
         }
         Parser parser = TikaResource.createParser();
-        if (parser instanceof DigestingParser) {
-            //no need to digest for unwrapping
-            parser = ((DigestingParser) parser).getWrappedParser();
-        }
         fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
 
         TikaResource.logRequest(LOG, "/unpack", metadata);

Reply via email to