This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4568 in repository https://gitbox.apache.org/repos/asf/tika.git
commit b6d25ea55721dd24b89b6e51a4476a4894606b2e Author: tallison <[email protected]> AuthorDate: Fri Dec 12 11:02:05 2025 -0500 TIKA-4568 -- deprecate DigestingParser --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +- .../org/apache/tika/parser/AutoDetectParser.java | 47 +++++++++------------- .../apache/tika/parser/AutoDetectParserConfig.java | 40 ++++++++++++++++++ .../org/apache/tika/parser/DigestingParser.java | 8 ++++ .../apache/tika/parser/AutoDetectParserTest.java | 1 + .../parser/BouncyCastleDigestingParserTest.java | 1 + .../apache/tika/parser/DigestingParserTest.java | 1 + .../tika/parser/RecursiveParserWrapperTest.java | 1 + .../apache/tika/pipes/core/server/PipesServer.java | 10 +++-- .../server/core/resource/UnpackerResource.java | 9 ----- 10 files changed, 80 insertions(+), 42 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index adb708c2dc..8ff46a435e 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -733,8 +733,8 @@ public class TikaCLI { parser = new NetworkParser(networkURI); } else { parser = tikaLoader.loadAutoDetectParser(); - if (digester != null) { - parser = new DigestingParser(parser, digester, false); + if (digester != null && parser instanceof AutoDetectParser) { + ((AutoDetectParser) parser).getAutoDetectParserConfig().digester(digester); } } detector = tikaLoader.loadDetectors(); diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index c1668cc303..a7117dc1ed 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; +import org.apache.tika.parser.digest.DigestHelper; import org.apache.tika.sax.SecureContentHandler; public class AutoDetectParser extends CompositeParser { @@ -90,46 +91,32 @@ public class AutoDetectParser extends CompositeParser { setAutoDetectParserConfig(AutoDetectParserConfig.DEFAULT); } - public AutoDetectParser(MediaTypeRegistry mediaTypeRegistry, Parser parser, Detector detector, AutoDetectParserConfig autoDetectParserConfig) { + public AutoDetectParser(MediaTypeRegistry mediaTypeRegistry, Parser parser, Detector detector, + AutoDetectParserConfig autoDetectParserConfig) { super(mediaTypeRegistry, parser); - setFallback(buildFallbackParser(parser, autoDetectParserConfig.getDigesterFactory())); + setFallback(getFallbackFrom(parser)); setDetector(detector); setAutoDetectParserConfig(autoDetectParserConfig); } - public static Parser build(CompositeParser parser, Detector detector, AutoDetectParserConfig autoDetectParserConfig) { - return new AutoDetectParser(parser.getMediaTypeRegistry(), getParser(parser, autoDetectParserConfig.getDigesterFactory()), detector, autoDetectParserConfig); + public static Parser build(CompositeParser parser, Detector detector, + AutoDetectParserConfig autoDetectParserConfig) { + return new AutoDetectParser(parser.getMediaTypeRegistry(), parser, detector, + autoDetectParserConfig); } public AutoDetectParser(TikaConfig config) { - super(config.getMediaTypeRegistry(), getParser(config.getParser(), config.getAutoDetectParserConfig().getDigesterFactory())); - setFallback(buildFallbackParser(config.getParser(), config.getAutoDetectParserConfig().getDigesterFactory())); + super(config.getMediaTypeRegistry(), config.getParser()); + setFallback(getFallbackFrom(config.getParser())); setDetector(config.getDetector()); setAutoDetectParserConfig(config.getAutoDetectParserConfig()); } - private static Parser buildFallbackParser(Parser defaultParser, DigestingParser.DigesterFactory digesterFactory) { - Parser fallback = null; - Parser p = defaultParser; - if (p instanceof DefaultParser) { - fallback = ((DefaultParser)p).getFallback(); - } else { - fallback = new EmptyParser(); + private static Parser getFallbackFrom(Parser defaultParser) { + if (defaultParser instanceof DefaultParser) { + return ((DefaultParser) defaultParser).getFallback(); } - - if (digesterFactory == null) { - return fallback; - } else { - return new DigestingParser(fallback, digesterFactory.build(), digesterFactory.isSkipContainerDocument()); - } - - } - - private static Parser getParser(Parser defaultParser, DigestingParser.DigesterFactory digesterFactory) { - if (digesterFactory == null) { - return defaultParser; - } - return new DigestingParser(defaultParser,digesterFactory.build(), digesterFactory.isSkipContainerDocument()); + return new EmptyParser(); } /** @@ -181,6 +168,12 @@ public class AutoDetectParser extends CompositeParser { //figure out if we should spool to disk maybeSpool(tis, autoDetectParserConfig, metadata); + // Compute digests before type detection if configured + DigestHelper.maybeDigest(tis, + autoDetectParserConfig.digester(), + autoDetectParserConfig.isSkipContainerDocument(), + metadata, context, tmp); + // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata); //update CONTENT_TYPE as long as it wasn't set by parser override diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index a8bcc17686..76ad80ac17 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -95,6 +95,9 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { private DigestingParser.DigesterFactory digesterFactory = null; + // Lazily built digester from the factory + private transient DigestingParser.Digester digester = null; + private boolean throwOnZeroBytes = true; /** @@ -195,6 +198,43 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { return this.digesterFactory; } + /** + * Returns the Digester, lazily building it from the factory if needed. + * <p> + * Note: This method is intentionally not named getDigester() to avoid + * Jackson treating it as a bean property during serialization. + * + * @return the Digester, or null if no factory is configured + */ + public DigestingParser.Digester digester() { + if (digester == null && digesterFactory != null) { + digester = digesterFactory.build(); + } + return digester; + } + + /** + * Sets the digester directly. This is useful for programmatic configuration + * (e.g., from command-line arguments) when you don't have a DigesterFactory. + * <p> + * Note: This method is intentionally not named setDigester() to avoid + * Jackson treating it as a bean property during deserialization. + * + * @param digester the digester to use + */ + public void digester(DigestingParser.Digester digester) { + this.digester = digester; + } + + /** + * Returns whether to skip digesting for container (top-level) documents. + * + * @return true if container documents should be skipped, false otherwise + */ + public boolean isSkipContainerDocument() { + return digesterFactory != null && digesterFactory.isSkipContainerDocument(); + } + public void setThrowOnZeroBytes(boolean throwOnZeroBytes) { this.throwOnZeroBytes = throwOnZeroBytes; } diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java index 054d4234ff..dd1329223d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java @@ -34,6 +34,14 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +/** + * A parser decorator that computes digests of the parsed content. + * + * @deprecated Since 3.x. Use {@link AutoDetectParserConfig#setDigesterFactory(DigesterFactory)} + * to configure digesting. The AutoDetectParser now calls digesting directly in its parse method. + * The interfaces {@link Digester}, {@link DigesterFactory}, and {@link Encoder} are still in use. + */ +@Deprecated public class DigestingParser extends ParserDecorator { private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index 044f9c6724..c6677001c8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -563,6 +563,7 @@ public class AutoDetectParserTest extends TikaTest { } } + @SuppressWarnings("deprecation") @Test public void testDigestingOpenContainers() throws Exception { //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java index 9971b7e039..1a31426e61 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java @@ -44,6 +44,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.digestutils.BouncyCastleDigester; +@SuppressWarnings("deprecation") public class BouncyCastleDigestingParserTest extends TikaTest { private final static String P = TikaCoreProperties.TIKA_META_PREFIX + "digest" + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java index e6267005b7..859703cffe 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java @@ -45,6 +45,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.digestutils.CommonsDigester; +@SuppressWarnings("deprecation") public class DigestingParserTest extends TikaTest { private final static String P = TikaCoreProperties.TIKA_META_PREFIX + "digest" + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 816dfff8af..1c5fa4cf60 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -432,6 +432,7 @@ public class RecursiveParserWrapperTest extends TikaTest { } } + @SuppressWarnings("deprecation") private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index 974afad089..48b8dacac8 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -459,10 +459,12 @@ public class PipesServer implements AutoCloseable { // Always initialize emitters to support runtime overrides via ParseContext this.emitterManager = EmitterManager.load(tikaPluginManager, tikaJsonConfig); this.autoDetectParser = (AutoDetectParser) tikaLoader.loadAutoDetectParser(); - if (autoDetectParser.getAutoDetectParserConfig() - .getDigesterFactory() != null) { - this.digester = autoDetectParser.getAutoDetectParserConfig() - .getDigesterFactory().build(); + // Get the digester for pre-parse digesting of container documents. + // The AutoDetectParser now handles digesting internally via DigestHelper, + // but PipesServer does its own pre-parse digesting for container documents. + // Setting skipContainerDocument(true) ensures AutoDetectParser only digests embedded docs. + this.digester = autoDetectParser.getAutoDetectParserConfig().digester(); + if (digester != null) { //override this value because we'll be digesting before parse autoDetectParser.getAutoDetectParserConfig().getDigesterFactory() .setSkipContainerDocument(true); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java index 74918ee9b6..3739ae97ce 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java @@ -65,7 +65,6 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -144,10 +143,6 @@ public class UnpackerResource { long unpackMaxBytes = DEFAULT_MAX_ATTACHMENT_BYTES; Parser parser = TikaResource.createParser(); - if (parser instanceof DigestingParser) { - //no need to digest for unwrapping - parser = ((DigestingParser) parser).getWrappedParser(); - } TikaResource.logRequest(LOG, "/unpack/config", metadata); //even though we aren't currently parsing embedded documents, @@ -205,10 +200,6 @@ public class UnpackerResource { } } Parser parser = TikaResource.createParser(); - if (parser instanceof DigestingParser) { - //no need to digest for unwrapping - parser = ((DigestingParser) parser).getWrappedParser(); - } fillMetadata(parser, metadata, httpHeaders.getRequestHeaders()); TikaResource.logRequest(LOG, "/unpack", metadata);
