This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 08715445b4 TIKA-4607 - rm DigestingParser from 4.x (#2506)
08715445b4 is described below

commit 08715445b460c3d8cfdb0b3d1a5c667ad74bcf42
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 30 19:23:21 2025 -0500

    TIKA-4607 - rm DigestingParser from 4.x (#2506)
    
    * TIKA-4607 - rm DigestingParser from 4.x
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/parser/DigestingParser.java    | 109 ---------------------
 .../apache/tika/parser/AutoDetectParserTest.java   |  15 ---
 .../tika/parser/RecursiveParserWrapperTest.java    |  24 ++---
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  15 +++
 .../resources/configs/tika-config-md5-digest.json  |  12 +++
 6 files changed, 41 insertions(+), 136 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index bea9b52359..b9348576fa 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -27,6 +27,8 @@ Release 4.0.0-BETA1 - ???
 
    * API changes in the EmbeddedStreamTranslator (TIKA-4518).
 
+   * Removed DigestingParser (TIKA-4607).
+
   OTHER CHANGES
 
    * Fix concurrency bug in TikaToXMP (TIKA-4393)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
deleted file mode 100644
index 30369bf752..0000000000
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
-import org.apache.tika.extractor.EmbeddedStreamTranslator;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-
-/**
- * A parser decorator that computes digests of the parsed content.
- *
- * @deprecated Since 4.x. Use {@link AutoDetectParserConfig#setDigesterFactory(org.apache.tika.digest.DigesterFactory)}
- * to configure digesting. The AutoDetectParser now calls digesting directly in its parse method.
- * The interfaces {@link org.apache.tika.digest.Digester},
- * {@link org.apache.tika.digest.DigesterFactory}, and
- * {@link org.apache.tika.digest.Encoder} have moved to the
- * {@code org.apache.tika.digest} package.
- */
-@Deprecated
-public class DigestingParser extends ParserDecorator {
-
-    private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator();
-    private final org.apache.tika.digest.Digester digester;
-    private final boolean skipContainerDocument;
-    /**
-     * Creates a decorator for the given parser.
-     *
-     * @param parser the parser instance to be decorated
-     * @param digester the digester to use
-     * @param skipContainerDocument if true, skip digesting top-level documents
-     */
-    public DigestingParser(Parser parser, org.apache.tika.digest.Digester digester,
-                           boolean skipContainerDocument) {
-        super(parser);
-        this.digester = digester;
-        this.skipContainerDocument = skipContainerDocument;
-    }
-
-    @Override
-    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
-                      ParseContext context) throws IOException, SAXException, TikaException {
-
-
-        if (! shouldDigest(metadata)) {
-            super.parse(tis, handler, metadata, context);
-            return;
-        }
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-
-            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
-                Path tmpBytes = tmp.createTempFile();
-                try (OutputStream os = Files.newOutputStream(tmpBytes)) {
-                    embeddedStreamTranslator.translate(tis, metadata, os);
-                }
-                try (TikaInputStream translated = TikaInputStream.get(tmpBytes)) {
-                    digester.digest(translated, metadata, context);
-                }
-            } else {
-                digester.digest(tis, metadata, context);
-            }
-            super.parse(tis, handler, metadata, context);
-        } finally {
-            tmp.dispose();
-        }
-    }
-
-    private boolean shouldDigest(Metadata metadata) {
-        if (digester == null) {
-            return false;
-        }
-        if (! skipContainerDocument) {
-            return true;
-        }
-        Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
-        if (parseDepth == null || parseDepth == 0) {
-            return false;
-        }
-        return true;
-    }
-
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 17a57934df..61348e510e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -27,7 +27,6 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;
@@ -39,8 +38,6 @@ import org.apache.tika.TikaLoaderHelper;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.detect.Detector;
-import org.apache.tika.digest.DigestDef;
-import org.apache.tika.digest.Digester;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.exception.ZeroByteFileException;
@@ -50,7 +47,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPDM;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.digestutils.CommonsDigester;
 import org.apache.tika.parser.external.CompositeExternalParser;
 import org.apache.tika.parser.ogg.FlacParser;
 import org.apache.tika.parser.ogg.OpusParser;
@@ -563,7 +559,6 @@ public class AutoDetectParserTest extends TikaTest {
         }
     }
 
-    @SuppressWarnings("deprecation")
     @Test
     public void testDigestingOpenContainers() throws Exception {
         //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb
@@ -580,15 +575,5 @@ public class AutoDetectParserTest extends TikaTest {
         assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256"));
         assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
         assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
-
-        Digester digester = new CommonsDigester(10000, DigestDef.Algorithm.SHA256);
-
-        //now test that we get the same digest if we wrap the auto detect parser vs configuring it
-        autoDetectParser = new AutoDetectParser();
-        Parser digestingParser = new DigestingParser(autoDetectParser, digester, true);
-        metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext());
-        assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
-        assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
-
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index e009d65b9f..b13bd3962c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -32,14 +32,12 @@ import org.apache.commons.io.input.ClosedInputStream;
 import org.apache.commons.io.input.ProxyInputStream;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.TikaLoaderHelper;
 import org.apache.tika.TikaTest;
-import org.apache.tika.digest.DigestDef;
-import org.apache.tika.digest.Digester;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.digestutils.CommonsDigester;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.ContentHandlerFactory;
@@ -292,7 +290,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
         list = getMetadata(metadata,
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                false, null);
+                false, false);
 
         //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
         //and just doesn't bother to report that there was an exception.
@@ -350,7 +348,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
         List<Metadata> list = getMetadata(metadata,
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                true, new CommonsDigester(100000, DigestDef.Algorithm.MD5));
+                true, true);
 
         String md5Key = "X-TIKA:digest:MD5";
         assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
@@ -388,16 +386,18 @@ public class RecursiveParserWrapperTest extends TikaTest {
         }
 
     }
-
-    @SuppressWarnings("deprecation")
+    
     private List<Metadata> getMetadata(Metadata metadata,
                                        ContentHandlerFactory contentHandlerFactory,
                                        boolean catchEmbeddedExceptions,
-                                       Digester digester) throws Exception {
+                                       boolean digest) throws Exception {
         ParseContext context = new ParseContext();
-        Parser wrapped = AUTO_DETECT_PARSER;
-        if (digester != null) {
-            wrapped = new DigestingParser(wrapped, digester, false);
+        Parser wrapped;
+        if (digest) {
+            wrapped = TikaLoaderHelper.getLoader("tika-config-md5-digest.json")
+                    .loadAutoDetectParser();
+        } else {
+            wrapped = AUTO_DETECT_PARSER;
         }
         RecursiveParserWrapper wrapper =
                 new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
@@ -422,7 +422,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
     private List<Metadata> getMetadata(Metadata metadata,
                                        ContentHandlerFactory contentHandlerFactory)
             throws Exception {
-        return getMetadata(metadata, contentHandlerFactory, true, null);
+        return getMetadata(metadata, contentHandlerFactory, true, false);
     }
 
     private static class CloseCountingInputStream extends ProxyInputStream {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 36038a8ca0..860d3ef147 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 
 import java.util.List;
@@ -25,10 +26,12 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.EMFParser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
@@ -142,4 +145,16 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Example of a table",
                 metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
     }
+
+    @Test
+    public void testDigestTranslator() throws Exception {
+        Parser parser = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")).loadAutoDetectParser();
+        List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser);
+        assertEquals(4, metadataList.size());
+        debug(metadataList);
+        for (Metadata m : metadataList) {
+            assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32"));
+            assertNull(m.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
new file mode 100644
index 0000000000..caffd0c709
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
@@ -0,0 +1,12 @@
+{
+  "auto-detect-parser": {
+    "digesterFactory": {
+      "commons-digester-factory": {
+        "markLimit": 100000,
+        "digests": [
+          { "algorithm": "MD5" }
+        ]
+      }
+    }
+  }
+}

Reply via email to