This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 08715445b4 TIKA-4607 - rm DigestingParser from 4.x (#2506)
08715445b4 is described below
commit 08715445b460c3d8cfdb0b3d1a5c667ad74bcf42
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 30 19:23:21 2025 -0500
TIKA-4607 - rm DigestingParser from 4.x (#2506)
* TIKA-4607 - rm DigestingParser from 4.x
---
CHANGES.txt | 2 +
.../org/apache/tika/parser/DigestingParser.java | 109 ---------------------
.../apache/tika/parser/AutoDetectParserTest.java | 15 ---
.../tika/parser/RecursiveParserWrapperTest.java | 24 ++---
.../parser/microsoft/ooxml/OOXMLParserTest.java | 15 +++
.../resources/configs/tika-config-md5-digest.json | 12 +++
6 files changed, 41 insertions(+), 136 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index bea9b52359..b9348576fa 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -27,6 +27,8 @@ Release 4.0.0-BETA1 - ???
* API changes in the EmbeddedStreamTranslator (TIKA-4518).
+ * Removed DigestingParser (TIKA-4607).
+
OTHER CHANGES
* Fix concurrency bug in TikaToXMP (TIKA-4393)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
deleted file mode 100644
index 30369bf752..0000000000
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
-import org.apache.tika.extractor.EmbeddedStreamTranslator;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-
-/**
- * A parser decorator that computes digests of the parsed content.
- *
- * @deprecated Since 4.x. Use {@link AutoDetectParserConfig#setDigesterFactory(org.apache.tika.digest.DigesterFactory)}
- * to configure digesting. The AutoDetectParser now calls digesting directly in its parse method.
- * The interfaces {@link org.apache.tika.digest.Digester},
- * {@link org.apache.tika.digest.DigesterFactory}, and
- * {@link org.apache.tika.digest.Encoder} have moved to the
- * {@code org.apache.tika.digest} package.
- */
-@Deprecated
-public class DigestingParser extends ParserDecorator {
-
- private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator();
- private final org.apache.tika.digest.Digester digester;
- private final boolean skipContainerDocument;
- /**
- * Creates a decorator for the given parser.
- *
- * @param parser the parser instance to be decorated
- * @param digester the digester to use
- * @param skipContainerDocument if true, skip digesting top-level documents
- */
- public DigestingParser(Parser parser, org.apache.tika.digest.Digester digester,
- boolean skipContainerDocument) {
- super(parser);
- this.digester = digester;
- this.skipContainerDocument = skipContainerDocument;
- }
-
- @Override
- public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
-
-
- if (! shouldDigest(metadata)) {
- super.parse(tis, handler, metadata, context);
- return;
- }
- TemporaryResources tmp = new TemporaryResources();
- try {
-
- if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
- Path tmpBytes = tmp.createTempFile();
- try (OutputStream os = Files.newOutputStream(tmpBytes)) {
- embeddedStreamTranslator.translate(tis, metadata, os);
- }
- try (TikaInputStream translated = TikaInputStream.get(tmpBytes)) {
- digester.digest(translated, metadata, context);
- }
- } else {
- digester.digest(tis, metadata, context);
- }
- super.parse(tis, handler, metadata, context);
- } finally {
- tmp.dispose();
- }
- }
-
- private boolean shouldDigest(Metadata metadata) {
- if (digester == null) {
- return false;
- }
- if (! skipContainerDocument) {
- return true;
- }
- Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
- if (parseDepth == null || parseDepth == 0) {
- return false;
- }
- return true;
- }
-
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 17a57934df..61348e510e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -27,7 +27,6 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
-import java.util.Locale;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@@ -39,8 +38,6 @@ import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.Detector;
-import org.apache.tika.digest.DigestDef;
-import org.apache.tika.digest.Digester;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
@@ -50,7 +47,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.parser.external.CompositeExternalParser;
import org.apache.tika.parser.ogg.FlacParser;
import org.apache.tika.parser.ogg.OpusParser;
@@ -563,7 +559,6 @@ public class AutoDetectParserTest extends TikaTest {
}
}
- @SuppressWarnings("deprecation")
@Test
public void testDigestingOpenContainers() throws Exception {
//TIKA-4533 -- this tests both that a very large embedded OLE doc
doesn't cause a zip bomb
@@ -580,15 +575,5 @@ public class AutoDetectParserTest extends TikaTest {
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256"));
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
-
- Digester digester = new CommonsDigester(10000, DigestDef.Algorithm.SHA256);
-
- //now test that we get the same digest if we wrap the auto detect parser vs configuring it
- autoDetectParser = new AutoDetectParser();
- Parser digestingParser = new DigestingParser(autoDetectParser, digester, true);
- metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext());
- assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
- assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
-
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index e009d65b9f..b13bd3962c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -32,14 +32,12 @@ import org.apache.commons.io.input.ClosedInputStream;
import org.apache.commons.io.input.ProxyInputStream;
import org.junit.jupiter.api.Test;
+import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
-import org.apache.tika.digest.DigestDef;
-import org.apache.tika.digest.Digester;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -292,7 +290,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
"test_recursive_embedded_npe.docx");
list = getMetadata(metadata,
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- false, null);
+ false, false);
//Composite parser swallows caught TikaExceptions, IOExceptions and
SAXExceptions
//and just doesn't bother to report that there was an exception.
@@ -350,7 +348,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
"test_recursive_embedded.docx");
List<Metadata> list = getMetadata(metadata,
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- true, new CommonsDigester(100000, DigestDef.Algorithm.MD5));
+ true, true);
String md5Key = "X-TIKA:digest:MD5";
assertEquals("59f626e09a8c16ab6dbc2800c685f772",
list.get(0).get(md5Key));
@@ -388,16 +386,18 @@ public class RecursiveParserWrapperTest extends TikaTest {
}
}
-
- @SuppressWarnings("deprecation")
+
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory
contentHandlerFactory,
boolean catchEmbeddedExceptions,
- Digester digester) throws Exception {
+ boolean digest) throws Exception {
ParseContext context = new ParseContext();
- Parser wrapped = AUTO_DETECT_PARSER;
- if (digester != null) {
- wrapped = new DigestingParser(wrapped, digester, false);
+ Parser wrapped;
+ if (digest) {
+ wrapped = TikaLoaderHelper.getLoader("tika-config-md5-digest.json")
+ .loadAutoDetectParser();
+ } else {
+ wrapped = AUTO_DETECT_PARSER;
}
RecursiveParserWrapper wrapper =
new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
@@ -422,7 +422,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory
contentHandlerFactory)
throws Exception {
- return getMetadata(metadata, contentHandlerFactory, true, null);
+ return getMetadata(metadata, contentHandlerFactory, true, false);
}
private static class CloseCountingInputStream extends ProxyInputStream {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 36038a8ca0..860d3ef147 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft.ooxml;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.List;
@@ -25,10 +26,12 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.EMFParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
@@ -142,4 +145,16 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Example of a table",
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ @Test
+ public void testDigestTranslator() throws Exception {
+ Parser parser = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")).loadAutoDetectParser();
+ List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser);
+ assertEquals(4, metadataList.size());
+ debug(metadataList);
+ for (Metadata m : metadataList) {
+ assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32"));
+ assertNull(m.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ }
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
new file mode 100644
index 0000000000..caffd0c709
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
@@ -0,0 +1,12 @@
+{
+ "auto-detect-parser": {
+ "digesterFactory": {
+ "commons-digester-factory": {
+ "markLimit": 100000,
+ "digests": [
+ { "algorithm": "MD5" }
+ ]
+ }
+ }
+ }
+}