This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4607-main in repository https://gitbox.apache.org/repos/asf/tika.git
commit b605018270851a3171f984150b7f57dcbfdb8f90 Author: tallison <[email protected]> AuthorDate: Tue Dec 30 16:00:50 2025 -0500 TIKA-4607 - rm DigestingParser from 4.x --- CHANGES.txt | 2 + .../org/apache/tika/parser/DigestingParser.java | 109 --------------------- .../apache/tika/parser/AutoDetectParserTest.java | 15 --- .../tika/parser/RecursiveParserWrapperTest.java | 21 ++-- .../parser/microsoft/ooxml/OOXMLParserTest.java | 15 +++ .../resources/configs/tika-config-md5-digest.json | 12 +++ 6 files changed, 39 insertions(+), 135 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index bea9b5235..b9348576f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -27,6 +27,8 @@ Release 4.0.0-BETA1 - ??? * API changes in the EmbeddedStreamTranslator (TIKA-4518). + * Removed DigestingParser (TIKA-4607). + OTHER CHANGES * Fix concurrency bug in TikaToXMP (TIKA-4393) diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java deleted file mode 100644 index 30369bf75..000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser; - - -import java.io.IOException; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; -import org.apache.tika.extractor.EmbeddedStreamTranslator; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; - -/** - * A parser decorator that computes digests of the parsed content. - * - * @deprecated Since 4.x. Use {@link AutoDetectParserConfig#setDigesterFactory(org.apache.tika.digest.DigesterFactory)} - * to configure digesting. The AutoDetectParser now calls digesting directly in its parse method. - * The interfaces {@link org.apache.tika.digest.Digester}, - * {@link org.apache.tika.digest.DigesterFactory}, and - * {@link org.apache.tika.digest.Encoder} have moved to the - * {@code org.apache.tika.digest} package. - */ -@Deprecated -public class DigestingParser extends ParserDecorator { - - private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); - private final org.apache.tika.digest.Digester digester; - private final boolean skipContainerDocument; - /** - * Creates a decorator for the given parser. - * - * @param parser the parser instance to be decorated - * @param digester the digester to use - * @param skipContainerDocument if true, skip digesting top-level documents - */ - public DigestingParser(Parser parser, org.apache.tika.digest.Digester digester, - boolean skipContainerDocument) { - super(parser); - this.digester = digester; - this.skipContainerDocument = skipContainerDocument; - } - - @Override - public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - - - if (! shouldDigest(metadata)) { - super.parse(tis, handler, metadata, context); - return; - } - TemporaryResources tmp = new TemporaryResources(); - try { - - if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { - Path tmpBytes = tmp.createTempFile(); - try (OutputStream os = Files.newOutputStream(tmpBytes)) { - embeddedStreamTranslator.translate(tis, metadata, os); - } - try (TikaInputStream translated = TikaInputStream.get(tmpBytes)) { - digester.digest(translated, metadata, context); - } - } else { - digester.digest(tis, metadata, context); - } - super.parse(tis, handler, metadata, context); - } finally { - tmp.dispose(); - } - } - - private boolean shouldDigest(Metadata metadata) { - if (digester == null) { - return false; - } - if (! skipContainerDocument) { - return true; - } - Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH); - if (parseDepth == null || parseDepth == 0) { - return false; - } - return true; - } - -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index 17a57934d..61348e510 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -27,7 +27,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -39,8 +38,6 @@ import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.Detector; -import org.apache.tika.digest.DigestDef; -import org.apache.tika.digest.Digester; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.exception.ZeroByteFileException; @@ -50,7 +47,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.parser.external.CompositeExternalParser; import org.apache.tika.parser.ogg.FlacParser; import org.apache.tika.parser.ogg.OpusParser; @@ -563,7 +559,6 @@ public class AutoDetectParserTest extends TikaTest { } } - @SuppressWarnings("deprecation") @Test public void testDigestingOpenContainers() throws Exception { //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb @@ -580,15 +575,5 @@ public class AutoDetectParserTest extends TikaTest { assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256")); assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); - - Digester digester = new CommonsDigester(10000, DigestDef.Algorithm.SHA256); - - //now test that we get the same digest if we wrap the auto detect parser vs configuring it - autoDetectParser = new AutoDetectParser(); - Parser digestingParser = new DigestingParser(autoDetectParser, digester, true); - metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext()); - assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US)); - assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 3cabccdfa..cf93064f5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -36,14 +36,11 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; -import org.apache.tika.digest.DigestDef; -import org.apache.tika.digest.Digester; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; @@ -296,7 +293,7 @@ public class RecursiveParserWrapperTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx"); list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - false, null); + false, false); //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions //and just doesn't bother to report that there was an exception. @@ -354,7 +351,7 @@ public class RecursiveParserWrapperTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - true, new CommonsDigester(100000, DigestDef.Algorithm.MD5)); + true, true); String md5Key = "X-TIKA:digest:MD5"; assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key)); @@ -435,15 +432,17 @@ public class RecursiveParserWrapperTest extends TikaTest { } } - @SuppressWarnings("deprecation") private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, - Digester digester) throws Exception { + boolean digest) throws Exception { ParseContext context = new ParseContext(); - Parser wrapped = AUTO_DETECT_PARSER; - if (digester != null) { - wrapped = new DigestingParser(wrapped, digester, false); + Parser wrapped; + if (digest) { + wrapped = TikaLoaderHelper.getLoader("tika-config-md5-digest.json") + .loadAutoDetectParser(); + } else { + wrapped = AUTO_DETECT_PARSER; } RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions); @@ -468,7 +467,7 @@ public class RecursiveParserWrapperTest extends TikaTest { private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory) throws Exception { - return getMetadata(metadata, contentHandlerFactory, true, null); + return getMetadata(metadata, contentHandlerFactory, true, false); } private static class CloseCountingInputStream extends ProxyInputStream { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 36038a8ca..860d3ef14 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -17,6 +17,7 @@ package org.apache.tika.parser.microsoft.ooxml; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import java.util.List; @@ -25,10 +26,12 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; @@ -142,4 +145,16 @@ public class OOXMLParserTest extends TikaTest { assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); } + + @Test + public void testDigestTranslator() throws Exception { + Parser parser = TikaLoader.load(getConfigPath(OOXMLParserTest.class, "tika-config-digests.json")).loadAutoDetectParser(); + List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", parser); + assertEquals(4, metadataList.size()); + debug(metadataList); + for (Metadata m : metadataList) { + assertNotNull(m.get("X-TIKA:digest:SHA256:BASE32")); + assertNull(m.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json new file mode 100644 index 000000000..caffd0c70 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json @@ -0,0 +1,12 @@ +{ + "auto-detect-parser": { + "digesterFactory": { + "commons-digester-factory": { + "markLimit": 100000, + "digests": [ + { "algorithm": "MD5" } + ] + } + } + } +}
