This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0563063d336414c1decefce7b829e58ee64d3360 Author: Tim Allison <[email protected]> AuthorDate: Wed Oct 29 14:22:10 2025 -0400 TIKA-4533 -- fix handling of TikaInputStreams with open containers (#2378) (cherry picked from commit 5dbfb15d04aef1c7e5a970e02add8ab60a08a406) --- .../java/org/apache/tika/io/TikaInputStream.java | 34 +++++++++++++++ .../org/apache/tika/parser/DigestingParser.java | 23 +++++++++- .../org/apache/tika/sax/SecureContentHandler.java | 10 ++--- .../parser/microsoft/AbstractPOIFSExtractor.java | 32 +++++++++----- .../parser/microsoft/pst/OutlookPSTParser.java | 31 ++++++++++++-- .../parser/microsoft/pst/PSTMailItemParser.java | 4 +- .../apache/tika/parser/AutoDetectParserTest.java | 32 ++++++++++++++ .../src/test/resources/configs/tika-4533.xml | 47 +++++++++++++++++++++ .../resources/test-documents/testLargeOLEDoc.doc | Bin 0 -> 2077696 bytes 9 files changed, 192 insertions(+), 21 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 1afa2d14e..8c2a85952 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -240,6 +240,26 @@ public class TikaInputStream extends TaggedInputStream { } } + /** + * Use this if there is no actual underlying InputStream. It is important + * to set a length so that the zip bomb detector won't be triggered + * in the SecureContentHandler. 
+ * <p> + * If your stream has underlying bytes and a length, see {@link #setOpenContainer(Object)} + * + * @param openContainer + * @param length + * @param metadata + * @return + */ + public static TikaInputStream getFromContainer(Object openContainer, long length, Metadata metadata) { + TikaInputStream tis = TikaInputStream.get(new byte[0], metadata); + tis.setOpenContainer(openContainer); + //this overwrites the length that was set in the constructor above + tis.setLength(length); + return tis; + } + /** * Casts or wraps the given stream to a TikaInputStream instance. * This method can be used to access the functionality of this class @@ -637,6 +657,10 @@ public class TikaInputStream extends TaggedInputStream { * the stream, eg after a Zip contents * detector has loaded the file to decide * what it contains. + * <p> + * If there's no underlying stream, consider {@link #getFromContainer(Object, long, Metadata)} + * because that will avoid potential improper zip bomb exceptions from the SecureContentHandler if + * it thinks the length of the stream == 0. */ public void setOpenContainer(Object container) { openContainer = container; @@ -787,6 +811,16 @@ public class TikaInputStream extends TaggedInputStream { return position; } + /** + * This should only be called by the constructor for an open container with a 0 length + * byte inputStream + * + * @param length + */ + private void setLength(long length) { + this.length = length; + } + /** * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure * that the alleged bytes skipped were actually skipped. 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java index 8c0358da7..d0bcaa1f9 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java @@ -20,11 +20,16 @@ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; +import org.apache.tika.extractor.EmbeddedStreamTranslator; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -32,6 +37,7 @@ import org.apache.tika.metadata.TikaCoreProperties; public class DigestingParser extends ParserDecorator { + private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); private final Digester digester; private final boolean skipContainerDocument; /** @@ -48,10 +54,25 @@ public class DigestingParser extends ParserDecorator { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + + + if (! 
shouldDigest(metadata)) { + super.parse(stream, handler, metadata, context); + return; + } TemporaryResources tmp = new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); try { - if (shouldDigest(metadata)) { + + if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { + Path tmpBytes = tmp.createTempFile(); + try (OutputStream os = Files.newOutputStream(tmpBytes)) { + embeddedStreamTranslator.translate(tis, metadata, os); + } + try (TikaInputStream translated = TikaInputStream.get(tmpBytes)) { + digester.digest(translated, metadata, context); + } + } else { digester.digest(tis, metadata, context); } super.parse(tis, handler, metadata, context); diff --git a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java index 3f9f3c42b..fa9d682e8 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java @@ -208,11 +208,11 @@ public class SecureContentHandler extends ContentHandlerDecorator { */ protected void advance(int length) throws SAXException { characterCount += length; - long byteCount = getByteCount(); - if (characterCount > threshold && characterCount > byteCount * ratio) { - throw new SecureSAXException( - "Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount + - " output characters"); + if (characterCount > threshold) { + long byteCount = getByteCount(); + if (characterCount > byteCount * ratio) { + throw new SecureSAXException("Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount + " output characters"); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index db5898ec8..f49c0a971 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -20,6 +20,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.Iterator; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.hpsf.ClassID; @@ -194,7 +195,6 @@ abstract class AbstractPOIFSExtractor { } // It's regular OLE2: - // What kind of document is it? metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, dir.getName()); if (dir.getStorageClsid() != null) { @@ -237,6 +237,18 @@ abstract class AbstractPOIFSExtractor { } } + private long estimateSize(DirectoryEntry dir) { + Iterator<Entry> entries = dir.getEntries(); + long sz = 0; + while (entries.hasNext()) { + Entry entry = entries.next(); + if (entry.isDocumentEntry()) { + sz += ((DocumentEntry)entry).getSize(); + } + } + return sz; + } + private void extractOCXName(DirectoryEntry dir, Metadata metadata) { if (! 
dir.hasEntry(OCX_NAME)) { return; @@ -266,14 +278,14 @@ abstract class AbstractPOIFSExtractor { } } - private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName, + private void handleCompObj(DirectoryEntry parentDir, POIFSDocumentType type, String rName, Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException { //TODO: figure out if the equivalent of OLE 1.0's //getCommand() and getFileName() exist for OLE 2.0 to populate //TikaCoreProperties.ORIGINAL_RESOURCE_NAME - String contentsEntryName = getContentsEntryName(dir); + String contentsEntryName = getContentsEntryName(parentDir); if (contentsEntryName == null) { //log or record exception? return; @@ -282,7 +294,7 @@ abstract class AbstractPOIFSExtractor { DocumentEntry contentsEntry; try { - contentsEntry = (DocumentEntry) dir.getEntry(contentsEntryName); + contentsEntry = (DocumentEntry) parentDir.getEntry(contentsEntryName); } catch (FileNotFoundException fnfe) { EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata); return; @@ -314,7 +326,7 @@ abstract class AbstractPOIFSExtractor { metadata.set(Metadata.CONTENT_TYPE, mediaType.getType()); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension); metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length)); - parseEmbedded(dir, tis, xhtml, metadata, outputHtml); + parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml); } finally { inp.close(); } @@ -374,15 +386,15 @@ abstract class AbstractPOIFSExtractor { } } - private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml, + private void parseEmbedded(DirectoryEntry parentDir, TikaInputStream tis, XHTMLContentHandler xhtml, Metadata metadata, boolean outputHtml) throws IOException, SAXException { if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) { return; } - if (dir.getStorageClsid() != null) { + if (parentDir.getStorageClsid() != null) { 
metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID, - dir.getStorageClsid().toString()); + parentDir.getStorageClsid().toString()); } embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml); } @@ -393,8 +405,8 @@ abstract class AbstractPOIFSExtractor { if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) { return; } - try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { - tis.setOpenContainer(dir); + long sz = estimateSize(dir); + try (TikaInputStream tis = TikaInputStream.getFromContainer(dir, sz, metadata)) { if (dir.getStorageClsid() != null) { metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index 8cfb938c9..08f49daf3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; +import com.pff.PSTException; import com.pff.PSTFile; import com.pff.PSTFolder; import com.pff.PSTMessage; @@ -114,9 +115,9 @@ public class OutlookPSTParser implements Parser { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); metadata.set(PST.PST_FOLDER_PATH, folderPath); - try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { - tis.setOpenContainer(pstMail); - 
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getSubject() + ".msg"); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getSubject() + ".msg"); + long length = estimateSize(pstMail); + try (TikaInputStream tis = TikaInputStream.getFromContainer(pstMail, length, metadata)) { embeddedExtractor.parseEmbedded(tis, handler, metadata, true); } pstMail = (PSTMessage) pstFolder.getNextChild(); @@ -134,4 +135,28 @@ public class OutlookPSTParser implements Parser { } } } + + static protected long estimateSize(PSTMessage attachedEmail) { + //we do this for a rough estimate of email body size + //so that we don't get a zip bomb exception on exceedingly large msgs. + long sz = 0; + sz += getStringLength(attachedEmail.getBody()); + try { + sz += getStringLength(attachedEmail.getRTFBody()); + } catch (PSTException | IOException e) { + //swallow + } + sz += getStringLength(attachedEmail.getBodyHTML()); + sz += getStringLength(attachedEmail.getSubject()); + //complete heuristic to account for from, to, etc... 
+ sz += 100_000; + return sz; + } + + private static long getStringLength(String s) { + if (s == null) { + return 0; + } + return s.length(); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index b54d25683..5532525ea 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -222,8 +222,8 @@ public class PSTMailItemParser implements Parser { PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage(); //check for whether this is a binary attachment or an embedded pst msg if (attachedEmail != null) { - try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { - tis.setOpenContainer(attachedEmail); + long sz = OutlookPSTParser.estimateSize(attachedEmail); + try (TikaInputStream tis = TikaInputStream.getFromContainer(attachedEmail, sz, metadata)) { Metadata attachMetadata = new Metadata(); attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getSubject() + ".msg"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index ed01ae9b0..c3621576f 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -19,6 +19,7 @@ package org.apache.tika.parser; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -27,6 +28,8 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashSet; +import java.util.List; +import java.util.Locale; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -48,6 +51,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.parser.external.CompositeExternalParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ToXMLContentHandler; @@ -556,4 +560,32 @@ public class AutoDetectParserTest extends TikaTest { " expectedContentFragment = " + expectedContentFragment + "\n"; } } + + @Test + public void testLargeEmbeddedOle2Object() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc"); + assertEquals(3, metadataList.size()); + assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + } + + @Test + public void testDigestingOpenContainers() throws Exception { + String expectedSha = "bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78"; + TikaConfig tikaConfig = null; + try (InputStream is = 
AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) { + tikaConfig = new TikaConfig(is); + } + Parser parser = new AutoDetectParser(tikaConfig); + List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext()); + assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256")); + + //now test that we get the same digest if we wrap the auto detect parser vs configuring it + Parser autoDetectParser = new AutoDetectParser(); + Parser digestingParser = new DigestingParser(autoDetectParser, new CommonsDigester(10000, "SHA256"), true); + + metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext()); + assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US)); + + + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml new file mode 100644 index 000000000..83661eca5 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <autoDetectParserConfig> + <params> + <!-- if the incoming metadata object has a ContentLength entry and it is larger than this + value, spool the file to disk; this is useful for some file formats that are more efficiently + processed via a file instead of an InputStream --> + <spoolToDisk>0</spoolToDisk> + <!-- the next four are parameters for the SecureContentHandler --> + <!-- threshold used in zip bomb detection. This many characters must be written + before the maximum compression ratio is calculated --> + <outputThreshold>10000</outputThreshold> + <!-- maximum compression ratio between output characters and input bytes --> + <maximumCompressionRatio>100</maximumCompressionRatio> + <!-- maximum XML element nesting level --> + <maximumDepth>100</maximumDepth> + <!-- maximum embedded file depth --> + <maximumPackageEntryDepth>100</maximumPackageEntryDepth> + <!-- throw an exception if a file has zero bytes --> + <throwOnZeroBytes>false</throwOnZeroBytes> + </params> + <!-- as of Tika 2.5.x, this is the preferred way to configure digests --> + <digesterFactory class="org.apache.tika.parser.digestutils.CommonsDigesterFactory"> + <params> + <markLimit>100000</markLimit> + <!-- this specifies SHA256 only --> + <algorithmString>sha256</algorithmString> + </params> + </digesterFactory> + </autoDetectParserConfig> +</properties> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc new file mode 100644 index 000000000..473eada53 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc differ
