This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5dbfb15d0 TIKA-4533 -- fix handling of TikaInputStreams with open
containers (#2378)
5dbfb15d0 is described below
commit 5dbfb15d04aef1c7e5a970e02add8ab60a08a406
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 29 14:22:10 2025 -0400
TIKA-4533 -- fix handling of TikaInputStreams with open containers (#2378)
---
.../java/org/apache/tika/io/TikaInputStream.java | 34 +++++++++++++++
.../org/apache/tika/parser/DigestingParser.java | 23 +++++++++-
.../org/apache/tika/sax/SecureContentHandler.java | 10 ++---
.../parser/microsoft/AbstractPOIFSExtractor.java | 32 +++++++++-----
.../parser/microsoft/pst/OutlookPSTParser.java | 31 ++++++++++++--
.../parser/microsoft/pst/PSTMailItemParser.java | 4 +-
.../apache/tika/parser/AutoDetectParserTest.java | 32 ++++++++++++++
.../src/test/resources/configs/tika-4533.xml | 47 +++++++++++++++++++++
.../resources/test-documents/testLargeOLEDoc.doc | Bin 0 -> 2077696 bytes
9 files changed, 192 insertions(+), 21 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 0bebd1886..84ccd6f17 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -245,6 +245,26 @@ public class TikaInputStream extends TaggedInputStream {
}
}
+ /**
+ * Use this if there is no actual underlying InputStream. It is important
+ * to set a length so that the zip bomb detector won't be triggered
+ * in the SecureContentHandler.
+ * <p>
+ * If your stream has underlying bytes and a length, see {@link
#setOpenContainer(Object)}
+ *
+ * @param openContainer
+ * @param length
+ * @param metadata
+ * @return
+ */
+ public static TikaInputStream getFromContainer(Object openContainer, long
length, Metadata metadata) {
+ TikaInputStream tis = TikaInputStream.get(new byte[0], metadata);
+ tis.setOpenContainer(openContainer);
+ //this overwrites the length that was set in the constructor above
+ tis.setLength(length);
+ return tis;
+ }
+
/**
* Casts or wraps the given stream to a TikaInputStream instance.
* This method can be used to access the functionality of this class
@@ -668,6 +688,10 @@ public class TikaInputStream extends TaggedInputStream {
* the stream, eg after a Zip contents
* detector has loaded the file to decide
* what it contains.
+ * <p>
+ * If there's no underlying stream, consider {@link
#getFromContainer(Object, long, Metadata)}
+ * because that will avoid potential improper zip bomb exceptions from the
SecureContentHandler if
+ * it thinks the length of the stream == 0.
*/
public void setOpenContainer(Object container) {
openContainer = container;
@@ -818,6 +842,16 @@ public class TikaInputStream extends TaggedInputStream {
return position;
}
+ /**
+ * This should only be called by the static factory method for an open container
with a 0 length
+ * byte inputStream
+ *
+ * @param length
+ */
+ private void setLength(long length) {
+ this.length = length;
+ }
+
/**
* This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure
* that the alleged bytes skipped were actually skipped.
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 8c0358da7..d0bcaa1f9 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -20,11 +20,16 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -32,6 +37,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
public class DigestingParser extends ParserDecorator {
+ private final EmbeddedStreamTranslator embeddedStreamTranslator = new
DefaultEmbeddedStreamTranslator();
private final Digester digester;
private final boolean skipContainerDocument;
/**
@@ -48,10 +54,25 @@ public class DigestingParser extends ParserDecorator {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
ParseContext context) throws IOException, SAXException,
TikaException {
+
+
+ if (! shouldDigest(metadata)) {
+ super.parse(stream, handler, metadata, context);
+ return;
+ }
TemporaryResources tmp = new TemporaryResources();
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try {
- if (shouldDigest(metadata)) {
+
+ if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+ Path tmpBytes = tmp.createTempFile();
+ try (OutputStream os = Files.newOutputStream(tmpBytes)) {
+ embeddedStreamTranslator.translate(tis, metadata, os);
+ }
+ try (TikaInputStream translated =
TikaInputStream.get(tmpBytes)) {
+ digester.digest(translated, metadata, context);
+ }
+ } else {
digester.digest(tis, metadata, context);
}
super.parse(tis, handler, metadata, context);
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
index 3f9f3c42b..fa9d682e8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
@@ -208,11 +208,11 @@ public class SecureContentHandler extends
ContentHandlerDecorator {
*/
protected void advance(int length) throws SAXException {
characterCount += length;
- long byteCount = getByteCount();
- if (characterCount > threshold && characterCount > byteCount * ratio) {
- throw new SecureSAXException(
- "Suspected zip bomb: " + byteCount + " input bytes
produced " + characterCount +
- " output characters");
+ if (characterCount > threshold) {
+ long byteCount = getByteCount();
+ if (characterCount > byteCount * ratio) {
+ throw new SecureSAXException("Suspected zip bomb: " +
byteCount + " input bytes produced " + characterCount + " output characters");
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 0de4d9c28..7899a488f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -20,6 +20,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.hpsf.ClassID;
@@ -194,7 +195,6 @@ abstract class AbstractPOIFSExtractor {
}
// It's regular OLE2:
-
// What kind of document is it?
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID,
dir.getName());
if (dir.getStorageClsid() != null) {
@@ -237,6 +237,18 @@ abstract class AbstractPOIFSExtractor {
}
}
+ private long estimateSize(DirectoryEntry dir) {
+ Iterator<Entry> entries = dir.getEntries();
+ long sz = 0;
+ while (entries.hasNext()) {
+ Entry entry = entries.next();
+ if (entry.isDocumentEntry()) {
+ sz += ((DocumentEntry)entry).getSize();
+ }
+ }
+ return sz;
+ }
+
private void extractOCXName(DirectoryEntry dir, Metadata metadata) {
if (! dir.hasEntry(OCX_NAME)) {
return;
@@ -266,14 +278,14 @@ abstract class AbstractPOIFSExtractor {
}
}
- private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type,
String rName,
+ private void handleCompObj(DirectoryEntry parentDir, POIFSDocumentType
type, String rName,
Metadata metadata, XHTMLContentHandler xhtml,
boolean outputHtml)
throws IOException, SAXException {
//TODO: figure out if the equivalent of OLE 1.0's
//getCommand() and getFileName() exist for OLE 2.0 to populate
//TikaCoreProperties.ORIGINAL_RESOURCE_NAME
- String contentsEntryName = getContentsEntryName(dir);
+ String contentsEntryName = getContentsEntryName(parentDir);
if (contentsEntryName == null) {
//log or record exception?
return;
@@ -282,7 +294,7 @@ abstract class AbstractPOIFSExtractor {
DocumentEntry contentsEntry;
try {
- contentsEntry = (DocumentEntry) dir.getEntry(contentsEntryName);
+ contentsEntry = (DocumentEntry)
parentDir.getEntry(contentsEntryName);
} catch (FileNotFoundException fnfe) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe,
parentMetadata);
return;
@@ -314,7 +326,7 @@ abstract class AbstractPOIFSExtractor {
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName +
extension);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
- parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
+ parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml);
} finally {
inp.close();
}
@@ -374,15 +386,15 @@ abstract class AbstractPOIFSExtractor {
}
}
- private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis,
XHTMLContentHandler xhtml,
+ private void parseEmbedded(DirectoryEntry parentDir, TikaInputStream tis,
XHTMLContentHandler xhtml,
Metadata metadata, boolean outputHtml) throws
IOException,
SAXException {
if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
return;
}
- if (dir.getStorageClsid() != null) {
+ if (parentDir.getStorageClsid() != null) {
metadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
- dir.getStorageClsid().toString());
+ parentDir.getStorageClsid().toString());
}
embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
}
@@ -393,8 +405,8 @@ abstract class AbstractPOIFSExtractor {
if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
return;
}
- try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- tis.setOpenContainer(dir);
+ long sz = estimateSize(dir);
+ try (TikaInputStream tis = TikaInputStream.getFromContainer(dir, sz,
metadata)) {
if (dir.getStorageClsid() != null) {
metadata.set(Office.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 8cfb938c9..08f49daf3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
+import com.pff.PSTException;
import com.pff.PSTFile;
import com.pff.PSTFolder;
import com.pff.PSTMessage;
@@ -114,9 +115,9 @@ public class OutlookPSTParser implements Parser {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
metadata.set(PST.PST_FOLDER_PATH, folderPath);
- try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- tis.setOpenContainer(pstMail);
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getSubject() + ".msg");
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getSubject() + ".msg");
+ long length = estimateSize(pstMail);
+ try (TikaInputStream tis =
TikaInputStream.getFromContainer(pstMail, length, metadata)) {
embeddedExtractor.parseEmbedded(tis, handler, metadata,
true);
}
pstMail = (PSTMessage) pstFolder.getNextChild();
@@ -134,4 +135,28 @@ public class OutlookPSTParser implements Parser {
}
}
}
+
+ static protected long estimateSize(PSTMessage attachedEmail) {
+ //we do this for a rough estimate of email body size
+ //so that we don't get a zip bomb exception on exceedingly large msgs.
+ long sz = 0;
+ sz += getStringLength(attachedEmail.getBody());
+ try {
+ sz += getStringLength(attachedEmail.getRTFBody());
+ } catch (PSTException | IOException e) {
+ //swallow
+ }
+ sz += getStringLength(attachedEmail.getBodyHTML());
+ sz += getStringLength(attachedEmail.getSubject());
+ //complete heuristic to account for from, to, etc...
+ sz += 100_000;
+ return sz;
+ }
+
+ private static long getStringLength(String s) {
+ if (s == null) {
+ return 0;
+ }
+ return s.length();
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 13c23a690..3b2cded70 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -224,8 +224,8 @@ public class PSTMailItemParser implements Parser {
PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage();
//check for whether this is a binary attachment or an embedded pst msg
if (attachedEmail != null) {
- try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- tis.setOpenContainer(attachedEmail);
+ long sz = OutlookPSTParser.estimateSize(attachedEmail);
+ try (TikaInputStream tis =
TikaInputStream.getFromContainer(attachedEmail, sz, metadata)) {
Metadata attachMetadata = new Metadata();
attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attachedEmail.getSubject() + ".msg");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index ed01ae9b0..c3621576f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -27,6 +28,8 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@@ -48,6 +51,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.parser.external.CompositeExternalParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
@@ -556,4 +560,32 @@ public class AutoDetectParserTest extends TikaTest {
" expectedContentFragment = " + expectedContentFragment +
"\n";
}
}
+
+ @Test
+ public void testLargeEmbeddedOle2Object() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc");
+ assertEquals(3, metadataList.size());
+
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ }
+
+ @Test
+ public void testDigestingOpenContainers() throws Exception {
+ String expectedSha =
"bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78";
+ TikaConfig tikaConfig = null;
+ try (InputStream is =
AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser parser = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext());
+ assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256"));
+
+ //now test that we get the same digest if we wrap the auto detect
parser vs configuring it
+ Parser autoDetectParser = new AutoDetectParser();
+ Parser digestingParser = new DigestingParser(autoDetectParser, new
CommonsDigester(10000, "SHA256"), true);
+
+ metadataList = getRecursiveMetadata("testLargeOLEDoc.doc",
digestingParser, new ParseContext());
+ assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
+
+
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
new file mode 100644
index 000000000..83661eca5
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <autoDetectParserConfig>
+ <params>
+ <!-- if the incoming metadata object has a ContentLength entry and it is
larger than this
+ value, spool the file to disk; this is useful for some file formats
that are more efficiently
+ processed via a file instead of an InputStream -->
+ <spoolToDisk>0</spoolToDisk>
+ <!-- the next four are parameters for the SecureContentHandler -->
+ <!-- threshold used in zip bomb detection. This many characters must be
written
+ before the maximum compression ratio is calculated -->
+ <outputThreshold>10000</outputThreshold>
+ <!-- maximum compression ratio between output characters and input bytes
-->
+ <maximumCompressionRatio>100</maximumCompressionRatio>
+ <!-- maximum XML element nesting level -->
+ <maximumDepth>100</maximumDepth>
+ <!-- maximum embedded file depth -->
+ <maximumPackageEntryDepth>100</maximumPackageEntryDepth>
+ <!-- throw an exception if a file has zero bytes -->
+ <throwOnZeroBytes>false</throwOnZeroBytes>
+ </params>
+ <!-- as of Tika 2.5.x, this is the preferred way to configure digests -->
+ <digesterFactory
class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
+ <params>
+ <markLimit>100000</markLimit>
+ <!-- this specifies SHA256 only -->
+ <algorithmString>sha256</algorithmString>
+ </params>
+ </digesterFactory>
+ </autoDetectParserConfig>
+</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc
new file mode 100644
index 000000000..473eada53
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testLargeOLEDoc.doc
differ