This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8bec259d0593db9d785db29acb087b906d38acba Author: Tim Allison <[email protected]> AuthorDate: Wed Oct 29 18:57:49 2025 -0400 TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (#2379) * TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (cherry picked from commit 1ba2f3548e02087cfdf2516bad06da005ac46698) --- .../java/org/apache/tika/io/TikaInputStream.java | 9 ++++++- .../parser/microsoft/pst/PSTMailItemParser.java | 1 + .../apache/tika/parser/AutoDetectParserTest.java | 31 +++++++++++++--------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 6ba1d2d97..71dbe97cb 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -257,6 +257,7 @@ public class TikaInputStream extends TaggedInputStream { tis.setOpenContainer(openContainer); //this overwrites the length that was set in the constructor above tis.setLength(length); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); return tis; } @@ -775,7 +776,13 @@ public class TikaInputStream extends TaggedInputStream { }; // Update length to file size. Update position, mark - length = Files.size(path); + long sz = Files.size(path); + if (getOpenContainer() != null && sz == 0 && length > -1) { + //don't update size if there's an open container and the sz == 0 + //hope that the length was sent in earlier via getFromContainer + } else { + length = sz; + } position = 0; mark = -1; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index 5532525ea..28791bb7c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -245,6 +245,7 @@ public class PSTMailItemParser implements Parser { attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename); attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); + attachMeta.set(Metadata.CONTENT_LENGTH, Integer.toString(attachment.getSize())); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", filename); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index c3621576f..bf16e5f62 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -46,6 +46,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -561,31 +562,35 @@ public class AutoDetectParserTest extends TikaTest { } } - @Test - public void testLargeEmbeddedOle2Object() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc"); - assertEquals(3, metadataList.size()); - assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); - } - @Test public void testDigestingOpenContainers() throws Exception { + //TIKA-4533 -- this tests both that a very large embedded OLE doc doesn't cause a zip bomb + //exception AND that the sha for the embedded OLE doc is not the sha for a zero-byte file String expectedSha = "bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78"; TikaConfig tikaConfig = null; try (InputStream is = AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) { tikaConfig = new TikaConfig(is); } - Parser parser = new AutoDetectParser(tikaConfig); - List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext()); + AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig); + //this models what happens in tika-pipes + if (autoDetectParser.getAutoDetectParserConfig() + .getEmbeddedDocumentExtractorFactory() == null) { + autoDetectParser.getAutoDetectParserConfig() + .setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory()); + } + List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new ParseContext()); assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256")); + assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); - //now test that we get the same digest if we warp the auto detect parser vs configuring it - Parser autoDetectParser = new AutoDetectParser(); - Parser digestingParser = new DigestingParser(autoDetectParser, new CommonsDigester(10000, "SHA256"), true); + DigestingParser.Digester digester = new CommonsDigester(10000, "SHA256"); + //now test that we get the same digest if we wrap the auto detect parser vs configuring it + autoDetectParser = new AutoDetectParser(); + Parser digestingParser = new DigestingParser(autoDetectParser, digester, true); metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext()); assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US)); - + assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); } }
