This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4533b in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5f3681d60c2a67035087bf5083664a2d2ca42ce3 Author: tallison <[email protected]> AuthorDate: Wed Oct 29 17:10:09 2025 -0400 TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling --- .../java/org/apache/tika/io/TikaInputStream.java | 9 ++++++++- .../parser/microsoft/pst/PSTMailItemParser.java | 1 + .../apache/tika/parser/AutoDetectParserTest.java | 22 ++++++++++++++++------ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 84ccd6f17..254bff7ba 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -262,6 +262,7 @@ public class TikaInputStream extends TaggedInputStream { tis.setOpenContainer(openContainer); //this overwrites the length that was set in the constructor above tis.setLength(length); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); return tis; } @@ -791,7 +792,13 @@ public class TikaInputStream extends TaggedInputStream { }; // Update length to file size. 
Update position, mark - length = Files.size(path); + long sz = Files.size(path); + if (getOpenContainer() != null && sz == 0 && length > -1) { + //don't update size if there's an open container and the sz == 0 + //hope that the length was sent in earlier via getFromContainer + } else { + length = sz; + } position = 0; mark = -1; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index 3b2cded70..00ee7c571 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -247,6 +247,7 @@ public class PSTMailItemParser implements Parser { attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename); attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); + attachMeta.set(Metadata.CONTENT_LENGTH, Integer.toString(attachment.getSize())); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", filename); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index c3621576f..846b2ba3f 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -46,6 +46,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -575,17 +576,26 @@ public class AutoDetectParserTest extends TikaTest { try (InputStream is = AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) { tikaConfig = new TikaConfig(is); } - Parser parser = new AutoDetectParser(tikaConfig); - List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext()); + Parser autoDetectParser = new AutoDetectParser(tikaConfig); + if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig() + .getEmbeddedDocumentExtractorFactory() == null) { + ((AutoDetectParser) autoDetectParser) + .getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory( + new RUnpackExtractorFactory()); + } + List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new ParseContext()); assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256")); + assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); - //now test that we get the same digest if we warp the auto detect parser vs configuring it - Parser autoDetectParser = new AutoDetectParser(); - Parser digestingParser = new DigestingParser(autoDetectParser, new CommonsDigester(10000, "SHA256"), true); + 
DigestingParser.Digester digester = new CommonsDigester(10000, "SHA256"); + //now test that we get the same digest if we wrap the auto detect parser vs configuring it + autoDetectParser = new AutoDetectParser(); + Parser digestingParser = new DigestingParser(autoDetectParser, digester, true); metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", digestingParser, new ParseContext()); assertEquals(expectedSha, metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US)); - + assertEquals(2049290L, Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH))); } }
