This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 1ba2f3548 TIKA-4533 -- need to fix TikaInputStream's setting of length
on spooling (#2379)
1ba2f3548 is described below
commit 1ba2f3548e02087cfdf2516bad06da005ac46698
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 29 18:57:49 2025 -0400
TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling
(#2379)
* TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling
---
.../java/org/apache/tika/io/TikaInputStream.java | 9 ++++++-
.../parser/microsoft/pst/PSTMailItemParser.java | 1 +
.../apache/tika/parser/AutoDetectParserTest.java | 31 +++++++++++++---------
3 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 84ccd6f17..254bff7ba 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -262,6 +262,7 @@ public class TikaInputStream extends TaggedInputStream {
tis.setOpenContainer(openContainer);
//this overwrites the length that was set in the constructor above
tis.setLength(length);
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
return tis;
}
@@ -791,7 +792,13 @@ public class TikaInputStream extends TaggedInputStream {
};
// Update length to file size. Update position, mark
- length = Files.size(path);
+ long sz = Files.size(path);
+ if (getOpenContainer() != null && sz == 0 && length > -1) {
+ //don't update size if there's an open container and the sz == 0
+ //hope that the length was sent in earlier via getFromContainer
+ } else {
+ length = sz;
+ }
position = 0;
mark = -1;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 3b2cded70..00ee7c571 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -247,6 +247,7 @@ public class PSTMailItemParser implements Parser {
attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename);
attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+ attachMeta.set(Metadata.CONTENT_LENGTH,
Integer.toString(attachment.getSize()));
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", filename);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index c3621576f..bf16e5f62 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.extractor.RUnpackExtractorFactory;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -561,31 +562,35 @@ public class AutoDetectParserTest extends TikaTest {
}
}
- @Test
- public void testLargeEmbeddedOle2Object() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc");
- assertEquals(3, metadataList.size());
-
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
- }
-
@Test
public void testDigestingOpenContainers() throws Exception {
+ //TIKA-4533 -- this tests both that a very large embedded OLE doc
doesn't cause a zip bomb
+ //exception AND that the sha for the embedded OLE doc is not the sha
for a zero-byte file
String expectedSha =
"bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78";
TikaConfig tikaConfig = null;
try (InputStream is =
AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) {
tikaConfig = new TikaConfig(is);
}
- Parser parser = new AutoDetectParser(tikaConfig);
- List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext());
+ AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig);
+ //this models what happens in tika-pipes
+ if (autoDetectParser.getAutoDetectParserConfig()
+ .getEmbeddedDocumentExtractorFactory() == null) {
+ autoDetectParser.getAutoDetectParserConfig()
+
.setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory());
+ }
+ List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new
ParseContext());
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256"));
+
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
- //now test that we get the same digest if we warp the auto detect
parser vs configuring it
- Parser autoDetectParser = new AutoDetectParser();
- Parser digestingParser = new DigestingParser(autoDetectParser, new
CommonsDigester(10000, "SHA256"), true);
+ DigestingParser.Digester digester = new CommonsDigester(10000,
"SHA256");
+ //now test that we get the same digest if we wrap the auto detect
parser vs configuring it
+ autoDetectParser = new AutoDetectParser();
+ Parser digestingParser = new DigestingParser(autoDetectParser,
digester, true);
metadataList = getRecursiveMetadata("testLargeOLEDoc.doc",
digestingParser, new ParseContext());
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
-
+ assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
}
}