This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new d1d797445 Revert "TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (#2379)"
d1d797445 is described below
commit d1d7974455d70622a6cfdcbe077e4854a217b7a3
Author: tallison <[email protected]>
AuthorDate: Fri Oct 31 17:02:40 2025 -0400
Revert "TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (#2379)"
This reverts commit 8bec259d0593db9d785db29acb087b906d38acba.
---
.../java/org/apache/tika/io/TikaInputStream.java | 9 +------
.../parser/microsoft/pst/PSTMailItemParser.java | 1 -
.../apache/tika/parser/AutoDetectParserTest.java | 31 +++++++++-------------
3 files changed, 14 insertions(+), 27 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 71dbe97cb..6ba1d2d97 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -257,7 +257,6 @@ public class TikaInputStream extends TaggedInputStream {
tis.setOpenContainer(openContainer);
//this overwrites the length that was set in the constructor above
tis.setLength(length);
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
return tis;
}
@@ -776,13 +775,7 @@ public class TikaInputStream extends TaggedInputStream {
};
// Update length to file size. Update position, mark
- long sz = Files.size(path);
- if (getOpenContainer() != null && sz == 0 && length > -1) {
- //don't update size if there's an open container and the sz == 0
- //hope that the length was sent in earlier via getFromContainer
- } else {
- length = sz;
- }
+ length = Files.size(path);
position = 0;
mark = -1;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 28791bb7c..5532525ea 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -245,7 +245,6 @@ public class PSTMailItemParser implements Parser {
attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename);
attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
- attachMeta.set(Metadata.CONTENT_LENGTH, Integer.toString(attachment.getSize()));
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", filename);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index bf16e5f62..c3621576f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -46,7 +46,6 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
-import org.apache.tika.extractor.RUnpackExtractorFactory;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -562,35 +561,31 @@ public class AutoDetectParserTest extends TikaTest {
}
}
+ @Test
+ public void testLargeEmbeddedOle2Object() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc");
+ assertEquals(3, metadataList.size());
+ assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ }
+
@Test
public void testDigestingOpenContainers() throws Exception {
- //TIKA-4533 -- this tests both that a very large embedded OLE doc
doesn't cause a zip bomb
- //exception AND that the sha for the embedded OLE doc is not the sha
for a zero-byte file
String expectedSha =
"bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78";
TikaConfig tikaConfig = null;
try (InputStream is =
AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) {
tikaConfig = new TikaConfig(is);
}
- AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig);
- //this models what happens in tika-pipes
- if (autoDetectParser.getAutoDetectParserConfig()
- .getEmbeddedDocumentExtractorFactory() == null) {
- autoDetectParser.getAutoDetectParserConfig()
-
.setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory());
- }
- List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new
ParseContext());
+ Parser parser = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList =
getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext());
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256"));
-
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
- assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
- DigestingParser.Digester digester = new CommonsDigester(10000,
"SHA256");
+ //now test that we get the same digest if we warp the auto detect
parser vs configuring it
+ Parser autoDetectParser = new AutoDetectParser();
+ Parser digestingParser = new DigestingParser(autoDetectParser, new
CommonsDigester(10000, "SHA256"), true);
- //now test that we get the same digest if we wrap the auto detect
parser vs configuring it
- autoDetectParser = new AutoDetectParser();
- Parser digestingParser = new DigestingParser(autoDetectParser,
digester, true);
metadataList = getRecursiveMetadata("testLargeOLEDoc.doc",
digestingParser, new ParseContext());
assertEquals(expectedSha,
metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
- assertEquals(2049290L,
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
+
}
}