This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new d1d797445 Revert "TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (#2379)"
d1d797445 is described below

commit d1d7974455d70622a6cfdcbe077e4854a217b7a3
Author: tallison <[email protected]>
AuthorDate: Fri Oct 31 17:02:40 2025 -0400

    Revert "TIKA-4533 -- need to fix TikaInputStream's setting of length on spooling (#2379)"
    
    This reverts commit 8bec259d0593db9d785db29acb087b906d38acba.
---
 .../java/org/apache/tika/io/TikaInputStream.java   |  9 +------
 .../parser/microsoft/pst/PSTMailItemParser.java    |  1 -
 .../apache/tika/parser/AutoDetectParserTest.java   | 31 +++++++++-------------
 3 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java 
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 71dbe97cb..6ba1d2d97 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -257,7 +257,6 @@ public class TikaInputStream extends TaggedInputStream {
         tis.setOpenContainer(openContainer);
         //this overwrites the length that was set in the constructor above
         tis.setLength(length);
-        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
         return tis;
     }
 
@@ -776,13 +775,7 @@ public class TikaInputStream extends TaggedInputStream {
             };
 
             // Update length to file size. Update position, mark
-            long sz = Files.size(path);
-            if (getOpenContainer() != null && sz == 0 && length > -1) {
-                //don't update size if there's an open container and the sz == 0
-                //hope that the length was sent in earlier via getFromContainer
-            } else {
-                length = sz;
-            }
+            length = Files.size(path);
             position = 0;
             mark = -1;
         }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 28791bb7c..5532525ea 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -245,7 +245,6 @@ public class PSTMailItemParser implements Parser {
         attachMeta.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
         attachMeta.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filename);
         attachMeta.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-        attachMeta.set(Metadata.CONTENT_LENGTH, 
Integer.toString(attachment.getSize()));
         AttributesImpl attributes = new AttributesImpl();
         attributes.addAttribute("", "class", "class", "CDATA", "embedded");
         attributes.addAttribute("", "id", "id", "CDATA", filename);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index bf16e5f62..c3621576f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -46,7 +46,6 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.exception.ZeroByteFileException;
-import org.apache.tika.extractor.RUnpackExtractorFactory;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -562,35 +561,31 @@ public class AutoDetectParserTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testLargeEmbeddedOle2Object() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testLargeOLEDoc.doc");
+        assertEquals(3, metadataList.size());
+        
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+    }
+
     @Test
     public void testDigestingOpenContainers() throws Exception {
-        //TIKA-4533 -- this tests both that a very large embedded OLE doc 
doesn't cause a zip bomb
-        //exception AND that the sha for the embedded OLE doc is not the sha 
for a zero-byte file
         String expectedSha = 
"bbc2057a1ff8fe859a296d2fbb493fc0c3e5796749ba72507c0e13f7a3d81f78";
         TikaConfig tikaConfig = null;
         try (InputStream is = 
AutoDetectParserTest.class.getResourceAsStream("/configs/tika-4533.xml")) {
             tikaConfig = new TikaConfig(is);
         }
-        AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig);
-        //this models what happens in tika-pipes
-        if (autoDetectParser.getAutoDetectParserConfig()
-                    .getEmbeddedDocumentExtractorFactory() == null) {
-            autoDetectParser.getAutoDetectParserConfig()
-                                                     
.setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory());
-        }
-        List<Metadata> metadataList = 
getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, new 
ParseContext());
+        Parser parser = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = 
getRecursiveMetadata("testLargeOLEDoc.doc", parser, new ParseContext());
         assertEquals(expectedSha, 
metadataList.get(2).get("X-TIKA:digest:SHA256"));
-        
assertNull(metadataList.get(2).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
-        assertEquals(2049290L, 
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
 
-        DigestingParser.Digester digester = new CommonsDigester(10000, 
"SHA256");
+        //now test that we get the same digest if we warp the auto detect 
parser vs configuring it
+        Parser autoDetectParser = new AutoDetectParser();
+        Parser digestingParser = new DigestingParser(autoDetectParser, new 
CommonsDigester(10000, "SHA256"), true);
 
-        //now test that we get the same digest if we wrap the auto detect 
parser vs configuring it
-        autoDetectParser = new AutoDetectParser();
-        Parser digestingParser = new DigestingParser(autoDetectParser, 
digester, true);
         metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", 
digestingParser, new ParseContext());
         assertEquals(expectedSha, 
metadataList.get(2).get("X-TIKA:digest:SHA256").toLowerCase(Locale.US));
-        assertEquals(2049290L, 
Long.parseLong(metadataList.get(2).get(Metadata.CONTENT_LENGTH)));
+
 
     }
 }

Reply via email to