This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e179523a5 TIKA-4474 -- force spooling on ooxml (#2386)
e179523a5 is described below

commit e179523a5e1c72db4ce6ed1a18eacdf843cca6ad
Author: MANISH <[email protected]>
AuthorDate: Thu Nov 6 20:41:17 2025 +0530

    TIKA-4474 -- force spooling on ooxml (#2386)
    
    * TIKA-4474: force spool ooxml files
    
    * TIKA-4474: handle write limit reached in test
---
 .../src/test/java/org/apache/tika/TikaTest.java    |  10 +++--
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  44 ++-------------------
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |   7 ++++
 .../test-documents/testRecordSizeExceeded.xlsx     | Bin 0 -> 12364136 bytes
 4 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 4345c2a03..c76182341 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -42,8 +42,10 @@ import java.util.Set;
 
 import org.apache.commons.io.IOUtils;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TikaInputStream;
@@ -569,10 +571,12 @@ public abstract class TikaTest {
     public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata)
             throws Exception {
         ContentHandler handler = new BodyContentHandler(1000000);
-        try {
+        try(is){
             parser.parse(is, handler, metadata, context);
-        } finally {
-            is.close();
+        } catch (SAXException e) {
+            if (!WriteLimitReachedException.isWriteLimitReached(e)) {
+                throw e;
+            }
         }
         return handler.toString();
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 35cbbb6d2..936f9f7c9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -105,10 +105,10 @@ public class OOXMLExtractorFactory {
             OOXMLExtractor extractor = null;
 
             // Locate or Open the OPCPackage for the file
-            TikaInputStream tis = TikaInputStream.cast(stream);
-            if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) {
+            TikaInputStream tis = TikaInputStream.get(stream);
+            if (tis.getOpenContainer() instanceof OPCPackageWrapper) {
                 pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage();
-            } else if (tis != null && tis.hasFile()) {
+            } else {
                 try {
                     pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
                 } catch (InvalidOperationException e) {
@@ -117,44 +117,6 @@ public class OOXMLExtractorFactory {
                     pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
                 }
                 tis.setOpenContainer(new OPCPackageWrapper(pkg));
-            } else {
-                //OPCPackage slurps rris into memory so we can close rris
-                //without apparent problems
-                mustRevertPackage = true;
-                try (RereadableInputStream rereadableInputStream = new RereadableInputStream(stream,
-                        MAX_BUFFER_LENGTH, false)) {
-                    try {
-                        pkg = OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream));
-                    } catch (UnsupportedZipFeatureException e) {
-                        if (e.getFeature() !=
-                                UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
-                            throw e;
-                        }
-                        rereadableInputStream.rewind();
-                        tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile();
-                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
-                        //if there isn't enough left to be opened as a package
-                        //throw an exception -- we may want to fall back to streaming
-                        //parsing
-                        pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
-                    } catch (IOException e) {
-                        if (e instanceof EOFException) {
-                            //keep going
-                        } else if (e instanceof IOException && e.getMessage() != null &&
-                                e.getMessage().contains("Truncated")) {
-                            //keep going
-                        } else {
-                            throw e;
-                        }
-                        rereadableInputStream.rewind();
-                        tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile();
-                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
-                        //if there isn't enough left to be opened as a package
-                        //throw an exception -- we may want to fall back to streaming
-                        //parsing
-                        pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
-                    }
-                }
             }
 
             if (pkg != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index fef9ef648..c50a3077a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1814,4 +1814,11 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
         assertEquals("true", m.get(Office.HAS_TRACK_CHANGES));
         assertEquals("true", m.get(Office.HAS_COMMENTS));
     }
+
+    @Test
+    public void testNoRecordSizeOverflow() throws Exception{
+        //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled
+        String content = getText("testRecordSizeExceeded.xlsx");
+        assertContains("Repetitive content pattern 3 for compression test row 1", content);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx
new file mode 100644
index 000000000..c93c487ef
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx differ

Reply via email to