This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch revert-2387-revert-2386-TIKA-4474 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d2cd7fab860321a8cf0a9f22fbfff61d4610f2aa Author: Tim Allison <[email protected]> AuthorDate: Thu Nov 6 10:15:42 2025 -0500 Revert "Revert "TIKA-4474 -- force spooling on ooxml (#2386)" (#2387)" This reverts commit f00dbcee7f50fe14a9ea1b0e13e6895531870c68. --- .../src/test/java/org/apache/tika/TikaTest.java | 10 +++-- .../microsoft/ooxml/OOXMLExtractorFactory.java | 44 ++------------------- .../parser/microsoft/ooxml/OOXMLParserTest.java | 7 ++++ .../test-documents/testRecordSizeExceeded.xlsx | Bin 0 -> 12364136 bytes 4 files changed, 17 insertions(+), 44 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 4345c2a03..c76182341 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -42,8 +42,10 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedResourceHandler; import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; @@ -569,10 +571,12 @@ public abstract class TikaTest { public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception { ContentHandler handler = new BodyContentHandler(1000000); - try { + try(is){ parser.parse(is, handler, metadata, context); - } finally { - is.close(); + } catch (SAXException e) { + if (!WriteLimitReachedException.isWriteLimitReached(e)) { + throw e; + } } return handler.toString(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 35cbbb6d2..936f9f7c9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -105,10 +105,10 @@ public class OOXMLExtractorFactory { OOXMLExtractor extractor = null; // Locate or Open the OPCPackage for the file - TikaInputStream tis = TikaInputStream.cast(stream); - if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) { + TikaInputStream tis = TikaInputStream.get(stream); + if (tis.getOpenContainer() instanceof OPCPackageWrapper) { pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage(); - } else if (tis != null && tis.hasFile()) { + } else { try { pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); } catch (InvalidOperationException e) { @@ -117,44 +117,6 @@ public class OOXMLExtractorFactory { pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); } tis.setOpenContainer(new OPCPackageWrapper(pkg)); - } else { - //OPCPackage slurps rris into memory so we can close rris - //without apparent problems - mustRevertPackage = true; - try (RereadableInputStream rereadableInputStream = new RereadableInputStream(stream, - MAX_BUFFER_LENGTH, false)) { - try { - pkg = OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream)); - } catch (UnsupportedZipFeatureException e) { - if (e.getFeature() != - UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { - throw e; - } - rereadableInputStream.rewind(); - tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); - ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); - //if there isn't enough left to be opened as a package - //throw an exception -- we may want to fall back to streaming - //parsing - pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); - } catch (IOException e) { - if (e instanceof EOFException) { - //keep going - } else if (e instanceof IOException && e.getMessage() != null && - e.getMessage().contains("Truncated")) { - //keep going - } else { - throw e; - } - rereadableInputStream.rewind(); - tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); - ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); - //if there isn't enough left to be opened as a package - //throw an exception -- we may want to fall back to streaming - //parsing - pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); - } - } } if (pkg != null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index fef9ef648..c50a3077a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1814,4 +1814,11 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("true", m.get(Office.HAS_TRACK_CHANGES)); assertEquals("true", m.get(Office.HAS_COMMENTS)); } + + @Test + public void testNoRecordSizeOverflow() throws Exception{ + //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled + String content = getText("testRecordSizeExceeded.xlsx"); + assertContains("Repetitive content pattern 3 for compression test row 1", content); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx new file mode 100644 index 000000000..c93c487ef Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx differ
