This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch revert-2386-TIKA-4474 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3b47000e7b0f0610a8719c12ace2741cb53fc9ee Author: Tim Allison <[email protected]> AuthorDate: Thu Nov 6 10:11:24 2025 -0500 Revert "TIKA-4474 -- force spooling on ooxml (#2386)" This reverts commit e179523a5e1c72db4ce6ed1a18eacdf843cca6ad. --- .../src/test/java/org/apache/tika/TikaTest.java | 10 ++--- .../microsoft/ooxml/OOXMLExtractorFactory.java | 44 +++++++++++++++++++-- .../parser/microsoft/ooxml/OOXMLParserTest.java | 7 ---- .../test-documents/testRecordSizeExceeded.xlsx | Bin 12364136 -> 0 bytes 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index c76182341..4345c2a03 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -42,10 +42,8 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; import org.apache.tika.config.TikaConfig; -import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedResourceHandler; import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; @@ -571,12 +569,10 @@ public abstract class TikaTest { public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception { ContentHandler handler = new BodyContentHandler(1000000); - try(is){ + try { parser.parse(is, handler, metadata, context); - } catch (SAXException e) { - if (!WriteLimitReachedException.isWriteLimitReached(e)) { - throw e; - } + } finally { + is.close(); } return handler.toString(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 936f9f7c9..35cbbb6d2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -105,10 +105,10 @@ public class OOXMLExtractorFactory { OOXMLExtractor extractor = null; // Locate or Open the OPCPackage for the file - TikaInputStream tis = TikaInputStream.get(stream); - if (tis.getOpenContainer() instanceof OPCPackageWrapper) { + TikaInputStream tis = TikaInputStream.cast(stream); + if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) { pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage(); - } else { + } else if (tis != null && tis.hasFile()) { try { pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); } catch (InvalidOperationException e) { @@ -117,6 +117,44 @@ public class OOXMLExtractorFactory { pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); } tis.setOpenContainer(new OPCPackageWrapper(pkg)); + } else { + //OPCPackage slurps rris into memory so we can close rris + //without apparent problems + mustRevertPackage = true; + try (RereadableInputStream rereadableInputStream = new RereadableInputStream(stream, + MAX_BUFFER_LENGTH, false)) { + try { + pkg = OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream)); + } catch (UnsupportedZipFeatureException e) { + if (e.getFeature() != + UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + throw e; + } + rereadableInputStream.rewind(); + tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); + ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); + //if there isn't enough left to be opened as a package + //throw an exception -- we may want to fall back to streaming + //parsing + pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); + } catch (IOException e) { + if (e instanceof EOFException) { + //keep going + } else if (e instanceof IOException && e.getMessage() != null && + e.getMessage().contains("Truncated")) { + //keep going + } else { + throw e; + } + rereadableInputStream.rewind(); + tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); + ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); + //if there isn't enough left to be opened as a package + //throw an exception -- we may want to fall back to streaming + //parsing + pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); + } + } } if (pkg != null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index c50a3077a..fef9ef648 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1814,11 +1814,4 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("true", m.get(Office.HAS_TRACK_CHANGES)); assertEquals("true", m.get(Office.HAS_COMMENTS)); } - - @Test - public void testNoRecordSizeOverflow() throws Exception{ - //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled - String content = getText("testRecordSizeExceeded.xlsx"); - assertContains("Repetitive content pattern 3 for compression test row 1", content); - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx deleted file mode 100644 index c93c487ef..000000000 Binary files a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx and /dev/null differ
