This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f00dbcee7 Revert "TIKA-4474 -- force spooling on ooxml (#2386)" (#2387)
f00dbcee7 is described below
commit f00dbcee7f50fe14a9ea1b0e13e6895531870c68
Author: Tim Allison <[email protected]>
AuthorDate: Thu Nov 6 10:11:46 2025 -0500
Revert "TIKA-4474 -- force spooling on ooxml (#2386)" (#2387)
This reverts commit e179523a5e1c72db4ce6ed1a18eacdf843cca6ad.
---
.../src/test/java/org/apache/tika/TikaTest.java | 10 ++---
.../microsoft/ooxml/OOXMLExtractorFactory.java | 44 +++++++++++++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 7 ----
.../test-documents/testRecordSizeExceeded.xlsx | Bin 12364136 -> 0 bytes
4 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index c76182341..4345c2a03 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -42,10 +42,8 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
@@ -571,12 +569,10 @@ public abstract class TikaTest {
public String getText(InputStream is, Parser parser, ParseContext context,
Metadata metadata)
throws Exception {
ContentHandler handler = new BodyContentHandler(1000000);
- try(is){
+ try {
parser.parse(is, handler, metadata, context);
- } catch (SAXException e) {
- if (!WriteLimitReachedException.isWriteLimitReached(e)) {
- throw e;
- }
+ } finally {
+ is.close();
}
return handler.toString();
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 936f9f7c9..35cbbb6d2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -105,10 +105,10 @@ public class OOXMLExtractorFactory {
OOXMLExtractor extractor = null;
// Locate or Open the OPCPackage for the file
- TikaInputStream tis = TikaInputStream.get(stream);
- if (tis.getOpenContainer() instanceof OPCPackageWrapper) {
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis != null && tis.getOpenContainer() instanceof
OPCPackageWrapper) {
pkg = ((OPCPackageWrapper)
tis.getOpenContainer()).getOPCPackage();
- } else {
+ } else if (tis != null && tis.hasFile()) {
try {
pkg = OPCPackage.open(tis.getFile().getPath(),
PackageAccess.READ);
} catch (InvalidOperationException e) {
@@ -117,6 +117,44 @@ public class OOXMLExtractorFactory {
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
tis.setOpenContainer(new OPCPackageWrapper(pkg));
+ } else {
+ //OPCPackage slurps rris into memory so we can close rris
+ //without apparent problems
+ mustRevertPackage = true;
+ try (RereadableInputStream rereadableInputStream = new
RereadableInputStream(stream,
+ MAX_BUFFER_LENGTH, false)) {
+ try {
+ pkg =
OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream));
+ } catch (UnsupportedZipFeatureException e) {
+ if (e.getFeature() !=
+
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ throw e;
+ }
+ rereadableInputStream.rewind();
+ tmpRepairedCopy =
Files.createTempFile("tika-ooxml-repair-", "").toFile();
+ ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy, false);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to streaming
+ //parsing
+ pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
+ } catch (IOException e) {
+ if (e instanceof EOFException) {
+ //keep going
+ } else if (e instanceof IOException && e.getMessage()
!= null &&
+ e.getMessage().contains("Truncated")) {
+ //keep going
+ } else {
+ throw e;
+ }
+ rereadableInputStream.rewind();
+ tmpRepairedCopy =
Files.createTempFile("tika-ooxml-repair-", "").toFile();
+ ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy, false);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to streaming
+ //parsing
+ pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
+ }
+ }
}
if (pkg != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index c50a3077a..fef9ef648 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1814,11 +1814,4 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertEquals("true", m.get(Office.HAS_TRACK_CHANGES));
assertEquals("true", m.get(Office.HAS_COMMENTS));
}
-
- @Test
- public void testNoRecordSizeOverflow() throws Exception{
- //TIKA-4474 -- test: files (passed as stream) no longer have limit on record size as they are spooled
- String content = getText("testRecordSizeExceeded.xlsx");
- assertContains("Repetitive content pattern 3 for compression test row
1", content);
- }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx
deleted file mode 100644
index c93c487ef..000000000
Binary files a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx and /dev/null differ