This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e179523a5 TIKA-4474 -- force spooling on ooxml (#2386)
e179523a5 is described below
commit e179523a5e1c72db4ce6ed1a18eacdf843cca6ad
Author: MANISH <[email protected]>
AuthorDate: Thu Nov 6 20:41:17 2025 +0530
TIKA-4474 -- force spooling on ooxml (#2386)
* TIKA-4474: force spool ooxml files
* TIKA-4474: handle write limit reached in test
---
.../src/test/java/org/apache/tika/TikaTest.java | 10 +++--
.../microsoft/ooxml/OOXMLExtractorFactory.java | 44 ++-------------------
.../parser/microsoft/ooxml/OOXMLParserTest.java | 7 ++++
.../test-documents/testRecordSizeExceeded.xlsx | Bin 0 -> 12364136 bytes
4 files changed, 17 insertions(+), 44 deletions(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 4345c2a03..c76182341 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -42,8 +42,10 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
@@ -569,10 +571,12 @@ public abstract class TikaTest {
public String getText(InputStream is, Parser parser, ParseContext context,
Metadata metadata)
throws Exception {
ContentHandler handler = new BodyContentHandler(1000000);
- try {
+ try(is){
parser.parse(is, handler, metadata, context);
- } finally {
- is.close();
+ } catch (SAXException e) {
+ if (!WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw e;
+ }
}
return handler.toString();
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 35cbbb6d2..936f9f7c9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -105,10 +105,10 @@ public class OOXMLExtractorFactory {
OOXMLExtractor extractor = null;
// Locate or Open the OPCPackage for the file
- TikaInputStream tis = TikaInputStream.cast(stream);
- if (tis != null && tis.getOpenContainer() instanceof
OPCPackageWrapper) {
+ TikaInputStream tis = TikaInputStream.get(stream);
+ if (tis.getOpenContainer() instanceof OPCPackageWrapper) {
pkg = ((OPCPackageWrapper)
tis.getOpenContainer()).getOPCPackage();
- } else if (tis != null && tis.hasFile()) {
+ } else {
try {
pkg = OPCPackage.open(tis.getFile().getPath(),
PackageAccess.READ);
} catch (InvalidOperationException e) {
@@ -117,44 +117,6 @@ public class OOXMLExtractorFactory {
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
tis.setOpenContainer(new OPCPackageWrapper(pkg));
- } else {
- //OPCPackage slurps rris into memory so we can close rris
- //without apparent problems
- mustRevertPackage = true;
- try (RereadableInputStream rereadableInputStream = new
RereadableInputStream(stream,
- MAX_BUFFER_LENGTH, false)) {
- try {
- pkg =
OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream));
- } catch (UnsupportedZipFeatureException e) {
- if (e.getFeature() !=
-
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
- throw e;
- }
- rereadableInputStream.rewind();
- tmpRepairedCopy =
Files.createTempFile("tika-ooxml-repair-", "").toFile();
- ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy, false);
- //if there isn't enough left to be opened as a package
- //throw an exception -- we may want to fall back to
streaming
- //parsing
- pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
- } catch (IOException e) {
- if (e instanceof EOFException) {
- //keep going
- } else if (e instanceof IOException && e.getMessage()
!= null &&
- e.getMessage().contains("Truncated")) {
- //keep going
- } else {
- throw e;
- }
- rereadableInputStream.rewind();
- tmpRepairedCopy =
Files.createTempFile("tika-ooxml-repair-", "").toFile();
- ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy, false);
- //if there isn't enough left to be opened as a package
- //throw an exception -- we may want to fall back to
streaming
- //parsing
- pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
- }
- }
}
if (pkg != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index fef9ef648..c50a3077a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1814,4 +1814,11 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertEquals("true", m.get(Office.HAS_TRACK_CHANGES));
assertEquals("true", m.get(Office.HAS_COMMENTS));
}
+
+ @Test
+ public void testNoRecordSizeOverflow() throws Exception{
+ //TIKA-4474 -- test: files (passed as stream) no longer have limit on
record size as they are spooled
+ String content = getText("testRecordSizeExceeded.xlsx");
+ assertContains("Repetitive content pattern 3 for compression test row
1", content);
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx
new file mode 100644
index 000000000..c93c487ef
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRecordSizeExceeded.xlsx differ