This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e035d0110e TIKA-4688 -- fix truncated ooxml regression (#2691)
e035d0110e is described below
commit e035d0110ec0aa04cf4df1e3271fa5dec7289fcb
Author: Tim Allison <[email protected]>
AuthorDate: Wed Mar 11 10:34:40 2026 -0400
TIKA-4688 -- fix truncated ooxml regression (#2691)
---
.../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 11 ++++++++++-
.../tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java | 16 ++++++++++++++++
.../org/apache/tika/pipes/core/server/ParseHandler.java | 4 ++++
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 64ca412311..8558f37c21 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -58,6 +58,7 @@ import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+
/**
* Figures out the correct {@link OOXMLExtractor} for the supplied document and
* returns it.
@@ -93,7 +94,15 @@ public class OOXMLExtractorFactory {
if (tis.getOpenContainer() instanceof OPCPackageWrapper) {
pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage();
} else {
- pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ);
+ // POI 5.x can throw InvalidOperationException (a RuntimeException
+ // extending OpenXML4JRuntimeException) for truncated/corrupt zip files.
+ // The detector should have salvaged if needed, but catch broadly here
+ // as a safety net.
+ try {
+ pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ);
+ } catch (RuntimeException e) {
+ throw new TikaException("Error opening OOXML file", e);
+ }
tis.setOpenContainer(new OPCPackageWrapper(pkg));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 179f3106f4..0a82cfe0df 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -56,6 +56,22 @@ public class TruncatedOOXMLTest extends TikaTest {
}
+ @Test
+ public void testWordTruncNoCentralDirectory() throws Exception {
+ // Truncated enough that the zip central directory is missing,
+ // but [Content_Types].xml and document.xml are intact.
+ // This exercises the ZipSalvager + OPCPackage fallback path.
+ List<Metadata> metadataList =
+ getRecursiveMetadata(truncate("testWORD_various.docx", 13500), true);
+ assertEquals(1, metadataList.size());
+ Metadata metadata = metadataList.get(0);
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("This is the header", content);
+ assertContains("Suddenly some Japanese", content);
+ }
+
@Test
public void testTruncation() throws Exception {
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index 79d233ba4e..c97c1311df 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -44,6 +44,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParseRecord;
+import org.apache.tika.parser.ParsingIntent;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.ParseMode;
@@ -143,6 +144,9 @@ class ParseHandler {
LOG.warn("problem digesting: " + t.getId(), e);
}
}
+ // Signal to detectors that parsing will follow, so they can prepare
+ // resources (e.g., ZipSalvager for truncated zips)
+ parseContext.set(ParsingIntent.class, ParsingIntent.WILL_PARSE);
try {
MediaType mt = detector.detect(tis, metadata, parseContext);
metadata.set(Metadata.CONTENT_TYPE, mt.toString());