This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a004dd56492fa53feb772f3d8e9b86aae52b25e7
Author: tallison <[email protected]>
AuthorDate: Mon Dec 16 16:34:25 2019 -0500

    prevent NPE in SAX parsing options on truncated files.
---
 .../tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java    | 11 +++++++++--
 .../apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java   | 10 ++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 141dee3..15f2c33 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -240,7 +240,7 @@ public class OOXMLExtractorFactory {
         }
     }
 
-    private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+    private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws TikaException, XmlException, OpenXML4JException, IOException {
         PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
         if (packageRelationshipCollection.size() == 0) {
             packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
@@ -250,6 +250,9 @@ public class OOXMLExtractorFactory {
             return null;
         }
         PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+        if (corePart == null) {
+            throw new TikaException("Couldn't find core part.");
+        }
         String targetContentType = corePart.getContentType();
         for (XWPFRelation relation : XWPFWordExtractor.SUPPORTED_TYPES) {
             if (targetContentType.equals(relation.getContentType())) {
@@ -259,7 +262,8 @@ public class OOXMLExtractorFactory {
         return null;
     }
 
-    private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws XmlException, OpenXML4JException, IOException {
+    private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws TikaException, XmlException,
+            OpenXML4JException, IOException {
 
         PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
         if (packageRelationshipCollection.size() == 0) {
@@ -270,6 +274,9 @@ public class OOXMLExtractorFactory {
             return null;
         }
         PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+        if (corePart == null) {
+            throw new TikaException("Couldn't find core part");
+        }
         String targetContentType = corePart.getContentType();
 
         XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 7f29c2c..3fb3f98 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -42,6 +42,7 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.ctakes.typesystem.type.syntax.O;
 import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
@@ -1766,6 +1767,15 @@ public class OOXMLParserTest extends TikaTest {
         assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE));
 
     }
+
+    @Test(expected = org.apache.tika.exception.TikaException.class)
+    public void testTruncatedSAXDocx() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig c = new OfficeParserConfig();
+        c.setUseSAXDocxExtractor(true);
+        pc.set(OfficeParserConfig.class, c);
+        getRecursiveMetadata("testWORD_truncated.docx", pc);
+    }
 }
 
 

Reply via email to