This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit a004dd56492fa53feb772f3d8e9b86aae52b25e7 Author: tallison <[email protected]> AuthorDate: Mon Dec 16 16:34:25 2019 -0500 prevent NPE in SAX parsing options on truncated files. --- .../tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java | 11 +++++++++-- .../apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java | 10 ++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 141dee3..15f2c33 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -240,7 +240,7 @@ public class OOXMLExtractorFactory { } } - private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException { + private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws TikaException, XmlException, OpenXML4JException, IOException { PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"); if (packageRelationshipCollection.size() == 0) { packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"); @@ -250,6 +250,9 @@ public class OOXMLExtractorFactory { return null; } PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0)); + if (corePart == null) { + throw new TikaException("Couldn't find core part."); + } String targetContentType = corePart.getContentType(); for (XWPFRelation relation : XWPFWordExtractor.SUPPORTED_TYPES) { if (targetContentType.equals(relation.getContentType())) { @@ -259,7 +262,8 @@ public class OOXMLExtractorFactory { return null; } - private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws XmlException, OpenXML4JException, IOException { + private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws TikaException, XmlException, + OpenXML4JException, IOException { PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"); if (packageRelationshipCollection.size() == 0) { @@ -270,6 +274,9 @@ public class OOXMLExtractorFactory { return null; } PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0)); + if (corePart == null) { + throw new TikaException("Couldn't find core part"); + } String targetContentType = corePart.getContentType(); XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 7f29c2c..3fb3f98 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -42,6 +42,7 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.ctakes.typesystem.type.syntax.O; import org.apache.poi.util.LocaleUtil; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; @@ -1766,6 +1767,15 @@ public class OOXMLParserTest extends TikaTest { assertEquals("true", m.get(TikaCoreProperties.HAS_SIGNATURE)); } + + @Test(expected = org.apache.tika.exception.TikaException.class) + public void testTruncatedSAXDocx() throws Exception { + ParseContext pc = new ParseContext(); + OfficeParserConfig c = new OfficeParserConfig(); + c.setUseSAXDocxExtractor(true); + pc.set(OfficeParserConfig.class, c); + getRecursiveMetadata("testWORD_truncated.docx", pc); + } }
