This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 67612b8f805ad5d1085db14922d3b3b6ddce19bf Author: tballison <[email protected]> AuthorDate: Wed Apr 19 10:11:29 2017 -0400 TIKA-1195 and TIKA-2329 --- CHANGES.txt | 14 +- tika-parsers/pom.xml | 2 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 5 +- .../tika/parser/microsoft/ooxml/OOXMLParser.java | 9 +- .../ooxml/XSSFBExcelExtractorDecorator.java | 282 +++++++++++++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 10 +- .../tika/parser/microsoft/ExcelParserTest.java | 38 --- .../parser/microsoft/ooxml/OOXMLParserTest.java | 61 +++-- 8 files changed, 347 insertions(+), 74 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 1fe98a7..610c186 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,12 @@ Release 1.15 - ?? + * Change default behavior to parse embedded documents even if the user + forgets to specify a Parser.class in the ParseContext (TIKA-2096). + Users who wish to parse only the container document should set + an EmptyParser as the Parser.class in the ParseContext. + + * Add support for the XLSB format (TIKA-1195). + * Change default behavior of Office Parsers to _not_ extract Macros. User needs to setExtractMacros to "true" (TIKA-2302). @@ -64,14 +71,9 @@ Release 1.15 - ?? * Added experimental SAX parser for .docx files. To select this parser, set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191). - * Change default behavior to parse embedded documents even if the user - forgets to specify a Parser.class in the ParseContext (TIKA-2096). - Users who wish to parse only the container document should set - an EmptyParser as the Parser.class in the ParseContext. - * Add mime detection and parser for Word 2006ML format (TIKA-2179). - * Upgrade to POI 3.16-beta2 (TIKA-2116, TIKA-2181). + * Upgrade to POI 3.16 (TIKA-2116, TIKA-2181, TIKA-2329). * Allow configuration of timeout for ForkParser (TIKA-2170). diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index e4d04ca..58ac745 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>3.16-beta2</poi.version> + <poi.version>3.16</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 86d74df..92963a8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -32,6 +32,7 @@ import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; @@ -104,8 +105,10 @@ public class OOXMLExtractorFactory { } POIXMLDocument document = poiExtractor.getDocument(); + if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) { + extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale); - if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { + } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator( context, poiExtractor, locale); } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java index 10af01c..fbc0f93 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java @@ -74,6 +74,8 @@ public class OOXMLParser extends AbstractOfficeParser { MediaType.application("vnd.ms-visio.drawing"), MediaType.application("vnd.ms-xpsdocument"), MediaType.parse("model/vnd.dwfx+xps") + // MediaType.application("x-tika-ooxml") + ))); /** * We claim to support all OOXML files, but we actually don't support a small @@ -82,10 +84,9 @@ public class OOXMLParser extends AbstractOfficeParser { * by Tika and/or POI. */ protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("vnd.ms-xpsdocument"), - MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12") - ))); + Collections.singleton( + MediaType.application("vnd.ms-xpsdocument") + ); /** * Serial version UID */ diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java new file mode 100644 index 0000000..374fcb6 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.xssf.binary.XSSFBCommentsTable; +import org.apache.poi.xssf.binary.XSSFBSharedStringsTable; +import org.apache.poi.xssf.binary.XSSFBSheetHandler; +import org.apache.poi.xssf.binary.XSSFBStylesTable; +import org.apache.poi.xssf.eventusermodel.XSSFBReader; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; +import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xssf.usermodel.XSSFShape; +import org.apache.poi.xssf.usermodel.XSSFSimpleShape; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlException; +import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink; +import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps; +import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape; +import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { + + public XSSFBExcelExtractorDecorator( + ParseContext context, POIXMLTextExtractor extractor, Locale locale) { + super(context, extractor, locale); + } + + @Override + protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) { + //need to override this because setFormulasNotResults is not yet available + //for xlsb + //((XSSFBEventBasedExcelExtractor)extractor).setFormulasNotResults(false); + ((XSSFBEventBasedExcelExtractor)extractor).setLocale(locale); + } + + @Override + public void getXHTML( + ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException, XmlException, IOException, TikaException { + + this.metadata = metadata; + this.parseContext = context; + metadata.set(TikaMetadataKeys.PROTECTED, "false"); + + super.getXHTML(handler, metadata, context); + } + + /** + * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText() + */ + @Override + protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, + XmlException, IOException { + OPCPackage container = extractor.getPackage(); + + XSSFBSharedStringsTable strings; + XSSFBReader.SheetIterator iter; + XSSFBReader xssfReader; + XSSFBStylesTable styles; + try { + xssfReader = new XSSFBReader(container); + styles = xssfReader.getXSSFBStylesTable(); + iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData(); + strings = new XSSFBSharedStringsTable(container); + } catch (InvalidFormatException e) { + throw new XmlException(e); + } catch (OpenXML4JException oe) { + throw new XmlException(oe); + } + + while (iter.hasNext()) { + InputStream stream = iter.next(); + PackagePart sheetPart = iter.getSheetPart(); + addDrawingHyperLinks(sheetPart); + sheetParts.add(sheetPart); + + SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml); + XSSFBCommentsTable comments = iter.getXSSFBSheetComments(); + + // Start, and output the sheet name + xhtml.startElement("div"); + xhtml.element("h1", iter.getSheetName()); + + // Extract the main sheet contents + xhtml.startElement("table"); + xhtml.startElement("tbody"); + + processSheet(sheetExtractor, comments, styles, strings, stream); + + xhtml.endElement("tbody"); + xhtml.endElement("table"); + + // Output any headers and footers + // (Need to process the sheet to get them, so we can't + // do the headers before the contents) + for (String header : sheetExtractor.headers) { + extractHeaderFooter(header, xhtml); + } + for (String footer : sheetExtractor.footers) { + extractHeaderFooter(footer, xhtml); + } + List<XSSFShape> shapes = iter.getShapes(); + processShapes(shapes, xhtml); + + //for now dump sheet hyperlinks at bottom of page + //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes + //step 1: extract hyperlink info from bottom of page + //step 2: process as we do now, but with cached hyperlink relationship info + extractHyperLinks(sheetPart, xhtml); + // All done with this sheet + xhtml.endElement("div"); + } + } + + @Override + protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) + throws SAXException { + if (hf.length() > 0) { + xhtml.element("p", hf); + } + } + + private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException { + try { + for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) { + xhtml.startElement("a", "href", rel.getTargetURI().toString()); + xhtml.characters(rel.getTargetURI().toString()); + xhtml.endElement("a"); + } + } catch (InvalidFormatException e) { + //swallow + } + } + + private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException { + if (shapes == null) { + return; + } + for (XSSFShape shape : shapes) { + if (shape instanceof XSSFSimpleShape) { + String sText = ((XSSFSimpleShape) shape).getText(); + if (sText != null && sText.length() > 0) { + xhtml.element("p", sText); + } + extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml); + } + } + } + + private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException { + + if (ctShape == null) + return; + + CTShapeNonVisual nvSpPR = ctShape.getNvSpPr(); + if (nvSpPR == null) + return; + + CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr(); + if (cNvPr == null) + return; + + CTHyperlink ctHyperlink = cNvPr.getHlinkClick(); + if (ctHyperlink == null) + return; + + String url = drawingHyperlinks.get(ctHyperlink.getId()); + if (url != null) { + xhtml.startElement("a", "href", url); + xhtml.characters(url); + xhtml.endElement("a"); + } + + CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover(); + if (ctHoverHyperlink == null) + return; + + url = drawingHyperlinks.get(ctHoverHyperlink.getId()); + if (url != null) { + xhtml.startElement("a", "href", url); + xhtml.characters(url); + xhtml.endElement("a"); + } + + } + + private void processSheet( + SheetContentsHandler sheetContentsExtractor, + XSSFBCommentsTable comments, + XSSFBStylesTable styles, + XSSFBSharedStringsTable strings, + InputStream sheetInputStream) + throws IOException, SAXException { + + XSSFBSheetHandler xssfbSheetHandler = new XSSFBSheetHandler( + sheetInputStream, + styles, + comments, + strings, + sheetContentsExtractor, + formatter, + false + ); + xssfbSheetHandler.parse(); + } + + /** + * In Excel files, sheets have things embedded in them, + * and sheet drawings which have the images + */ + @Override + protected List<PackagePart> getMainDocumentParts() throws TikaException { + List<PackagePart> parts = new ArrayList<PackagePart>(); + for (PackagePart part : sheetParts) { + // Add the sheet + parts.add(part); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + + //add main document so that macros can be extracted + //by AbstractOOXMLExtractor + for (PackagePart part : extractor.getPackage(). + getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) { + parts.add(part); + } + + return parts; + } +} \ No newline at end of file diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index a8bee1e..dbf21d1 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -71,7 +71,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { * Allows access to headers/footers from raw xml strings */ protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper(); - private final XSSFEventBasedExcelExtractor extractor; protected final DataFormatter formatter; protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>(); protected final Map<String, String> drawingHyperlinks = new HashMap<>(); @@ -84,9 +83,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { this.parseContext = context; this.extractor = (XSSFEventBasedExcelExtractor)extractor; - // not yet supported in POI-3.16-beta3 - // this.extractor.setFormulasNotResults(false); - this.extractor.setLocale(locale); + configureExtractor(this.extractor, locale); if (locale == null) { formatter = new TikaExcelDataFormatter(); @@ -95,6 +92,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) { + ((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false); + ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale); + } + @Override public void getXHTML( ContentHandler handler, Metadata metadata, ParseContext context) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index cea5e9f..fc31958 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -267,44 +267,6 @@ public class ExcelParserTest extends TikaTest { } } - /** - * We don't currently support the .xlsb file format - * (an OOXML container with binary blobs), but we - * shouldn't break on these files either (TIKA-826) - */ - @Test - public void testExcelXLSB() throws Exception { - Detector detector = new DefaultDetector(); - AutoDetectParser parser = new AutoDetectParser(); - - Metadata m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); - - // Should be detected correctly - MediaType type; - try (InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL.xlsb")) { - type = detector.detect(input, m); - assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); - } - - // OfficeParser won't handle it - assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // OOXMLParser will (soon) handle it - assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // AutoDetectParser doesn't break on it - try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - parser.parse(input, handler, m, context); - - String content = handler.toString(); - assertEquals("", content); - } - } /** * Excel 5 and 95 are older formats, and only get basic support diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index af1ba27..6420545 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -27,7 +27,6 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import java.io.ByteArrayOutputStream; -import java.io.EOFException; import java.io.File; import java.io.InputStream; import java.io.PrintStream; @@ -43,8 +42,9 @@ import java.util.Map; import org.apache.poi.util.LocaleUtil; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -52,19 +52,21 @@ import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.microsoft.ExcelParserTest; +import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.WordParserTest; import org.apache.tika.sax.BodyContentHandler; import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; public class OOXMLParserTest extends TikaTest { @@ -1430,10 +1432,43 @@ public class OOXMLParserTest extends TikaTest { } @Test - @Ignore("until poi-3.16-beta3") + public void testExcelXLSB() throws Exception { + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + + Metadata m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); + + // Should be detected correctly + MediaType type; + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL.xlsb")) { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); + } + + // OfficeParser won't handle it + assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // OOXMLParser will (soon) handle it + assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // AutoDetectParser doesn't break on it + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content = handler.toString(); + assertContains("This is an example spreadsheet", content); + } + } + + @Test public void testXLSBVarious() throws Exception { - //make sure to turn MACROs on, after we turn them off by default OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); ParseContext parseContext = new ParseContext(); parseContext.set(OfficeParserConfig.class, officeParserConfig); List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext); @@ -1473,22 +1508,8 @@ public class OOXMLParserTest extends TikaTest { assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml); assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml); assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml); - } - @Test - public void testTruncated() throws Exception { - Parser p = new AutoDetectParser(); - ContentHandler handler = new DefaultHandler(); - Metadata metadata = new Metadata(); - ParseContext parseContext = new ParseContext(); - try (InputStream is = getTestDocument("testWORD_truncated.docx")) { - p.parse(is, handler, metadata, parseContext); - fail("should have thrown an EOF exception?!"); - } catch (TikaException e) { - Throwable cause = e.getCause(); - assertTrue(cause instanceof EOFException); - assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE)); - } + } } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
