This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch 2.x in repository https://gitbox.apache.org/repos/asf/tika.git
commit a847a863d1e25a9ba8209cd28c3e98be153f34a5 Author: tballison <[email protected]> AuthorDate: Wed Apr 19 10:54:07 2017 -0400 TIKA-1195 and TIKA-2329, upgrade to POI 3.16-final and add xlsb parser --- CHANGES.txt | 14 +- tika-parser-modules/pom.xml | 2 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 5 +- .../tika/parser/microsoft/ooxml/OOXMLParser.java | 7 +- .../ooxml/XSSFBExcelExtractorDecorator.java | 281 +++++++++++++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 29 ++- .../tika/parser/microsoft/ExcelParserTest.java | 31 --- .../parser/microsoft/ooxml/OOXMLParserTest.java | 83 ++++++ .../test-documents/testEXCEL_various.xlsb | Bin 0 -> 22715 bytes 9 files changed, 396 insertions(+), 56 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index ec54b80..23696b4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,13 @@ Release 2.0 - ??? Release 1.15 -??? + * Change default behavior to parse embedded documents even if the user + forgets to specify a Parser.class in the ParseContext (TIKA-2096). + Users who wish to parse only the container document should set + an EmptyParser as the Parser.class in the ParseContext. + + * Add support for the XLSB format (TIKA-1195). + * Change default behavior of Office Parsers to _not_ extract Macros. User needs to setExtractMacros to "true" (TIKA-2302). @@ -71,14 +78,9 @@ Release 1.15 -??? * Added experimental SAX parser for .docx files. To select this parser, set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321). - * Change default behavior to parse embedded documents even if the user - forgets to specify a Parser.class in the ParseContext (TIKA-2096). - Users who wish to parse only the container document should set - an EmptyParser as the Parser.class in the ParseContext. - * Add mime detection and parser for Word 2006ML format (TIKA-2179). - * Upgrade to POI 3.16-beta2 (TIKA-2116, TIKA-2181). + * Upgrade to POI 3.16 (TIKA-2116, TIKA-2181, TIKA-2329). * Allow configuration of timeout for ForkParser (TIKA-2170). diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index 04c44de..d1496d3 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>3.16-beta2</poi.version> + <poi.version>3.16</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <pdfbox.version>2.0.5</pdfbox.version> diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 3443cf5..92dc385 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -32,6 +32,7 @@ import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; @@ -104,7 +105,9 @@ public class OOXMLExtractorFactory { } POIXMLDocument document = poiExtractor.getDocument(); - if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { + if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) { + extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale); + } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator( context, (XSSFEventBasedExcelExtractor) poiExtractor, locale); } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) { diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java index f5b1905..53b21fa 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java @@ -74,6 +74,8 @@ public class OOXMLParser extends AbstractOfficeParser { MediaType.application("vnd.ms-visio.drawing"), MediaType.application("vnd.ms-xpsdocument"), MediaType.parse("model/vnd.dwfx+xps") + // MediaType.application("x-tika-ooxml") + ))); /** * We claim to support all OOXML files, but we actually don't support a small @@ -82,10 +84,7 @@ public class OOXMLParser extends AbstractOfficeParser { * by Tika and/or POI. */ protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"), - MediaType.application("vnd.ms-xpsdocument") - ))); + Collections.singleton(MediaType.application("vnd.ms-xpsdocument")); /** * Serial version UID */ diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java new file mode 100644 index 0000000..54060b3 --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.xssf.binary.XSSFBCommentsTable; +import org.apache.poi.xssf.binary.XSSFBSharedStringsTable; +import org.apache.poi.xssf.binary.XSSFBSheetHandler; +import org.apache.poi.xssf.binary.XSSFBStylesTable; +import org.apache.poi.xssf.eventusermodel.XSSFBReader; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xssf.usermodel.XSSFShape; +import org.apache.poi.xssf.usermodel.XSSFSimpleShape; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlException; +import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink; +import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps; +import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape; +import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator { + + public XSSFBExcelExtractorDecorator( + ParseContext context, POIXMLTextExtractor extractor, Locale locale) { + super(context, extractor, locale); + } + + @Override + protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) { + //need to override this because setFormulasNotResults is not yet available + //for xlsb + //((XSSFBEventBasedExcelExtractor)extractor).setFormulasNotResults(false); + ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale); + } + + @Override + public void getXHTML( + ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException, XmlException, IOException, TikaException { + + this.metadata = metadata; + this.parseContext = context; + metadata.set(TikaMetadataKeys.PROTECTED, "false"); + + super.getXHTML(handler, metadata, context); + } + + /** + * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText() + */ + @Override + protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, + XmlException, IOException { + OPCPackage container = extractor.getPackage(); + + XSSFBSharedStringsTable strings; + XSSFBReader.SheetIterator iter; + XSSFBReader xssfReader; + XSSFBStylesTable styles; + try { + xssfReader = new XSSFBReader(container); + styles = xssfReader.getXSSFBStylesTable(); + iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData(); + strings = new XSSFBSharedStringsTable(container); + } catch (InvalidFormatException e) { + throw new XmlException(e); + } catch (OpenXML4JException oe) { + throw new XmlException(oe); + } + + while (iter.hasNext()) { + InputStream stream = iter.next(); + PackagePart sheetPart = iter.getSheetPart(); + addDrawingHyperLinks(sheetPart); + sheetParts.add(sheetPart); + + SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml); + XSSFBCommentsTable comments = iter.getXSSFBSheetComments(); + + // Start, and output the sheet name + xhtml.startElement("div"); + xhtml.element("h1", iter.getSheetName()); + + // Extract the main sheet contents + xhtml.startElement("table"); + xhtml.startElement("tbody"); + + processSheet(sheetExtractor, comments, styles, strings, stream); + + xhtml.endElement("tbody"); + xhtml.endElement("table"); + + // Output any headers and footers + // (Need to process the sheet to get them, so we can't + // do the headers before the contents) + for (String header : sheetExtractor.headers) { + extractHeaderFooter(header, xhtml); + } + for (String footer : sheetExtractor.footers) { + extractHeaderFooter(footer, xhtml); + } + List<XSSFShape> shapes = iter.getShapes(); + processShapes(shapes, xhtml); + + //for now dump sheet hyperlinks at bottom of page + //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes + //step 1: extract hyperlink info from bottom of page + //step 2: process as we do now, but with cached hyperlink relationship info + extractHyperLinks(sheetPart, xhtml); + // All done with this sheet + xhtml.endElement("div"); + } + } + + private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) + throws SAXException { + if (hf.length() > 0) { + xhtml.element("p", hf); + } + } + + private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException { + try { + for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) { + xhtml.startElement("a", "href", rel.getTargetURI().toString()); + xhtml.characters(rel.getTargetURI().toString()); + xhtml.endElement("a"); + } + } catch (InvalidFormatException e) { + //swallow + } + } + + private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException { + if (shapes == null) { + return; + } + for (XSSFShape shape : shapes) { + if (shape instanceof XSSFSimpleShape) { + String sText = ((XSSFSimpleShape) shape).getText(); + if (sText != null && sText.length() > 0) { + xhtml.element("p", sText); + } + extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml); + } + } + } + + private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException { + + if (ctShape == null) + return; + + CTShapeNonVisual nvSpPR = ctShape.getNvSpPr(); + if (nvSpPR == null) + return; + + CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr(); + if (cNvPr == null) + return; + + CTHyperlink ctHyperlink = cNvPr.getHlinkClick(); + if (ctHyperlink == null) + return; + + String url = drawingHyperlinks.get(ctHyperlink.getId()); + if (url != null) { + xhtml.startElement("a", "href", url); + xhtml.characters(url); + xhtml.endElement("a"); + } + + CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover(); + if (ctHoverHyperlink == null) + return; + + url = drawingHyperlinks.get(ctHoverHyperlink.getId()); + if (url != null) { + xhtml.startElement("a", "href", url); + xhtml.characters(url); + xhtml.endElement("a"); + } + + } + + private void processSheet( + SheetContentsHandler sheetContentsExtractor, + XSSFBCommentsTable comments, + XSSFBStylesTable styles, + XSSFBSharedStringsTable strings, + InputStream sheetInputStream) + throws IOException, SAXException { + + XSSFBSheetHandler xssfbSheetHandler = new XSSFBSheetHandler( + sheetInputStream, + styles, + comments, + strings, + sheetContentsExtractor, + formatter, + false + ); + xssfbSheetHandler.parse(); + } + + /** + * In Excel files, sheets have things embedded in them, + * and sheet drawings which have the images + */ + @Override + protected List<PackagePart> getMainDocumentParts() throws TikaException { + List<PackagePart> parts = new ArrayList<PackagePart>(); + for (PackagePart part : sheetParts) { + // Add the sheet + parts.add(part); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + + //add main document so that macros can be extracted + //by AbstractOOXMLExtractor + for (PackagePart part : extractor.getPackage(). + getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) { + parts.add(part); + } + + return parts; + } +} \ No newline at end of file diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index f3d7377..08bddc8 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; @@ -70,21 +71,18 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { * Allows access to headers/footers from raw xml strings */ private static HeaderFooterHelper hfHelper = new HeaderFooterHelper(); - private final XSSFEventBasedExcelExtractor extractor; - private final DataFormatter formatter; - private final List<PackagePart> sheetParts = new ArrayList<PackagePart>(); - private final Map<String, String> drawingHyperlinks = new HashMap<>(); - private Metadata metadata; - private ParseContext parseContext; + protected final DataFormatter formatter; + protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>(); + protected final Map<String, String> drawingHyperlinks = new HashMap<>(); + protected Metadata metadata; + protected ParseContext parseContext; public XSSFExcelExtractorDecorator( - ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) { + ParseContext context, POIXMLTextExtractor extractor, Locale locale) { super(context, extractor); this.parseContext = context; - this.extractor = extractor; - extractor.setFormulasNotResults(false); - extractor.setLocale(locale); + configureExtractor(extractor, locale); if (locale == null) { formatter = new TikaExcelDataFormatter(); @@ -93,6 +91,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) { + ((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false); + ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale); + } + @Override public void getXHTML( ContentHandler handler, Metadata metadata, ParseContext context) @@ -172,7 +175,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } - private void addDrawingHyperLinks(PackagePart sheetPart) { + protected void addDrawingHyperLinks(PackagePart sheetPart) { try { for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { if (rel.getTargetMode() == TargetMode.INTERNAL) { @@ -340,8 +343,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { */ protected static class SheetTextAsHTML implements SheetContentsHandler { private XHTMLContentHandler xhtml; - private List<String> headers; - private List<String> footers; + protected List<String> headers; + protected List<String> footers; protected SheetTextAsHTML(XHTMLContentHandler xhtml) { this.xhtml = xhtml; diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index 265453f..5d0f5b8 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -229,37 +229,6 @@ public class ExcelParserTest extends TikaTest { } /** - * We don't currently support the .xlsb file format - * (an OOXML container with binary blobs), but we - * shouldn't break on these files either (TIKA-826) - */ - @Test - public void testExcelXLSB() throws Exception { - Detector detector = new DefaultDetector(); - AutoDetectParser parser = new AutoDetectParser(); - - Metadata m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); - - // Should be detected correctly - MediaType type; - try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) { - type = detector.detect(input, m); - assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); - } - - // OfficeParser won't handle it - assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // OOXMLParser will (soon) handle it - assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // AutoDetectParser doesn't break on it - assertContains("<body />", getXML("testEXCEL.xlsb").xml); - - } - - /** * Excel 5 and 95 are older formats, and only get basic support */ @Test diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 847c0b0..eca3e99 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -43,6 +43,8 @@ import java.util.Map; import org.apache.poi.util.LocaleUtil; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -52,12 +54,15 @@ import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.microsoft.ExcelParserTest; +import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.WordParserTest; import org.apache.tika.sax.BodyContentHandler; @@ -1429,6 +1434,84 @@ public class OOXMLParserTest extends TikaTest { assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE)); } } + + @Test + public void testExcelXLSB() throws Exception { + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + + Metadata m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); + + // Should be detected correctly + MediaType type; + try (InputStream input = getTestDocument("testEXCEL.xlsb")) { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); + } + + // OfficeParser won't handle it + assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // OOXMLParser will (soon) handle it + assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // AutoDetectParser doesn't break on it + try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content = handler.toString(); + assertContains("This is an example spreadsheet", content); + } + } + + @Test + public void testXLSBVarious() throws Exception { + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + ParseContext parseContext = new ParseContext(); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext); + assertEquals(4, metadataList.size()); + + String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + assertContains("<td>13</td>", xml); + assertContains("<td>13.1211231321</td>", xml); + assertContains("<td>$ 3.03</td>", xml); + assertContains("<td>20%</td>", xml); + assertContains("<td>13.12</td>", xml); + assertContains("<td>123456789012345</td>", xml); + assertContains("<td>1.23456789012345E+15</td>", xml); + assertContains("test comment2", xml); + + assertContains("comment4 (end of row)", xml); + + + assertContains("<td>1/4</td>", xml); + assertContains("<td>3/9/17</td>", xml); + assertContains("<td>4</td>", xml); + assertContains("<td>2</td>", xml); + + assertContains("<td> 46/1963</td>", xml); + assertContains("<td> 3/128</td>", xml); + assertContains("test textbox", xml); + + assertContains("test WordArt", xml); + + assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml); + assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml); + + assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml); + assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml); + + assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml); + assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml); + assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml); + assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml); + } } diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_various.xlsb b/tika-test-resources/src/test/resources/test-documents/testEXCEL_various.xlsb new file mode 100644 index 0000000..22cc9b4 Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_various.xlsb differ -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
