Author: nick
Date: Mon Mar 14 14:27:05 2011
New Revision: 1081392
URL: http://svn.apache.org/viewvc?rev=1081392&view=rev
Log:
Update the OOXML Excel (.xlsx) extractor to be largely SAX-based, to reduce
memory use (it now works in a similar way to the .xls one). Bumps the POI
dependency up to 3.8-beta1. (TIKA-521)
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Mar 14 14:27:05 2011
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.7</poi.version>
+ <poi.version>3.8-beta1</poi.version>
</properties>
<dependencies>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
Mon Mar 14 14:27:05 2011
@@ -23,6 +23,7 @@ import org.apache.poi.POIXMLProperties.C
import org.apache.poi.POIXMLProperties.ExtendedProperties;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.openxml4j.util.Nullable;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
@@ -50,7 +51,9 @@ public class MetadataExtractor {
public void extract(Metadata metadata) throws TikaException {
addProperty(metadata, Metadata.CONTENT_TYPE, type);
- if (extractor.getDocument()!=null) {
+ if (extractor.getDocument() != null ||
+ (extractor instanceof XSSFEventBasedExcelExtractor &&
+ extractor.getPackage() != null)) {
extractMetadata(extractor.getCoreProperties(), metadata);
extractMetadata(extractor.getExtendedProperties(), metadata);
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
Mon Mar 14 14:27:05 2011
@@ -28,8 +28,7 @@ import org.apache.poi.openxml4j.exceptio
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.exception.TikaException;
@@ -51,7 +50,8 @@ public class OOXMLExtractorFactory {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, Locale.getDefault());
-
+ ExtractorFactory.setThreadPrefersEventExtractors(true);
+
try {
OOXMLExtractor extractor;
@@ -66,12 +66,17 @@ public class OOXMLExtractorFactory {
}
POIXMLDocument document = poiExtractor.getDocument();
- if (document instanceof XSLFSlideShow) {
+ if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+ extractor = new XSSFExcelExtractorDecorator(
+ context, (XSSFEventBasedExcelExtractor)poiExtractor,
locale);
+ } else if (document == null) {
+ throw new TikaException(
+ "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
+ "The extractor returned was a " + poiExtractor
+ );
+ } else if (document instanceof XSLFSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(
context, (XSLFPowerPointExtractor) poiExtractor);
- } else if (document instanceof XSSFWorkbook) {
- extractor = new XSSFExcelExtractorDecorator(
- context, (XSSFExcelExtractor) poiExtractor, locale);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(
context, (XWPFWordExtractor) poiExtractor);
@@ -79,8 +84,8 @@ public class OOXMLExtractorFactory {
extractor = new POIXMLTextExtractorDecorator(context,
poiExtractor);
}
- extractor.getMetadataExtractor().extract(metadata);
extractor.getXHTML(handler, metadata, context);
+ extractor.getMetadataExtractor().extract(metadata);
} catch (IllegalArgumentException e) {
if (e.getMessage().startsWith("No supported documents found")) {
throw new TikaException(
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
Mon Mar 14 14:27:05 2011
@@ -17,53 +17,69 @@
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import java.util.Locale;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Comment;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
+import
org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.model.CommentsTable;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
-
- /**
- * Internal <code>DataFormatter</code> for formatting Numbers.
- */
+ private final XSSFEventBasedExcelExtractor extractor;
private final DataFormatter formatter;
-
- private final XSSFExcelExtractor extractor;
+ private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
+ private final List<Boolean> sheetProtected = new ArrayList<Boolean>();
private static final String TYPE =
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
public XSSFExcelExtractorDecorator(
- ParseContext context, XSSFExcelExtractor extractor, Locale locale)
{
+ ParseContext context, XSSFEventBasedExcelExtractor extractor,
Locale locale) {
super(context, extractor, TYPE);
this.extractor = extractor;
- formatter = new DataFormatter(locale);
+ extractor.setFormulasNotResults(false);
+ extractor.setLocale(locale);
+
+ if(locale == null) {
+ formatter = new DataFormatter();
+ } else {
+ formatter = new DataFormatter(locale);
+ }
}
/**
@@ -72,80 +88,229 @@ public class XSSFExcelExtractorDecorator
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
XmlException, IOException {
- XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+ OPCPackage container = extractor.getPackage();
+
+ ReadOnlySharedStringsTable strings;
+ XSSFReader.SheetIterator iter;
+ XSSFReader xssfReader;
+ StylesTable styles;
+ try {
+ xssfReader = new XSSFReader(container);
+ styles = xssfReader.getStylesTable();
+ iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+ strings = new ReadOnlySharedStringsTable(container);
+ } catch(InvalidFormatException e) {
+ throw new XmlException(e);
+ } catch (OpenXML4JException oe) {
+ throw new XmlException(oe);
+ }
- for (int i = 0; i < document.getNumberOfSheets(); i++) {
- xhtml.startElement("div");
- XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
- xhtml.element("h1", document.getSheetName(i));
-
- // Header(s), if present
- extractHeaderFooter(sheet.getFirstHeader(), xhtml);
- extractHeaderFooter(sheet.getOddHeader(), xhtml);
- extractHeaderFooter(sheet.getEvenHeader(), xhtml);
-
- xhtml.startElement("table");
- xhtml.startElement("tbody");
-
- // Rows and cells
- for (Object rawR : sheet) {
- xhtml.startElement("tr");
- Row row = (Row) rawR;
- for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
- xhtml.startElement("td");
- Cell cell = ri.next();
-
- int type = cell.getCellType();
- if (type == Cell.CELL_TYPE_FORMULA) {
- type = cell.getCachedFormulaResultType();
- }
- if (type == Cell.CELL_TYPE_STRING) {
- xhtml.characters(cell.getRichStringCellValue()
- .getString());
- } else if (type == Cell.CELL_TYPE_NUMERIC) {
- CellStyle style = cell.getCellStyle();
- xhtml.characters(
-
formatter.formatRawCellContents(cell.getNumericCellValue(),
-
style.getDataFormat(),
-
style.getDataFormatString()));
- } else {
- XSSFCell xc = (XSSFCell) cell;
- String rawValue = xc.getRawValue();
- if (rawValue != null) {
- xhtml.characters(rawValue);
- }
-
- }
-
- // Output the comment in the same cell as the content
- Comment comment = cell.getCellComment();
- if (comment != null) {
- xhtml.characters(comment.getString().getString());
- }
+ while (iter.hasNext()) {
+ InputStream stream = iter.next();
+ sheetParts.add(iter.getSheetPart());
+ SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml,
iter.getSheetComments());
+
+ // Start, and output the sheet name
+ xhtml.startElement("div");
+ xhtml.element("h1", iter.getSheetName());
+
+ // Extract the main sheet contents
+ xhtml.startElement("table");
+ xhtml.startElement("tbody");
+
+ processSheet(sheetExtractor, styles, strings, stream);
+
+ xhtml.endElement("tbody");
+ xhtml.endElement("table");
+
+ // Output any headers and footers
+ // (Need to process the sheet to get them, so we can't
+ // do the headers before the contents)
+ for(String header : sheetExtractor.headers) {
+ extractHeaderFooter(header, xhtml);
+ }
+ for(String footer : sheetExtractor.footers) {
+ extractHeaderFooter(footer, xhtml);
+ }
+
+ // All done with this sheet
+ xhtml.endElement("div");
+ }
+ }
- xhtml.endElement("td");
- }
- xhtml.endElement("tr");
- }
+ private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+ throws SAXException {
+ String content = ExcelExtractor._extractHeaderFooter(
+ new HeaderFooterFromString(hf));
+ if (content.length() > 0) {
+ xhtml.element("p", content);
+ }
+ }
+
+ public void processSheet(
+ SheetContentsHandler sheetContentsExtractor,
+ StylesTable styles,
+ ReadOnlySharedStringsTable strings,
+ InputStream sheetInputStream)
+ throws IOException, SAXException {
+ InputSource sheetSource = new InputSource(sheetInputStream);
+ SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+ try {
+ SAXParser saxParser = saxFactory.newSAXParser();
+ XMLReader sheetParser = saxParser.getXMLReader();
+ XSSFSheetInterestingPartsCapturer handler =
+ new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
+ styles, strings, sheetContentsExtractor, formatter, false));
+ sheetParser.setContentHandler(handler);
+ sheetParser.parse(sheetSource);
+ sheetInputStream.close();
+
+ sheetProtected.add(handler.hasProtection);
+ } catch(ParserConfigurationException e) {
+ throw new RuntimeException("SAX parser appears to be broken - " +
e.getMessage());
+ }
+ }
+
+ /**
+ * Turns formatted sheet events into HTML
+ */
+ protected class SheetTextAsHTML implements SheetContentsHandler {
+ private XHTMLContentHandler xhtml;
+ private CommentsTable comments;
+ private List<String> headers;
+ private List<String> footers;
+
+ protected SheetTextAsHTML(XHTMLContentHandler xhtml, CommentsTable
comments) {
+ this.xhtml = xhtml;
+ this.comments = comments;
+ headers = new ArrayList<String>();
+ footers = new ArrayList<String>();
+ }
+
+ public void startRow(int rowNum) {
+ try {
+ xhtml.startElement("tr");
+ } catch(SAXException e) {}
+ }
+
+ public void endRow() {
+ try {
+ xhtml.endElement("tr");
+ } catch(SAXException e) {}
+ }
- xhtml.endElement("tbody");
- xhtml.endElement("table");
+ public void cell(String cellRef, String formattedValue) {
+ try {
+ xhtml.startElement("td");
- // Finally footer(s), if present
- extractHeaderFooter(sheet.getFirstFooter(), xhtml);
- extractHeaderFooter(sheet.getOddFooter(), xhtml);
- extractHeaderFooter(sheet.getEvenFooter(), xhtml);
+ // Main cell contents
+ xhtml.characters(formattedValue);
- xhtml.endElement("div");
- }
+ // Comments
+ if(comments != null) {
+ XSSFComment comment = comments.findCellComment(cellRef);
+ if(comment != null) {
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ xhtml.characters(comment.getAuthor());
+ xhtml.characters(": ");
+ xhtml.characters(comment.getString().getString());
+ }
+ }
+
+ xhtml.endElement("td");
+ } catch(SAXException e) {}
+ }
+
+ public void headerFooter(String text, boolean isHeader, String tagName)
{
+ if(isHeader) {
+ headers.add(text);
+ } else {
+ footers.add(text);
+ }
+ }
}
+
+ /**
+ * Allows access to headers/footers from raw xml strings
+ */
+ private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
+ protected class HeaderFooterFromString implements HeaderFooter {
+ private String text;
+ protected HeaderFooterFromString(String text) {
+ this.text = text;
+ }
+
+ public String getCenter() {
+ return hfHelper.getCenterSection(text);
+ }
+ public String getLeft() {
+ return hfHelper.getLeftSection(text);
+ }
+ public String getRight() {
+ return hfHelper.getRightSection(text);
+ }
+
+ public void setCenter(String paramString) {}
+ public void setLeft(String paramString) {}
+ public void setRight(String paramString) {}
+ }
+
+ /**
+ * Captures information on interesting tags, whilst
+ * delegating the main work to the formatting handler
+ */
+ protected class XSSFSheetInterestingPartsCapturer implements
ContentHandler {
+ private ContentHandler delegate;
+ private boolean hasProtection = false;
+
+ protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
+ this.delegate = delegate;
+ }
+
+ public void startElement(String uri, String localName, String qName,
+ Attributes atts) throws SAXException {
+ if("sheetProtection".equals(qName)) {
+ hasProtection = true;
+ }
+ delegate.startElement(uri, localName, qName, atts);
+ }
- private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler
xhtml)
+ public void characters(char[] ch, int start, int length)
throws SAXException {
- String content = ExcelExtractor._extractHeaderFooter(hf);
- if (content.length() > 0) {
- xhtml.element("p", content);
- }
+ delegate.characters(ch, start, length);
+ }
+ public void endDocument() throws SAXException {
+ delegate.endDocument();
+ }
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ delegate.endElement(uri, localName, qName);
+ }
+ public void endPrefixMapping(String prefix) throws SAXException {
+ delegate.endPrefixMapping(prefix);
+ }
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ delegate.ignorableWhitespace(ch, start, length);
+ }
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ delegate.processingInstruction(target, data);
+ }
+ public void setDocumentLocator(Locator locator) {
+ delegate.setDocumentLocator(locator);
+ }
+ public void skippedEntity(String name) throws SAXException {
+ delegate.skippedEntity(name);
+ }
+ public void startDocument() throws SAXException {
+ delegate.startDocument();
+ }
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ delegate.startPrefixMapping(prefix, uri);
+ }
}
/**
@@ -155,10 +320,7 @@ public class XSSFExcelExtractorDecorator
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
List<PackagePart> parts = new ArrayList<PackagePart>();
- XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
- for(XSSFSheet sheet : document) {
- PackagePart part = sheet.getPackagePart();
-
+ for(PackagePart part : sheetParts) {
// Add the sheet
parts.add(part);
@@ -192,15 +354,10 @@ public class XSSFExcelExtractorDecorator
super.extract(metadata);
metadata.set(TikaMetadataKeys.PROTECTED, "false");
-
- XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
-
- for (int i = 0; i < document.getNumberOfSheets(); i++) {
- XSSFSheet sheet = document.getSheetAt(i);
-
- if (sheet.getProtect()) {
- metadata.set(TikaMetadataKeys.PROTECTED, "true");
- }
+ for(boolean prot : sheetProtected) {
+ if(prot) {
+ metadata.set(TikaMetadataKeys.PROTECTED, "true");
+ }
}
}
};
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Mon Mar 14 14:27:05 2011
@@ -90,6 +90,10 @@ public class TestContainerAwareDetector
assertDetect("testPPT.pptm",
"application/vnd.ms-powerpoint.presentation.macroenabled.12");
assertDetect("testPPT.ppsx",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow");
assertDetect("testPPT.ppsm",
"application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
+
+ // .xlsb is an OOXML file containing the binary parts, and not
+ // an OLE2 file as you might initially expect!
+ assertDetect("testEXCEL.xlsb",
"application/vnd.ms-excel.sheet.binary.macroEnabled.12");
}
public void testDetectIWork() throws Exception {