Author: nick
Date: Mon Mar 14 14:27:05 2011
New Revision: 1081392

URL: http://svn.apache.org/viewvc?rev=1081392&view=rev
Log:
Update the OOXML Excel (.xlsx) extractor to be largely SAX-based, to reduce 
memory use (it now works in a similar way to the .xls one). Bumps the POI 
dependency up to 3.8 beta 1. (TIKA-521)

Modified:
    tika/trunk/tika-parsers/pom.xml
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Mar 14 14:27:05 2011
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>3.7</poi.version>
+    <poi.version>3.8-beta1</poi.version>
   </properties>
 
   <dependencies>

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 Mon Mar 14 14:27:05 2011
@@ -23,6 +23,7 @@ import org.apache.poi.POIXMLProperties.C
 import org.apache.poi.POIXMLProperties.ExtendedProperties;
 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
 import org.apache.poi.openxml4j.util.Nullable;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
@@ -50,7 +51,9 @@ public class MetadataExtractor {
     public void extract(Metadata metadata) throws TikaException {
         addProperty(metadata, Metadata.CONTENT_TYPE, type);
         
-        if (extractor.getDocument()!=null) {
+        if (extractor.getDocument() != null ||
+              (extractor instanceof XSSFEventBasedExcelExtractor && 
+               extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
         }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 Mon Mar 14 14:27:05 2011
@@ -28,8 +28,7 @@ import org.apache.poi.openxml4j.exceptio
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.tika.exception.TikaException;
@@ -51,7 +50,8 @@ public class OOXMLExtractorFactory {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         Locale locale = context.get(Locale.class, Locale.getDefault());
-       
+        ExtractorFactory.setThreadPrefersEventExtractors(true);
+        
         try {
             OOXMLExtractor extractor;
 
@@ -66,12 +66,17 @@ public class OOXMLExtractorFactory {
             }
             
             POIXMLDocument document = poiExtractor.getDocument();
-            if (document instanceof XSLFSlideShow) {
+            if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+               extractor = new XSSFExcelExtractorDecorator(
+                   context, (XSSFEventBasedExcelExtractor)poiExtractor, locale);
+            } else if (document == null) {
+               throw new TikaException(
+                     "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
+                     "The extractor returned was a " + poiExtractor
+               );
+            } else if (document instanceof XSLFSlideShow) {
                 extractor = new XSLFPowerPointExtractorDecorator(
                         context, (XSLFPowerPointExtractor) poiExtractor);
-            } else if (document instanceof XSSFWorkbook) {
-                extractor = new XSSFExcelExtractorDecorator(
-                        context, (XSSFExcelExtractor) poiExtractor, locale);
             } else if (document instanceof XWPFDocument) {
                 extractor = new XWPFWordExtractorDecorator(
                         context, (XWPFWordExtractor) poiExtractor);
@@ -79,8 +84,8 @@ public class OOXMLExtractorFactory {
                 extractor = new POIXMLTextExtractorDecorator(context, 
poiExtractor);
             }
 
-            extractor.getMetadataExtractor().extract(metadata);
             extractor.getXHTML(handler, metadata, context);
+            extractor.getMetadataExtractor().extract(metadata);
         } catch (IllegalArgumentException e) {
             if (e.getMessage().startsWith("No supported documents found")) {
                 throw new TikaException(

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Mon Mar 14 14:27:05 2011
@@ -17,53 +17,69 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Comment;
 import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.HeaderFooter;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.model.CommentsTable;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
-
-    /**
-     * Internal <code>DataFormatter</code> for formatting Numbers.
-     */
+    private final XSSFEventBasedExcelExtractor extractor;
     private final DataFormatter formatter;
-
-    private final XSSFExcelExtractor extractor;
+    private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
+    private final List<Boolean> sheetProtected = new ArrayList<Boolean>();
     private static final String TYPE = 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
     public XSSFExcelExtractorDecorator(
-            ParseContext context, XSSFExcelExtractor extractor, Locale locale) 
{
+            ParseContext context, XSSFEventBasedExcelExtractor extractor, 
Locale locale) {
         super(context, extractor, TYPE);
 
         this.extractor = extractor;
-        formatter = new DataFormatter(locale);
+        extractor.setFormulasNotResults(false);
+        extractor.setLocale(locale);
+        
+        if(locale == null) {
+           formatter = new DataFormatter();
+        } else  {
+           formatter = new DataFormatter(locale);
+        }
     }
 
     /**
@@ -72,80 +88,229 @@ public class XSSFExcelExtractorDecorator
     @Override
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
             XmlException, IOException {
-        XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+       OPCPackage container = extractor.getPackage();
+       
+       ReadOnlySharedStringsTable strings;
+       XSSFReader.SheetIterator iter;
+       XSSFReader xssfReader;
+       StylesTable styles;
+       try {
+          xssfReader = new XSSFReader(container);
+          styles = xssfReader.getStylesTable();
+          iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+          strings = new ReadOnlySharedStringsTable(container);
+       } catch(InvalidFormatException e) {
+          throw new XmlException(e);
+       } catch (OpenXML4JException oe) {
+          throw new XmlException(oe);
+       }
 
-        for (int i = 0; i < document.getNumberOfSheets(); i++) {
-            xhtml.startElement("div");
-            XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
-            xhtml.element("h1", document.getSheetName(i));
-
-            // Header(s), if present
-            extractHeaderFooter(sheet.getFirstHeader(), xhtml);
-            extractHeaderFooter(sheet.getOddHeader(), xhtml);
-            extractHeaderFooter(sheet.getEvenHeader(), xhtml);
-
-            xhtml.startElement("table");
-            xhtml.startElement("tbody");
-
-            // Rows and cells
-            for (Object rawR : sheet) {
-                xhtml.startElement("tr");
-                Row row = (Row) rawR;
-                for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
-                    xhtml.startElement("td");
-                    Cell cell = ri.next();
-
-                    int type = cell.getCellType();
-                    if (type == Cell.CELL_TYPE_FORMULA) {
-                        type = cell.getCachedFormulaResultType();
-                    }
-                    if (type == Cell.CELL_TYPE_STRING) {
-                        xhtml.characters(cell.getRichStringCellValue()
-                                .getString());
-                    } else if (type == Cell.CELL_TYPE_NUMERIC) {
-                        CellStyle style = cell.getCellStyle();
-                        xhtml.characters(
-                            
formatter.formatRawCellContents(cell.getNumericCellValue(),
-                                                            
style.getDataFormat(),
-                                                            
style.getDataFormatString()));
-                    } else {
-                        XSSFCell xc = (XSSFCell) cell;
-                        String rawValue = xc.getRawValue();
-                        if (rawValue != null) {
-                            xhtml.characters(rawValue);
-                        }
-
-                    }
-
-                    // Output the comment in the same cell as the content
-                    Comment comment = cell.getCellComment();
-                    if (comment != null) {
-                        xhtml.characters(comment.getString().getString());
-                    }
+       while (iter.hasNext()) {
+           InputStream stream = iter.next();
+           sheetParts.add(iter.getSheetPart());
+           SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml, iter.getSheetComments());
+
+           // Start, and output the sheet name
+           xhtml.startElement("div");
+           xhtml.element("h1", iter.getSheetName());
+           
+           // Extract the main sheet contents
+           xhtml.startElement("table");
+           xhtml.startElement("tbody");
+           
+           processSheet(sheetExtractor, styles, strings, stream);
+
+           xhtml.endElement("tbody");
+           xhtml.endElement("table");
+           
+           // Output any headers and footers
+           // (Need to process the sheet to get them, so we can't
+           //  do the headers before the contents)
+           for(String header : sheetExtractor.headers) {
+              extractHeaderFooter(header, xhtml);
+           }
+           for(String footer : sheetExtractor.footers) {
+              extractHeaderFooter(footer, xhtml);
+           }
+           
+           // All done with this sheet
+           xhtml.endElement("div");
+       }
+    }
 
-                    xhtml.endElement("td");
-                }
-                xhtml.endElement("tr");
-            }
+    private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+            throws SAXException {
+        String content = ExcelExtractor._extractHeaderFooter(
+              new HeaderFooterFromString(hf));
+        if (content.length() > 0) {
+            xhtml.element("p", content);
+        }
+    }
+    
+    public void processSheet(
+          SheetContentsHandler sheetContentsExtractor,
+          StylesTable styles,
+          ReadOnlySharedStringsTable strings,
+          InputStream sheetInputStream)
+          throws IOException, SAXException {
+      InputSource sheetSource = new InputSource(sheetInputStream);
+      SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+      try {
+         SAXParser saxParser = saxFactory.newSAXParser();
+         XMLReader sheetParser = saxParser.getXMLReader();
+         XSSFSheetInterestingPartsCapturer handler =  
+            new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
+               styles, strings, sheetContentsExtractor, formatter, false));
+         sheetParser.setContentHandler(handler);
+         sheetParser.parse(sheetSource);
+         sheetInputStream.close();
+         
+         sheetProtected.add(handler.hasProtection);
+      } catch(ParserConfigurationException e) {
+         throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
+      }
+    }
+     
+    /**
+     * Turns formatted sheet events into HTML
+     */
+    protected class SheetTextAsHTML implements SheetContentsHandler {
+       private XHTMLContentHandler xhtml;
+       private CommentsTable comments;
+       private List<String> headers;
+       private List<String> footers;
+       
+       protected SheetTextAsHTML(XHTMLContentHandler xhtml, CommentsTable 
comments) {
+          this.xhtml = xhtml;
+          this.comments = comments;
+          headers = new ArrayList<String>();
+          footers = new ArrayList<String>();
+       }
+       
+       public void startRow(int rowNum) {
+          try {
+             xhtml.startElement("tr");
+          } catch(SAXException e) {}
+       }
+       
+       public void endRow() {
+          try {
+             xhtml.endElement("tr");
+          } catch(SAXException e) {}
+       }
 
-            xhtml.endElement("tbody");
-            xhtml.endElement("table");
+       public void cell(String cellRef, String formattedValue) {
+          try {
+             xhtml.startElement("td");
 
-            // Finally footer(s), if present
-            extractHeaderFooter(sheet.getFirstFooter(), xhtml);
-            extractHeaderFooter(sheet.getOddFooter(), xhtml);
-            extractHeaderFooter(sheet.getEvenFooter(), xhtml);
+             // Main cell contents
+             xhtml.characters(formattedValue);
 
-            xhtml.endElement("div");
-        }
+             // Comments
+             if(comments != null) {
+                XSSFComment comment = comments.findCellComment(cellRef);
+                if(comment != null) {
+                   xhtml.startElement("br");
+                   xhtml.endElement("br");
+                   xhtml.characters(comment.getAuthor());
+                   xhtml.characters(": ");
+                   xhtml.characters(comment.getString().getString());
+                }
+             }
+
+             xhtml.endElement("td");
+          } catch(SAXException e) {}
+       }
+       
+       public void headerFooter(String text, boolean isHeader, String tagName) 
{
+          if(isHeader) {
+             headers.add(text);
+          } else {
+             footers.add(text);
+          }
+       }
     }
+    
+    /**
+     * Allows access to headers/footers from raw xml strings
+     */
+    private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
+    protected class HeaderFooterFromString implements HeaderFooter {
+      private String text;
+      protected HeaderFooterFromString(String text) {
+         this.text = text;
+      }
+
+      public String getCenter() {
+         return hfHelper.getCenterSection(text);
+      }
+      public String getLeft() {
+         return hfHelper.getLeftSection(text);
+      }
+      public String getRight() {
+         return hfHelper.getRightSection(text);
+      }
+
+      public void setCenter(String paramString) {}
+      public void setLeft(String paramString) {}
+      public void setRight(String paramString) {}
+    }
+    
+    /**
+     * Captures information on interesting tags, whilst
+     *  delegating the main work to the formatting handler
+     */
+    protected class XSSFSheetInterestingPartsCapturer implements 
ContentHandler {
+      private ContentHandler delegate;
+      private boolean hasProtection = false;
+      
+      protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
+         this.delegate = delegate;
+      }
+      
+      public void startElement(String uri, String localName, String qName,
+            Attributes atts) throws SAXException {
+         if("sheetProtection".equals(qName)) {
+            hasProtection = true;
+         }
+         delegate.startElement(uri, localName, qName, atts);
+      }
 
-    private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler 
xhtml)
+      public void characters(char[] ch, int start, int length)
             throws SAXException {
-        String content = ExcelExtractor._extractHeaderFooter(hf);
-        if (content.length() > 0) {
-            xhtml.element("p", content);
-        }
+         delegate.characters(ch, start, length);
+      }
+      public void endDocument() throws SAXException {
+         delegate.endDocument();
+      }
+      public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+         delegate.endElement(uri, localName, qName);
+      }
+      public void endPrefixMapping(String prefix) throws SAXException {
+         delegate.endPrefixMapping(prefix);
+      }
+      public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+         delegate.ignorableWhitespace(ch, start, length);
+      }
+      public void processingInstruction(String target, String data)
+            throws SAXException {
+         delegate.processingInstruction(target, data);
+      }
+      public void setDocumentLocator(Locator locator) {
+         delegate.setDocumentLocator(locator);
+      }
+      public void skippedEntity(String name) throws SAXException {
+         delegate.skippedEntity(name);
+      }
+      public void startDocument() throws SAXException {
+         delegate.startDocument();
+      }
+      public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+         delegate.startPrefixMapping(prefix, uri);
+      }
     }
     
     /**
@@ -155,10 +320,7 @@ public class XSSFExcelExtractorDecorator
     @Override
     protected List<PackagePart> getMainDocumentParts() throws TikaException {
        List<PackagePart> parts = new ArrayList<PackagePart>();
-       XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
-       for(XSSFSheet sheet : document) {
-          PackagePart part = sheet.getPackagePart();
-          
+       for(PackagePart part : sheetParts) {
           // Add the sheet
           parts.add(part);
           
@@ -192,15 +354,10 @@ public class XSSFExcelExtractorDecorator
                 super.extract(metadata);
 
                 metadata.set(TikaMetadataKeys.PROTECTED, "false");
-
-                XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
-
-                for (int i = 0; i < document.getNumberOfSheets(); i++) {
-                    XSSFSheet sheet = document.getSheetAt(i);
-
-                    if (sheet.getProtect()) {
-                        metadata.set(TikaMetadataKeys.PROTECTED, "true");
-                    }
+                for(boolean prot : sheetProtected) {
+                   if(prot) {
+                      metadata.set(TikaMetadataKeys.PROTECTED, "true");
+                   }
                 }
             }
         };

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1081392&r1=1081391&r2=1081392&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Mon Mar 14 14:27:05 2011
@@ -90,6 +90,10 @@ public class TestContainerAwareDetector 
         assertDetect("testPPT.pptm", 
"application/vnd.ms-powerpoint.presentation.macroenabled.12");
         assertDetect("testPPT.ppsx", 
"application/vnd.openxmlformats-officedocument.presentationml.slideshow");
         assertDetect("testPPT.ppsm", 
"application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
+        
+        // .xlsb is an OOXML file containing the binary parts, and not
+        //  an OLE2 file as you might initially expect!
+        assertDetect("testEXCEL.xlsb", 
"application/vnd.ms-excel.sheet.binary.macroEnabled.12");
     }
 
     public void testDetectIWork() throws Exception {


Reply via email to