tik...

bob Tue, 05 Jan 2016 19:52:00 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.awt.*;
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.poi.ddf.EscherBSERecord;
+import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.DrawingGroupRecord;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FooterRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HeaderRecord;
+import org.apache.poi.hssf.record.HyperlinkRecord;
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.hssf.record.TextObjectRecord;
+import org.apache.poi.hssf.record.chart.SeriesTextRecord;
+import org.apache.poi.hssf.record.common.UnicodeString;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFPictureData;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p/>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p/>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelExtractor extends AbstractPOIFSExtractor {
+
+    private static final String WORKBOOK_ENTRY = "Workbook";
+    private static final String BOOK_ENTRY = "Book";
+    /**
+     * <code>true</code> if the HSSFListener should be registered
+     * to listen for all records or <code>false</code> (the default)
+     * if the listener should be configured to only receive specified
+     * records.
+     */
+    private boolean listenForAllRecords = false;
+
+    /**
+     * Creates an extractor for the given parse context and metadata, both
+     * of which are passed straight to the superclass constructor.
+     *
+     * @param context  parse context handed to {@link AbstractPOIFSExtractor}
+     * @param metadata metadata handed to {@link AbstractPOIFSExtractor}
+     */
+    public ExcelExtractor(ParseContext context, Metadata metadata) {
+        super(context, metadata);
+    }
+
+    /**
+     * Returns <code>true</code> if this parser is configured to listen
+     * for all records instead of just the specified few.
+     *
+     * @return the current value of the <code>listenForAllRecords</code>
+     *         flag, which is <code>false</code> unless changed through
+     *         {@link #setListenForAllRecords(boolean)}
+     */
+    public boolean isListenForAllRecords() {
+        return listenForAllRecords;
+    }
+
+    /**
+     * Specifies whether this parser should listen for all
+     * records or just for the specified few.
+     * <p/>
+     * <strong>Note:</strong> Under normal operation this setting should
+     * be <code>false</code> (the default), but you can experiment with
+     * this setting for testing and debugging purposes.
+     *
+     * @param listenForAllRecords <code>true</code> if the HSSFListener
+     *                            should be registered to listen for all
+     *                            records or <code>false</code> if the
+     *                            listener should be configured to only
+     *                            receive specified records.
+     */
+    public void setListenForAllRecords(boolean listenForAllRecords) {
+        this.listenForAllRecords = listenForAllRecords;
+    }
+
+    /**
+     * Extracts text from an Excel Workbook, writing the extracted content
+     * to the specified XHTML content handler. This overload simply
+     * delegates to {@link #parse(DirectoryNode, XHTMLContentHandler, Locale)}
+     * with the root directory of the given file system.
+     *
+     * @param filesystem POI file system
+     * @param xhtml      handler that receives the extracted content
+     * @param locale     locale forwarded to the directory-based overload
+     * @throws IOException   if an error occurs processing the workbook
+     *                       or writing the extracted content
+     * @throws SAXException  as declared by the directory-based overload
+     * @throws TikaException as declared by the directory-based overload
+     */
+    protected void parse(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+            Locale locale) throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml, locale);
+    }
+
+    /**
+     * Extracts the workbook stored in the given POIFS directory.
+     * <p/>
+     * A directory with a <code>Book</code> entry but no <code>Workbook</code>
+     * entry is handed to the old-format (Excel 5 / 95) parser; a directory
+     * with neither entry is skipped silently. Otherwise the workbook is
+     * processed in event mode and every <code>MBD*</code> sub-directory is
+     * then handled as an embedded office document.
+     *
+     * @param root   directory that holds the workbook entry
+     * @param xhtml  handler that receives the extracted content
+     * @param locale locale passed on to the record listener
+     * @throws IOException   if reading the workbook fails
+     * @throws SAXException  if the content handler rejects the output
+     * @throws TikaException if extraction of the main workbook fails; parse
+     *                       errors of embedded documents are ignored
+     */
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml,
+            Locale locale) throws IOException, SAXException, TikaException {
+        if (!root.hasEntry(WORKBOOK_ENTRY)) {
+            if (root.hasEntry(BOOK_ENTRY)) {
+                // Excel 5 / Excel 95 file
+                // Records are in a different structure so needs a
+                //  different parser to process them
+                OldExcelExtractor extractor = new OldExcelExtractor(root);
+                OldExcelParser.parse(extractor, xhtml);
+            }
+            // Otherwise: corrupt file / very old file, just skip text extraction
+            return;
+        }
+
+        // If a password was supplied, use it, otherwise the default.
+        // POI keeps this password per thread, so it is cleared again once
+        // the workbook has been read; otherwise it would silently apply to
+        // whatever is parsed next on the same thread.
+        Biff8EncryptionKey.setCurrentUserPassword(getPassword());
+        try {
+            // Have the file processed in event mode
+            TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
+            listener.processFile(root, isListenForAllRecords());
+            listener.throwStoredException();
+        } finally {
+            Biff8EncryptionKey.setCurrentUserPassword(null);
+        }
+
+        for (Entry entry : root) {
+            if (entry.getName().startsWith("MBD")
+                    && entry instanceof DirectoryEntry) {
+                try {
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+                } catch (TikaException ignored) {
+                    // ignore parse errors from embedded documents
+                }
+            }
+        }
+    }
+
+    // ======================================================================
+
+    /**
+     * HSSF Listener implementation which processes the HSSF records.
+     */
+    private static class TikaHSSFListener implements HSSFListener {
+
+        /**
+         * XHTML content handler to which the document content is rendered.
+         */
+        private final XHTMLContentHandler handler;
+
+        /**
+         * The POIFS Extractor, used for embedded resources.
+         */
+        private final AbstractPOIFSExtractor extractor;
+        /**
+         * Format for rendering numbers in the worksheet. Currently we just
+         * use the platform default formatting.
+         *
+         * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+         */
+        private final NumberFormat format;
+        /**
+         * Potential exception thrown by the content handler. When set to
+         * non-<code>null</code>, causes all subsequent HSSF records to be
+         * ignored and the stored exception to be thrown when
+         * {@link #throwStoredException()} is invoked.
+         */
+        private Exception exception = null;
+        private SSTRecord sstRecord;
+        private FormulaRecord stringFormulaRecord;
+        private short previousSid;
+        /**
+         * Internal <code>FormatTrackingHSSFListener</code> to handle cell
+         * formatting within the extraction.
+         */
+        private FormatTrackingHSSFListener formatListener;
+        /**
+         * List of worksheet names.
+         */
+        private List<String> sheetNames = new ArrayList<String>();
+        /**
+         * Index of the current worksheet within the workbook.
+         * Used to find the worksheet name in the {@link #sheetNames} list.
+         */
+        private short currentSheetIndex;
+        /**
+         * Content of the current worksheet, or <code>null</code> if no
+         * worksheet is currently active.
+         */
+        private SortedMap<Point, Cell> currentSheet = null;
+        /**
+         * Extra text or cells that crops up, typically as part of a
+         * worksheet but not always.
+         */
+        private List<Cell> extraTextCells = new ArrayList<Cell>();
+        /**
+         * These aren't complete when we first see them, as they
+         * depend on continue records that aren't always
+         * contiguous. Collect them for later processing.
+         */
+        private List<DrawingGroupRecord> drawingGroups = new 
ArrayList<DrawingGroupRecord>();
+
+        /**
+         * Construct a new listener instance outputting parsed data to
+         * the specified XHTML content handler.
+         *
+         * @param handler   Destination to write the parsed output to
+         * @param locale    locale used to create both the number format and
+         *                  the internal format-tracking listener
+         * @param extractor extractor kept for handling embedded resources
+         */
+        private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, 
AbstractPOIFSExtractor extractor) {
+            this.handler = handler;
+            this.extractor = extractor;
+            this.format = NumberFormat.getInstance(locale);
+            // The format tracker wraps this listener and is what actually
+            // gets registered with the HSSF request in processFile
+            this.formatListener = new FormatTrackingHSSFListener(this, locale);
+        }
+
+        /**
+         * Entry point to listener to start the processing of a file.
+         * Delegates to the directory-based overload with the root of the
+         * given file system.
+         *
+         * @param filesystem          POI file system.
+         * @param listenForAllRecords sets whether the listener is configured
+         *                            to listen for all records types or not.
+         * @throws IOException   on any IO errors.
+         * @throws SAXException  on any SAX parsing errors.
+         * @throws TikaException as declared by the directory-based overload.
+         */
+        public void processFile(NPOIFSFileSystem filesystem, boolean 
listenForAllRecords)
+                throws IOException, SAXException, TikaException {
+            processFile(filesystem.getRoot(), listenForAllRecords);
+        }
+
+        /**
+         * Runs POI's event-based reader over the <code>Workbook</code> entry
+         * of the given directory, with the format-tracking listener
+         * registered either for all records or for a fixed set of record
+         * types. Afterwards any text collected outside the sheets is written
+         * out and the collected drawing groups are searched for pictures.
+         * An encrypted workbook reported by POI is rethrown as Tika's
+         * <code>EncryptedDocumentException</code>.
+         *
+         * @param root                directory holding the Workbook entry
+         * @param listenForAllRecords <code>true</code> to receive every
+         *                            record, <code>false</code> to receive
+         *                            only the record types listed below
+         * @throws IOException   on any IO errors
+         * @throws SAXException  on any SAX parsing errors
+         * @throws TikaException declared for failures while handling
+         *                       embedded pictures
+         */
+        public void processFile(DirectoryNode root, boolean listenForAllRecords)
+                throws IOException, SAXException, TikaException {
+
+            // Set up listener and register the records we want to process
+            HSSFRequest hssfRequest = new HSSFRequest();
+            if (listenForAllRecords) {
+                hssfRequest.addListenerForAllRecords(formatListener);
+            } else {
+                hssfRequest.addListener(formatListener, BOFRecord.sid);
+                hssfRequest.addListener(formatListener, EOFRecord.sid);
+                hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
+                hssfRequest.addListener(formatListener, CountryRecord.sid);
+                hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
+                hssfRequest.addListener(formatListener, SSTRecord.sid);
+                hssfRequest.addListener(formatListener, FormulaRecord.sid);
+                hssfRequest.addListener(formatListener, LabelRecord.sid);
+                hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
+                hssfRequest.addListener(formatListener, NumberRecord.sid);
+                hssfRequest.addListener(formatListener, RKRecord.sid);
+                hssfRequest.addListener(formatListener, StringRecord.sid);
+                hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
+                hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+                hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
+                hssfRequest.addListener(formatListener, FormatRecord.sid);
+                hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+                hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+                hssfRequest.addListener(formatListener, HeaderRecord.sid);
+                hssfRequest.addListener(formatListener, FooterRecord.sid);
+            }
+
+            // Create event factory and process Workbook (fire events)
+            DocumentInputStream documentInputStream =
+                    root.createDocumentInputStream(WORKBOOK_ENTRY);
+            try {
+                HSSFEventFactory eventFactory = new HSSFEventFactory();
+                eventFactory.processEvents(hssfRequest, documentInputStream);
+            } catch (org.apache.poi.EncryptedDocumentException e) {
+                throw new EncryptedDocumentException(e);
+            } finally {
+                // The stream is only needed while the events are fired, so
+                // release it whether or not processing succeeded
+                documentInputStream.close();
+            }
+
+            // Output any extra text that came after all the sheets
+            processExtraText();
+
+            // Look for embedded images, now that the drawing records
+            //  have been fully matched with their continue data
+            for (DrawingGroupRecord dgr : drawingGroups) {
+                dgr.decode();
+                findPictures(dgr.getEscherRecords());
+            }
+        }
+
+        /**
+         * Callback invoked for each HSSF record. The record is handled
+         * unless an earlier record already failed; the first checked
+         * failure is kept so that {@link #throwStoredException()} can
+         * rethrow it later.
+         *
+         * @param record HSSF Record
+         */
+        public void processRecord(Record record) {
+            if (exception != null) {
+                // An earlier record failed: ignore everything that follows
+                return;
+            }
+            try {
+                internalProcessRecord(record);
+            } catch (TikaException tikaFailure) {
+                exception = tikaFailure;
+            } catch (IOException ioFailure) {
+                exception = ioFailure;
+            } catch (SAXException saxFailure) {
+                exception = saxFailure;
+            }
+        }
+
+        /**
+         * Rethrows the exception stored by {@link #processRecord(Record)},
+         * if there is one, keeping its original type. Does nothing when no
+         * exception has been stored.
+         *
+         * @throws TikaException if the stored exception is a TikaException,
+         *                       or as a wrapper (with the original as cause)
+         *                       around any other stored exception type
+         * @throws SAXException  if the stored exception is a SAXException
+         * @throws IOException   if the stored exception is an IOException
+         */
+        public void throwStoredException() throws TikaException, SAXException, IOException {
+            if (exception == null) {
+                return;
+            }
+            if (exception instanceof IOException) {
+                throw (IOException) exception;
+            }
+            if (exception instanceof SAXException) {
+                throw (SAXException) exception;
+            }
+            if (exception instanceof TikaException) {
+                throw (TikaException) exception;
+            }
+            // Keep the original exception as the cause so its stack trace
+            // is not lost
+            throw new TikaException(exception.getMessage(), exception);
+        }
+
+        /**
+         * Handles a single record according to its sid: tracks workbook and
+         * sheet boundaries, remembers sheet names and the shared string
+         * table, turns cell, hyperlink, text-object, chart-text, header and
+         * footer records into cells, and sets drawing groups aside for later
+         * processing. Finally it records the sid for the next call and drops
+         * the pending string formula unless it is the record just handled.
+         *
+         * @param record HSSF record to handle
+         * @throws SAXException  if writing a finished sheet to the content
+         *                       handler fails
+         * @throws TikaException declared for the caller's error handling
+         * @throws IOException   declared for the caller's error handling
+         */
+        private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
+            switch (record.getSid()) {
+                case BOFRecord.sid: // start of workbook, worksheet etc. records
+                    BOFRecord bof = (BOFRecord) record;
+                    if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+                        currentSheetIndex = -1;
+                    } else if (bof.getType() == BOFRecord.TYPE_CHART) {
+                        if (previousSid == EOFRecord.sid) {
+                            // This is a sheet which contains only a chart
+                            newSheet();
+                        } else {
+                            // This is a chart within a normal sheet
+                            // Handling of this is a bit hacky...
+                            if (currentSheet != null) {
+                                processSheet();
+                                // newSheet() increments the index again, so
+                                // the chart keeps the index of its sheet
+                                currentSheetIndex--;
+                                newSheet();
+                            }
+                        }
+                    } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+                        newSheet();
+                    }
+                    break;
+
+                case EOFRecord.sid: // end of workbook, worksheet etc. records
+                    if (currentSheet != null) {
+                        processSheet();
+                    }
+                    currentSheet = null;
+                    break;
+
+                case BoundSheetRecord.sid: // Worksheet index record
+                    BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+                    sheetNames.add(boundSheetRecord.getSheetname());
+                    break;
+
+                case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+                    sstRecord = (SSTRecord) record;
+                    break;
+
+                case FormulaRecord.sid: // Cell value from a formula
+                    FormulaRecord formula = (FormulaRecord) record;
+                    if (formula.hasCachedResultString()) {
+                        // The String itself should be the next record
+                        stringFormulaRecord = formula;
+                    } else {
+                        addTextCell(record,
+                                formatListener.formatNumberDateCell(formula));
+                    }
+                    break;
+
+                case StringRecord.sid:
+                    if (previousSid == FormulaRecord.sid) {
+                        // Cached string value of a string formula
+                        StringRecord sr = (StringRecord) record;
+                        addTextCell(stringFormulaRecord, sr.getString());
+                    } else {
+                        // Some other string not associated with a cell, skip
+                    }
+                    break;
+
+                case LabelRecord.sid: // strings stored directly in the cell
+                    LabelRecord label = (LabelRecord) record;
+                    addTextCell(record, label.getValue());
+                    break;
+
+                case LabelSSTRecord.sid: // Ref. a string in the shared string table
+                    // NOTE(review): sstRecord is only assigned by the
+                    // SSTRecord case above; a LabelSSTRecord arriving before
+                    // any SSTRecord would dereference null here - confirm
+                    // whether such input needs handling
+                    LabelSSTRecord sst = (LabelSSTRecord) record;
+                    UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+                    addTextCell(record, unicode.getString());
+                    break;
+
+                case NumberRecord.sid: // Contains a numeric cell value
+                    NumberRecord number = (NumberRecord) record;
+                    addTextCell(record,
+                            formatListener.formatNumberDateCell(number));
+                    break;
+
+                case RKRecord.sid: // Excel internal number record
+                    RKRecord rk = (RKRecord) record;
+                    addCell(record, new NumberCell(rk.getRKNumber(), format));
+                    break;
+
+                case HyperlinkRecord.sid: // holds a URL associated with a cell
+                    if (currentSheet != null) {
+                        HyperlinkRecord link = (HyperlinkRecord) record;
+                        Point point =
+                                new Point(link.getFirstColumn(), link.getFirstRow());
+                        Cell cell = currentSheet.get(point);
+                        if (cell != null) {
+                            String address = link.getAddress();
+                            if (address != null) {
+                                addCell(record, new LinkedCell(cell, address));
+                            } else {
+                                addCell(record, cell);
+                            }
+                        }
+                    }
+                    break;
+
+                case TextObjectRecord.sid:
+                    TextObjectRecord tor = (TextObjectRecord) record;
+                    addTextCell(record, tor.getStr().getString());
+                    break;
+
+                case SeriesTextRecord.sid: // Chart label or title
+                    SeriesTextRecord str = (SeriesTextRecord) record;
+                    addTextCell(record, str.getText());
+                    break;
+
+                case DrawingGroupRecord.sid:
+                    // Collect this now, we'll process later when all
+                    //  the continue records are in
+                    drawingGroups.add((DrawingGroupRecord) record);
+                    break;
+
+                case HeaderRecord.sid:
+                    HeaderRecord headerRecord = (HeaderRecord) record;
+                    addTextCell(record, headerRecord.getText());
+                    break;
+
+                case FooterRecord.sid:
+                    FooterRecord footerRecord = (FooterRecord) record;
+                    addTextCell(record, footerRecord.getText());
+                    break;
+
+            }
+
+            previousSid = record.getSid();
+
+            // A pending string formula only survives the call that set it;
+            // any other record clears it
+            if (stringFormulaRecord != record) {
+                stringFormulaRecord = null;
+            }
+        }
+
+        /**
+         * Writes every cell collected outside the worksheets as its own
+         * <code>div</code> of class <code>outside</code>, then empties the
+         * collection. Does nothing when no such cells are pending.
+         *
+         * @throws SAXException if the content handler fails
+         */
+        private void processExtraText() throws SAXException {
+            if (extraTextCells.isEmpty()) {
+                return;
+            }
+
+            for (Cell extra : extraTextCells) {
+                handler.startElement("div", "class", "outside");
+                extra.render(handler);
+                handler.endElement("div");
+            }
+
+            // Start afresh for whatever text turns up next
+            extraTextCells.clear();
+        }
+
+        /**
+         * Stores the given cell. A <code>null</code> cell is ignored. While
+         * a worksheet is active and the record carries a cell position, the
+         * cell is put into that worksheet at the record's column and row;
+         * in every other case it is kept as extra text.
+         *
+         * @param record record that holds the cell value
+         * @param cell   cell value (or <code>null</code>)
+         */
+        private void addCell(Record record, Cell cell) throws SAXException {
+            if (cell == null) {
+                // Ignore empty cells
+                return;
+            }
+
+            boolean insideSheet = currentSheet != null
+                    && record instanceof CellValueRecordInterface;
+            if (!insideSheet) {
+                // Cell outside the worksheets
+                extraTextCells.add(cell);
+                return;
+            }
+
+            // Normal cell inside a worksheet
+            CellValueRecordInterface positioned = (CellValueRecordInterface) record;
+            currentSheet.put(
+                    new Point(positioned.getColumn(), positioned.getRow()), cell);
+        }
+
+        /**
+         * Adds a text cell with the given text content. The text is
+         * trimmed first; nothing is added when it is <code>null</code> or
+         * empty after trimming.
+         *
+         * @param record record that holds the text value
+         * @param text   text content, may be <code>null</code>
+         * @throws SAXException as declared by {@link #addCell(Record, Cell)}
+         */
+        private void addTextCell(Record record, String text) throws SAXException {
+            if (text == null) {
+                return;
+            }
+            String trimmed = text.trim();
+            if (trimmed.length() == 0) {
+                return;
+            }
+            addCell(record, new TextCell(trimmed));
+        }
+
+        /**
+         * Starts a new worksheet: advances the current sheet index and
+         * replaces the current sheet content with an empty map whose keys
+         * are ordered by {@link PointComparator}.
+         */
+        private void newSheet() {
+            currentSheetIndex++;
+            currentSheet = new TreeMap<Point, Cell>(new PointComparator());
+        }
+
+        /**
+         * Renders the current sheet as a <code>div</code> of class
+         * <code>page</code>: an optional <code>h1</code> with the sheet name,
+         * a table holding the cells in map order, and then any pending
+         * extra text.
+         *
+         * @throws SAXException if an error occurs
+         */
+        private void processSheet() throws SAXException {
+            // Opening of the sheet, with its name when one is known
+            handler.startElement("div", "class", "page");
+            if (currentSheetIndex < sheetNames.size()) {
+                String sheetName = sheetNames.get(currentSheetIndex);
+                handler.element("h1", sheetName);
+            }
+            handler.startElement("table");
+            handler.startElement("tbody");
+
+            // The table always has a first row and a first cell; further
+            // rows and cells are opened as the cell positions require
+            int row = 0;
+            int column = 0;
+            handler.startElement("tr");
+            handler.startElement("td");
+            for (Map.Entry<Point, Cell> positioned : currentSheet.entrySet()) {
+                Point position = positioned.getKey();
+
+                // Move down to the row of this cell
+                for (; row < position.y; row++) {
+                    handler.endElement("td");
+                    handler.endElement("tr");
+                    handler.startElement("tr");
+                    handler.startElement("td");
+                    column = 0;
+                }
+
+                // Move right to the column of this cell
+                for (; column < position.x; column++) {
+                    handler.endElement("td");
+                    handler.startElement("td");
+                }
+
+                positioned.getValue().render(handler);
+            }
+            handler.endElement("td");
+            handler.endElement("tr");
+
+            // Closing of the table
+            handler.endElement("tbody");
+            handler.endElement("table");
+
+            // Extra text goes inside the page div, after the table
+            processExtraText();
+            handler.endElement("div");
+        }
+
+        /**
+         * Walks the given Escher records and, recursively, their children.
+         * For every BSE record that carries a blip, the picture data is
+         * passed to the extractor as an embedded resource with the
+         * picture's mime type.
+         *
+         * @param records Escher records to search
+         * @throws IOException   as declared by the embedded resource handling
+         * @throws SAXException  as declared by the embedded resource handling
+         * @throws TikaException as declared by the embedded resource handling
+         */
+        private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
+            for (EscherRecord candidate : records) {
+                EscherBlipRecord blip = null;
+                if (candidate instanceof EscherBSERecord) {
+                    blip = ((EscherBSERecord) candidate).getBlipRecord();
+                }
+
+                if (blip != null) {
+                    HSSFPictureData pictureData = new HSSFPictureData(blip);
+                    String mediaType = pictureData.getMimeType();
+                    TikaInputStream imageStream =
+                            TikaInputStream.get(pictureData.getData());
+
+                    // Handle the embedded resource
+                    extractor.handleEmbeddedResource(
+                            imageStream, null, null, mediaType, handler, true);
+                }
+
+                // Children are searched whatever the type of this record
+                findPictures(candidate.getChildRecords());
+            }
+        }
+    }
+
+    /**
+     * Utility comparator for points. Points are ordered by their
+     * <code>y</code> value first and by their <code>x</code> value second.
+     * The coordinates are compared directly rather than subtracted, so the
+     * result is correct for the whole <code>int</code> range.
+     */
+    private static class PointComparator implements Comparator<Point> {
+
+        /**
+         * @param a first point
+         * @param b second point
+         * @return a negative value, zero or a positive value when
+         *         <code>a</code> sorts before, equal to or after
+         *         <code>b</code>
+         */
+        public int compare(Point a, Point b) {
+            if (a.y != b.y) {
+                return a.y < b.y ? -1 : 1;
+            }
+            if (a.x != b.x) {
+                return a.x < b.x ? -1 : 1;
+            }
+            return 0;
+        }
+
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.poi.hslf.model.Comment;
+import org.apache.poi.hslf.model.HeadersFooters;
+import org.apache.poi.hslf.model.OLEShape;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextRun;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class HSLFExtractor extends AbstractPOIFSExtractor {
+    /**
+     * Creates an extractor that works with the given parse context.
+     *
+     * @param context the parse context, passed on to the superclass
+     */
+    public HSLFExtractor(ParseContext context) {
+        super(context);
+    }
+
+    /**
+     * Extracts the slide show held by the given file system by handing its
+     * root directory to {@link #parse(DirectoryNode, XHTMLContentHandler)}.
+     *
+     * @param filesystem the file system containing the document
+     * @param xhtml      handler receiving the generated XHTML events
+     */
+    protected void parse(NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        DirectoryNode rootDirectory = filesystem.getRoot();
+        parse(rootDirectory, xhtml);
+    }
+
+    /**
+     * Extracts the slide show stored under the given directory.
+     * <p>
+     * The output consists of a {@code div class="slideShow"} holding one
+     * {@code div class="slide"} per slide, followed by a
+     * {@code div class="slide-notes"} holding the notes of all slides and the
+     * pictures of the slide show.
+     *
+     * @param root  directory node containing the document
+     * @param xhtml handler receiving the generated XHTML events
+     * @throws IOException   if the document or an embedded resource cannot be read
+     * @throws SAXException  if the content handler rejects the generated events
+     * @throws TikaException if handling of an embedded resource fails
+     */
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        HSLFSlideShow ss = new HSLFSlideShow(root);
+        List<HSLFSlide> _slides = ss.getSlides();
+
+        /* Iterate over slides and extract text */
+        xhtml.startElement("div", "class", "slideShow");
+        for (HSLFSlide slide : _slides) {
+            writeSlide(slide, xhtml);
+        }
+        // All slides done
+        xhtml.endElement("div");
+
+        /* notes, followed by the pictures of the whole slide show */
+        xhtml.startElement("div", "class", "slide-notes");
+        writeNotes(_slides, ss.getNotesHeadersFooters(), xhtml);
+        handleSlideEmbeddedPictures(ss, xhtml);
+        xhtml.endElement("div");
+    }
+
+    // Writes one slide: header, master text, body text, tables, footer, comments and embedded objects.
+    private void writeSlide(HSLFSlide slide, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        xhtml.startElement("div", "class", "slide");
+
+        // Slide header, if present
+        HeadersFooters hf = slide.getHeadersFooters();
+        if (hf != null && hf.isHeaderVisible()) {
+            writeClassedParagraph(xhtml, "slide-header", hf.getHeaderText());
+        }
+
+        // Slide master, if present
+        extractMaster(xhtml, slide.getMasterSheet());
+
+        // Slide text
+        xhtml.startElement("div", "class", "slide-content");
+        textRunsToText(xhtml, slide.getTextParagraphs());
+        xhtml.endElement("div");
+
+        // Table text
+        for (HSLFShape shape : slide.getShapes()) {
+            if (shape instanceof HSLFTable) {
+                extractTableText(xhtml, (HSLFTable) shape);
+            }
+        }
+
+        // Slide footer, if present
+        if (hf != null && hf.isFooterVisible()) {
+            writeClassedParagraph(xhtml, "slide-footer", hf.getFooterText());
+        }
+
+        // Comments, if present
+        writeSlideComments(slide, xhtml);
+
+        // Now any embedded resources
+        handleSlideEmbeddedResources(slide, xhtml);
+
+        // TODO Find the Notes for this slide and extract inline
+
+        // Slide complete
+        xhtml.endElement("div");
+    }
+
+    // Emits a <p> element with the given class attribute around the text; does nothing when the text is null.
+    private void writeClassedParagraph(XHTMLContentHandler xhtml, String cssClass, String text)
+            throws SAXException {
+        if (text == null) {
+            return;
+        }
+        xhtml.startElement("p", "class", cssClass);
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    // Writes each comment of the slide as a paragraph: "<b>Author (initials) - </b>text".
+    private void writeSlideComments(HSLFSlide slide, XHTMLContentHandler xhtml) throws SAXException {
+        StringBuilder authorStringBuilder = new StringBuilder();
+        for (Comment comment : slide.getComments()) {
+            authorStringBuilder.setLength(0);
+            xhtml.startElement("p", "class", "slide-comment");
+
+            String author = comment.getAuthor();
+            String initials = comment.getAuthorInitials();
+            String text = comment.getText();
+
+            if (author != null) {
+                authorStringBuilder.append(author);
+            }
+            if (initials != null) {
+                if (authorStringBuilder.length() > 0) {
+                    authorStringBuilder.append(" ");
+                }
+                authorStringBuilder.append('(').append(initials).append(')');
+            }
+            if (authorStringBuilder.length() > 0) {
+                // the separator is only needed when comment text follows
+                if (text != null) {
+                    authorStringBuilder.append(" - ");
+                }
+                xhtml.startElement("b");
+                xhtml.characters(authorStringBuilder.toString());
+                xhtml.endElement("b");
+            }
+            if (text != null) {
+                xhtml.characters(text);
+            }
+            xhtml.endElement("p");
+        }
+    }
+
+    // Writes the notes of all slides; a notes sheet referenced by several slides is written once.
+    private void writeNotes(List<HSLFSlide> slides, HeadersFooters hf, XHTMLContentHandler xhtml)
+            throws SAXException {
+        HashSet<Integer> seenNotes = new HashSet<>();
+
+        for (HSLFSlide slide : slides) {
+            HSLFNotes notes = slide.getNotes();
+            if (notes == null) {
+                continue;
+            }
+            Integer id = notes._getSheetNumber();
+            // add() returns false when this sheet number was already recorded
+            if (!seenNotes.add(id)) {
+                continue;
+            }
+
+            // Repeat the Notes header, if set
+            if (hf != null && hf.isHeaderVisible()) {
+                writeClassedParagraph(xhtml, "slide-note-header", hf.getHeaderText());
+            }
+
+            // Notes text
+            textRunsToText(xhtml, notes.getTextParagraphs());
+
+            // Repeat the notes footer, if set
+            if (hf != null && hf.isFooterVisible()) {
+                writeClassedParagraph(xhtml, "slide-note-footer", hf.getFooterText());
+            }
+        }
+    }
+
+    /**
+     * Writes the text of the non-placeholder text shapes of a master sheet
+     * inside a {@code div class="slide-master-content"}. Nothing is written
+     * when the master is {@code null} or has no shapes.
+     *
+     * @param xhtml  handler receiving the generated XHTML events
+     * @param master the master sheet of a slide, may be {@code null}
+     */
+    private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
+        List<HSLFShape> masterShapes = (master == null) ? null : master.getShapes();
+        if (masterShapes == null || masterShapes.isEmpty()) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "slide-master-content");
+        for (HSLFShape candidate : masterShapes) {
+            // skip missing shapes, placeholders and anything that is not a text shape
+            if (candidate == null
+                    || HSLFMasterSheet.isPlaceholder(candidate)
+                    || !(candidate instanceof HSLFTextShape)) {
+                continue;
+            }
+            String shapeText = ((HSLFTextShape) candidate).getText();
+            if (shapeText != null) {
+                xhtml.element("p", shapeText);
+            }
+        }
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes the given table shape as an XHTML {@code table} with one
+     * {@code tr} per row and one {@code td} per cell.
+     *
+     * @param xhtml handler receiving the generated XHTML events
+     * @param shape the table to write
+     */
+    private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
+        xhtml.startElement("table");
+        for (int rowIdx = 0; rowIdx < shape.getNumberOfRows(); rowIdx++) {
+            xhtml.startElement("tr");
+            for (int colIdx = 0; colIdx < shape.getNumberOfColumns(); colIdx++) {
+                HSLFTableCell tableCell = shape.getCell(rowIdx, colIdx);
+                // a missing cell is written with the empty string
+                String cellText = (tableCell == null) ? "" : tableCell.getText();
+                xhtml.element("td", cellText);
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("table");
+    }
+
+    /**
+     * Writes the given paragraph groups as XHTML. Consecutive bullet
+     * paragraphs of a group are wrapped in a {@code ul}; each paragraph becomes
+     * an {@code li} (visible bullet) or a {@code p}; vertical tabs inside a text
+     * run become {@code br} elements.
+     *
+     * @param xhtml          handler receiving the generated XHTML events
+     * @param paragraphsList the paragraph groups to write; {@code null} writes nothing
+     */
+    private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+        if (paragraphsList == null) {
+            return;
+        }
+
+        for (List<HSLFTextParagraph> run : paragraphsList) {
+            // Leaving in wisdom from TIKA-712 for easy revert.
+            // Avoid boiler-plate text on the master slide (0
+            // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+            //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+
+            boolean isBullet = false;
+            for (HSLFTextParagraph htp : run) {
+                boolean nextBullet = htp.isBullet();
+                // TODO: identify bullet/list type
+                if (isBullet != nextBullet) {
+                    isBullet = nextBullet;
+                    if (isBullet) {
+                        xhtml.startElement("ul");
+                    } else {
+                        xhtml.endElement("ul");
+                    }
+                }
+
+                List<HSLFTextRun> textRuns = htp.getTextRuns();
+                // Only the first run is inspected to decide whether the bullet is shown.
+                // A paragraph without runs, or a first run without raw text, is treated
+                // like an empty first line instead of failing with an exception.
+                String firstRaw = textRuns.isEmpty() ? null : textRuns.get(0).getRawText();
+                String firstLine = (firstRaw == null) ? "" : removePBreak(firstRaw);
+                boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
+                String paraTag = showBullet ? "li" : "p";
+
+                xhtml.startElement(paraTag);
+                for (HSLFTextRun htr : textRuns) {
+                    String line = htr.getRawText();
+                    if (line != null) {
+                        boolean isfirst = true;
+                        for (String fragment : line.split("\\u000b")) {
+                            if (!isfirst) {
+                                xhtml.startElement("br");
+                                xhtml.endElement("br");
+                            }
+                            isfirst = false;
+                            xhtml.characters(removePBreak(fragment));
+                        }
+                        // split() drops trailing empty fragments, so a trailing
+                        // line break has to be written explicitly
+                        if (line.endsWith("\u000b")) {
+                            xhtml.startElement("br");
+                            xhtml.endElement("br");
+                        }
+                    }
+                }
+                xhtml.endElement(paraTag);
+            }
+            if (isBullet) {
+                xhtml.endElement("ul");
+            }
+        }
+    }
+
+    // remove trailing paragraph break
+    private static String removePBreak(String fragment) {
+        // the last text run of a text paragraph contains the paragraph break 
(\r)
+        // line breaks (\\u000b) can happen more often
+        return fragment.replaceFirst("\\r$", "");
+    }
+
+    /**
+     * Hands every picture of the slide show to the embedded resource handler.
+     * EMF, WMF and DIB pictures get a fixed media type; every other picture
+     * type uses the content type reported by the picture data itself.
+     *
+     * @param slideshow the slide show whose pictures are extracted
+     * @param xhtml     handler receiving the generated XHTML events
+     */
+    private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
+            throws TikaException, SAXException, IOException {
+        for (HSLFPictureData pic : slideshow.getPictureData()) {
+            String mediaType;
+
+            switch (pic.getType()) {
+                case EMF:
+                    mediaType = "application/x-emf";
+                    break;
+                case WMF:
+                    mediaType = "application/x-msmetafile";
+                    break;
+                case DIB:
+                    mediaType = "image/bmp";
+                    break;
+                default:
+                    mediaType = pic.getContentType();
+                    break;
+            }
+
+            // Same try-with-resources scoping as handleSlideEmbeddedResources, so
+            // the stream is released even if the handler throws.
+            try (TikaInputStream stream = TikaInputStream.get(pic.getData())) {
+                handleEmbeddedResource(
+                        stream, null, null,
+                        mediaType, xhtml, false);
+            }
+        }
+    }
+
+    /**
+     * Hands the object data of every OLE shape on the slide to the embedded
+     * resource handler, preceded by an empty {@code div class="embedded"}
+     * marker carrying the object id.
+     *
+     * @param slide the slide whose shapes are inspected
+     * @param xhtml handler receiving the generated XHTML events
+     */
+    private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
+            throws TikaException, SAXException, IOException {
+        List<HSLFShape> slideShapes;
+        try {
+            slideShapes = slide.getShapes();
+        } catch (NullPointerException e) {
+            // Sometimes HSLF hits problems
+            // Please open POI bugs for any you come across!
+            return;
+        }
+
+        for (HSLFShape candidate : slideShapes) {
+            if (!(candidate instanceof OLEShape)) {
+                continue;
+            }
+            OLEShape oleShape = (OLEShape) candidate;
+
+            HSLFObjectData data;
+            try {
+                data = oleShape.getObjectData();
+            } catch (NullPointerException e) {
+                /* getObjectData throws NPE some times. */
+                data = null;
+            }
+            if (data == null) {
+                continue;
+            }
+
+            String objID = Integer.toString(oleShape.getObjectID());
+
+            // Embedded Object: add a <div
+            // class="embedded" id="X"/> so consumer can see where
+            // in the main text each embedded document
+            // occurred:
+            AttributesImpl markerAttributes = new AttributesImpl();
+            markerAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            markerAttributes.addAttribute("", "id", "id", "CDATA", objID);
+            xhtml.startElement("div", markerAttributes);
+            xhtml.endElement("div");
+
+            try (TikaInputStream stream = TikaInputStream.get(data.getData())) {
+                // only Excel charts get an explicit media type
+                String mediaType = "Excel.Chart.8".equals(oleShape.getProgID())
+                        ? "application/vnd.ms-excel"
+                        : null;
+                handleEmbeddedResource(
+                        stream, objID, objID,
+                        mediaType, xhtml, false);
+            }
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.text.DateFormat;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.Column;
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.PropertyMap;
+import com.healthmarketscience.jackcess.Row;
+import com.healthmarketscience.jackcess.Table;
+import com.healthmarketscience.jackcess.query.Query;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Internal class.  Needs to be instantiated for each parse because of
+ * the lack of thread safety with the dateTimeFormatter
+ */
+class JackcessExtractor extends AbstractPOIFSExtractor {
+
+    // Names of the summary properties that are mapped onto dedicated metadata keys
+    static final String TITLE_PROP_KEY = "Title";
+    static final String AUTHOR_PROP_KEY = "Author";
+    static final String COMPANY_PROP_KEY = "Company";
+
+    // Names of column properties and the values of the text format property
+    static final String TEXT_FORMAT_KEY = "TextFormat";
+    static final String CURRENCY_FORMAT_KEY = "Format";
+    static final byte TEXT_FORMAT = 0;
+    static final byte RICH_TEXT_FORMAT = 1;
+
+    // Parse context handed to the HTML parser used for rich text cells
+    static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    // Formatters, created per instance for the locale given to the constructor
+    final NumberFormat currencyFormatter;
+    final DateFormat shortDateTimeFormatter;
+
+    // Parser used to turn rich text cells into plain text
+    final HtmlParser htmlParser = new HtmlParser();
+
+    /**
+     * Creates an extractor whose currency and short-date formatters use the given locale.
+     *
+     * @param context the parse context, passed on to the superclass
+     * @param locale  locale for formatting currency and date values
+     */
+    protected JackcessExtractor(ParseContext context, Locale locale) {
+        super(context);
+        this.currencyFormatter = NumberFormat.getCurrencyInstance(locale);
+        this.shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+    }
+
+    /**
+     * Extracts the given database: first its properties into the metadata,
+     * then every table (linked and system tables excluded) as an XHTML
+     * {@code table}, and finally the SQL of every query as a
+     * {@code div type="sqlQuery"}.
+     *
+     * @param db       the database to extract
+     * @param xhtml    handler receiving the generated XHTML events
+     * @param metadata metadata object receiving the database properties
+     * @throws IOException   if the database cannot be read
+     * @throws SAXException  if the content handler rejects the generated events
+     * @throws TikaException if handling of an embedded resource fails
+     */
+    public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
+        extractDatabaseMetadata(db, metadata);
+
+        Iterator<Table> it = db.newIterable().
+                setIncludeLinkedTables(false).
+                setIncludeSystemTables(false).iterator();
+        while (it.hasNext()) {
+            writeTable(it.next(), xhtml);
+        }
+
+        for (Query q : db.getQueries()) {
+            xhtml.startElement("div", "type", "sqlQuery");
+            xhtml.characters(q.toSQLString());
+            xhtml.endElement("div");
+        }
+    }
+
+    // Copies the password, the database, user-defined and summary properties into the metadata.
+    private void extractDatabaseMetadata(Database db, Metadata metadata) throws IOException, SAXException, TikaException {
+        String pw = db.getDatabasePassword();
+        if (pw != null) {
+            metadata.set(JackcessParser.MDB_PW, pw);
+        }
+
+        PropertyMap dbp = db.getDatabaseProperties();
+        for (PropertyMap.Property p : dbp) {
+            metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        PropertyMap up = db.getUserDefinedProperties();
+        for (PropertyMap.Property p : up) {
+            metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        PropertyMap summaryProperties = db.getSummaryProperties();
+        if (summaryProperties == null) {
+            return;
+        }
+
+        // names of the summary properties already mapped onto core metadata keys
+        Set<String> found = new HashSet<>();
+
+        //try to get core properties
+        PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY);
+        if (title != null) {
+            metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType()));
+            found.add(title.getName());
+        }
+        PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
+        if (author != null && author.getValue() != null) {
+            String authorString = toString(author.getValue(), author.getType());
+            SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString);
+            found.add(author.getName());
+        }
+        PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);
+        if (company != null) {
+            metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType()));
+            found.add(company.getName());
+        }
+
+        // every remaining summary property goes in under the summary prefix;
+        // the map fetched above is reused instead of being requested again
+        for (PropertyMap.Property p : summaryProperties) {
+            if (!found.contains(p.getName())) {
+                metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+                        toString(p.getValue(), p.getType()));
+            }
+        }
+    }
+
+    // Writes one table: a header row with the column names, then one row per record.
+    private void writeTable(Table table, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+        String tableName = table.getName();
+        List<? extends Column> columns = table.getColumns();
+        xhtml.startElement("table", "name", tableName);
+        addHeaders(columns, xhtml);
+        xhtml.startElement("tbody");
+
+        for (Row r = table.getNextRow(); r != null; r = table.getNextRow()) {
+            xhtml.startElement("tr");
+            for (Column c : columns) {
+                handleCell(r, c, xhtml);
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("tbody");
+        xhtml.endElement("table");
+    }
+
+    /**
+     * Writes a {@code thead} with a single row that holds one {@code th}
+     * per column, containing the column name.
+     *
+     * @param columns the columns of the table being written
+     * @param xhtml   handler receiving the generated XHTML events
+     */
+    private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml) throws SAXException {
+        xhtml.startElement("thead");
+        xhtml.startElement("tr");
+        for (Column headerColumn : columns) {
+            String columnName = headerColumn.getName();
+            xhtml.startElement("th");
+            xhtml.characters(columnName);
+            xhtml.endElement("th");
+        }
+        xhtml.endElement("tr");
+        xhtml.endElement("thead");
+    }
+
+    /**
+     * Writes one cell as a {@code td}. OLE columns are handled by
+     * {@code handleOLE}, binary columns are passed to the embedded resource
+     * handler, and every other column is written as text; rich text columns
+     * are run through the HTML parser first.
+     *
+     * @param r       the row being written
+     * @param c       the column whose value is written
+     * @param handler handler receiving the generated XHTML events
+     */
+    private void handleCell(Row r, Column c, XHTMLContentHandler handler)
+            throws SAXException, IOException, TikaException {
+
+        handler.startElement("td");
+        // DataType constants are compared with ==, so a column without a type falls
+        // through to the text branch instead of failing with a NullPointerException;
+        // isRichText and toString both tolerate a null type already.
+        DataType type = c.getType();
+        if (type == DataType.OLE) {
+            handleOLE(r, c.getName(), handler);
+        } else if (type == DataType.BINARY) {
+            Object obj = r.get(c.getName());
+            if (obj != null) {
+                byte[] bytes = (byte[]) obj;
+                handleEmbeddedResource(
+                        TikaInputStream.get(bytes),
+                        null,//filename
+                        null,//relationshipId
+                        null,//mediatype
+                        handler, false);
+            }
+        } else {
+            Object obj = r.get(c.getName());
+            String v = toString(obj, type);
+            if (isRichText(c)) {
+                BodyContentHandler h = new BodyContentHandler();
+                Metadata m = new Metadata();
+                m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+                try {
+                    htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
+                            h,
+                            m, EMPTY_PARSE_CONTEXT);
+                    handler.characters(h.toString());
+                } catch (SAXException e) {
+                    //if something went wrong in htmlparser, just append the characters
+                    handler.characters(v);
+                }
+            } else {
+                handler.characters(v);
+            }
+        }
+        handler.endElement("td");
+    }
+
+    /**
+     * Tells whether the column is a MEMO column whose {@code TextFormat}
+     * property is a {@code Byte} equal to {@code RICH_TEXT_FORMAT}.
+     *
+     * @param c the column to check, may be {@code null}
+     * @return {@code true} only for such a column; {@code false} when the column,
+     *         its property map or its type is missing
+     */
+    private boolean isRichText(Column c) throws IOException {
+        if (c == null) {
+            return false;
+        }
+
+        PropertyMap columnProperties = c.getProperties();
+        if (columnProperties == null || !DataType.MEMO.equals(c.getType())) {
+            return false;
+        }
+
+        Object textFormat = columnProperties.getValue(TEXT_FORMAT_KEY);
+        return textFormat instanceof Byte
+                && ((Byte) textFormat).byteValue() == RICH_TEXT_FORMAT;
+    }
+
+    /**
+     * Converts a cell or property value to text according to its data type.
+     *
+     * @param value the value to convert, may be {@code null}
+     * @param type  the data type of the value, may be {@code null}
+     * @return the text form; the empty string for a {@code null} value and for
+     *         the types that are skipped
+     */
+    private String toString(Object value, DataType type) {
+        if (value == null) {
+            return "";
+        }
+        if (type == null) {
+            //this shouldn't happen
+            return value.toString();
+        }
+        switch (type) {
+            // values that already are strings
+            case TEXT:
+            case MEMO:
+                return (String) value;
+
+            // values whose own toString() is used
+            case NUMERIC:
+            case GUID:
+                return value.toString();
+
+            // boxed primitives
+            case BYTE:
+                return Byte.toString((Byte) value);
+            case INT:
+                return Short.toString((Short) value);
+            case LONG:
+                return Integer.toString((Integer) value);
+            case FLOAT:
+                return Float.toString((Float) value);
+            case DOUBLE:
+                return Double.toString((Double) value);
+            case BOOLEAN:
+                return Boolean.toString((Boolean) value);
+
+            // values formatted with the instance formatters
+            case MONEY:
+                //TODO: consider getting parsing "Format" field from
+                //field properties.
+                return formatCurrency(((BigDecimal) value).doubleValue(), type);
+            case SHORT_DATE_TIME:
+                return formatShortDateTime((Date) value);
+
+            case COMPLEX_TYPE: //skip all these
+            case UNKNOWN_0D:
+            case UNKNOWN_11:
+            case UNSUPPORTED_FIXEDLEN:
+            case UNSUPPORTED_VARLEN:
+            default:
+                return "";
+        }
+    }
+
+    /**
+     * Writes the OLE value of the named column: the link path for linked
+     * content, otherwise the content is passed on as an embedded resource
+     * or embedded office document.
+     *
+     * @param row   the row being written
+     * @param cName name of the OLE column
+     * @param xhtml handler receiving the generated XHTML events
+     */
+    private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+        //lifted shamelessly from Jackcess's OleBlobTest
+        // The blob is a closeable resource; try-with-resources releases it on every
+        // exit path and skips a null resource.
+        try (OleBlob blob = row.getBlob(cName)) {
+            if (blob == null) {
+                return;
+            }
+
+            OleBlob.Content content = blob.getContent();
+            if (content == null) {
+                return;
+            }
+
+            switch (content.getType()) {
+                case LINK:
+                    xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
+                    break;
+                case SIMPLE_PACKAGE:
+                    OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
+
+                    handleEmbeddedResource(
+                            TikaInputStream.get(spc.getStream()),
+                            spc.getFileName(),//filename
+                            null,//relationshipId
+                            spc.getTypeName(),//mediatype
+                            xhtml, false);
+                    break;
+                case OTHER:
+                    OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
+                    handleEmbeddedResource(
+                            TikaInputStream.get(oc.getStream()),
+                            null,//filename
+                            null,//relationshipId
+                            oc.getTypeName(),//mediatype
+                            xhtml, false);
+                    break;
+                case COMPOUND_STORAGE:
+                    OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
+                    handleCompoundContent(cc, xhtml);
+                    break;
+                default:
+                    // any other content type is deliberately not extracted
+                    break;
+            }
+        }
+    }
+
+    /**
+     * Opens the compound storage content as a POIFS file system and handles
+     * its root directory as an embedded office document.
+     *
+     * @param cc    the compound content of an OLE blob
+     * @param xhtml handler receiving the generated XHTML events
+     */
+    private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+        // the file system is closed again once the embedded document has been
+        // handled, also when handling fails
+        try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) {
+            handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+        }
+    }
+
+    /**
+     * Formats the amount with the currency formatter of this extractor.
+     *
+     * @param d    the amount, may be {@code null}
+     * @param type the data type of the value (not used for formatting)
+     * @return the formatted amount, or the empty string for {@code null}
+     */
+    String formatCurrency(Double d, DataType type) {
+        return (d == null) ? "" : currencyFormatter.format(d);
+    }
+
+    /**
+     * Formats the date with the short date formatter of this extractor.
+     *
+     * @param d the date, may be {@code null}
+     * @return the formatted date, or the empty string for {@code null}
+     */
+    String formatShortDateTime(Date d) {
+        return (d == null) ? "" : shortDateTimeFormatter.format(d);
+    }
+}
+

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.CryptCodecProvider;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.DatabaseBuilder;
+import com.healthmarketscience.jackcess.util.LinkResolver;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that handles Microsoft Access files via
+ * <a href="http://jackcess.sourceforge.net/">Jackcess</a>
+ * <p>
+ * Many, many thanks to LexisNexis&reg;/Health Market Science (HMS), Brian O'Neill,
+ * and James Ahlborn for relicensing Jackcess to Apache v2.0!
+ */
+public class JackcessParser extends AbstractParser {
+
+    public static final String SUMMARY_PROPERTY_PREFIX =
+            "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+    public static String MDB_PROPERTY_PREFIX =
+            "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+    public static String USER_DEFINED_PROPERTY_PREFIX =
+            "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+    public static Property MDB_PW = Property.externalText("Password");
+    private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver();
+
+    //TODO: figure out how to get this info
+    // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases");
+
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess");
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+    private Locale locale = Locale.ROOT;
+
+    /**
+     * @param context parse context; not consulted
+     * @return the single supported type, {@code application/x-msaccess}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens the stream's backing file as a read-only Access database and
+     * lets a {@code JackcessExtractor} write its content to the handler.
+     * <p>
+     * If the context carries a {@link PasswordProvider}, the password it
+     * returns for the metadata is used to open the database.
+     *
+     * @param stream   document to parse
+     * @param handler  receives the XHTML output
+     * @param metadata document metadata, passed to the extractor
+     * @param context  parse context, queried for a {@link PasswordProvider}
+     * @throws EncryptedDocumentException if opening or extracting fails with
+     *                                    an "Incorrect password" message
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.get(stream);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+        String password =
+                (passwordProvider == null) ? null : passwordProvider.getPassword(metadata);
+
+        Database db = null;
+        try {
+            DatabaseBuilder builder = new DatabaseBuilder(tis.getFile());
+            //always set a codec provider, even without a password:
+            //do this to ensure encryption/wrong password exception vs. more generic
+            //"need right codec" error message.
+            CryptCodecProvider codecProvider = (password == null)
+                    ? new CryptCodecProvider()
+                    : new CryptCodecProvider(password);
+            db = builder.setCodecProvider(codecProvider).setReadOnly(true).open();
+            db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case
+            new JackcessExtractor(context, locale).parse(db, xhtml, metadata);
+        } catch (IllegalStateException e) {
+            String message = e.getMessage();
+            if (message == null || !message.contains("Incorrect password")) {
+                throw e;
+            }
+            throw new EncryptedDocumentException(e);
+        } finally {
+            closeSilently(db);
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Closes the database, if there is one, ignoring any failure to do so.
+     */
+    private static void closeSilently(Database db) {
+        if (db == null) {
+            return;
+        }
+        try {
+            db.close();
+        } catch (IOException ignored) {
+            //swallow = silent close
+        }
+    }
+
+    private static final class IgnoreLinkResolver implements LinkResolver {
+        //If links are resolved, Jackcess might try to open and process
+        //any file on the current system that is specified as a linked db.
+        //This could be a nasty security issue.
+        @Override
+        public Database resolveLinkedDatabase(Database database, String s) throws IOException {
+            throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A cell decorator that wraps the rendered output of another cell in an
+ * {@code <a>} element whose {@code href} attribute is the given link.
+ */
+public class LinkedCell extends CellDecorator {
+
+    private final String link;
+
+    /**
+     * @param cell cell whose content is to be wrapped
+     * @param link hyperlink target; asserted to be non-{@code null}
+     */
+    public LinkedCell(Cell cell, String link) {
+        super(cell);
+        assert link != null;
+        this.link = link;
+    }
+
+    /**
+     * Emits the opening anchor, the decorated cell's content and the closing anchor.
+     */
+    @Override
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        final String anchor = "a";
+        handler.startElement(anchor, "href", link);
+        super.render(handler);
+        handler.endElement(anchor);
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.NoSuchElementException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListData;
+import org.apache.poi.hwpf.model.ListFormatOverrideLevel;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+
+/**
+ * Computes the number text which goes at the beginning of each list paragraph
+ * <p><em>Note:</em> This class only handles the raw number text and does not
+ * apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3,
+ * Part 3 to it.</p>
+ * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance
+ * of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care
+ * of in this class.</p>
+ * <p>Further, this class does not yet handle overrides</p>
+ */
+public class ListManager extends AbstractListManager {
+
+    private static final Log logger = LogFactory.getLog(ListManager.class);
+    private final ListTables listTables;
+
+    /**
+     * Ordinary constructor for a new list reader
+     *
+     * @param document Document to process
+     */
+    public ListManager(final HWPFDocument document) {
+        this.listTables = document.getListTables();
+    }
+
+    /**
+     * Get the formatted number for a given paragraph
+     * <p><em>Note:</em> This only works correctly if called subsequently for
+     * <em>all</em> paragraphs in a valid selection (main document, text field, ...)
+     * which are part of a list.</p>
+     *
+     * @param paragraph list paragraph to process
+     * @return String which represents the numbering of this list paragraph;
+     *         never {@code null}, can be empty string, though,
+     *         if something goes wrong in getList() or no list data is found
+     * @throws IllegalArgumentException If the given paragraph is {@code null}
+     *                                  or is not part of a list
+     */
+    public String getFormattedNumber(final Paragraph paragraph) {
+        if (paragraph == null) {
+            throw new IllegalArgumentException("Given paragraph cannot be null.");
+        }
+        if (!paragraph.isInList()) {
+            throw new IllegalArgumentException("Can only process list paragraphs.");
+        }
+        //lsid is equivalent to docx's abnum
+        //ilfo is equivalent to docx's num
+        int currAbNumId = -1;
+        try {
+            currAbNumId = paragraph.getList().getLsid();
+        } catch (NoSuchElementException e) {
+            //somewhat frequent exception when initializing HWPFList
+            return "";
+        } catch (IllegalArgumentException e) {
+            return "";
+        } catch (NullPointerException e) {
+            return "";
+        }
+
+        int currNumId = paragraph.getIlfo();
+        ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+        LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+
+        if (lc == null) {
+            //reuse the lsid read above rather than calling getList() a second
+            //time outside of the guarded block
+            ListData listData = listTables.getListData(currAbNumId);
+            if (listData == null) {
+                logger.warn("No list data found for list id " + currAbNumId);
+                return "";
+            }
+            ListLevel[] levels = listData.getLevels();
+            LevelTuple[] levelTuples = new LevelTuple[levels.length];
+            for (int i = 0; i < levels.length; i++) {
+                levelTuples[i] = buildTuple(i, levels[i]);
+            }
+            lc = new ParagraphLevelCounter(levelTuples);
+        }
+        if (overrideTuples == null) {
+            overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels());
+        }
+        String formattedString = lc.incrementLevel(paragraph.getIlvl(), overrideTuples);
+
+        listLevelMap.put(currAbNumId, lc);
+        overrideTupleMap.put(currNumId, overrideTuples);
+        return formattedString;
+    }
+
+    /**
+     * Builds the tuple describing one list level from the POI list level.
+     *
+     * @param i         index of the level; currently not used
+     * @param listLevel level definition to read from
+     * @return tuple holding start, restart, level text, number format and legal flag
+     */
+    private LevelTuple buildTuple(int i, ListLevel listLevel) {
+        int start = listLevel.getStartAt();
+        int restart = listLevel.getRestart();
+        boolean isLegal = listLevel.isLegalNumbering();
+        String numFmt = convertToNewNumFormat(listLevel.getNumberFormat());
+        String lvlText = convertToNewNumberText(listLevel.getNumberText(),
+                listLevel.getLevelNumberingPlaceholderOffsets());
+        return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+    }
+
+    /**
+     * Builds override tuples from the first override level of the
+     * paragraph's list format override.
+     *
+     * @param par    paragraph whose {@code ilfo} selects the override data
+     * @param length number of tuples to build
+     * @return the override tuples, or {@code null} if there is no override
+     *         level or it carries no level definition
+     */
+    private LevelTuple[] buildOverrideTuples(Paragraph par, int length) {
+        // find the override for this level
+        ListFormatOverrideLevel[] overrideLevels =
+                listTables.getLfoData(par.getIlfo()).getRgLfoLvl();
+        if (overrideLevels.length == 0) {
+            return null;
+        }
+        ListFormatOverrideLevel overrideLevel = overrideLevels[0];
+        if (overrideLevel == null) {
+            return null;
+        }
+        ListLevel listLevel = overrideLevel.getLevel();
+        if (listLevel == null) {
+            return null;
+        }
+        LevelTuple[] levelTuples = new LevelTuple[length];
+        for (int i = 0; i < length; i++) {
+            levelTuples[i] = buildTuple(i, listLevel);
+        }
+
+        return levelTuples;
+
+    }
+
+    /**
+     * Rewrites the number text so that every placeholder character is
+     * replaced by {@code %N}, where N is the placeholder's value plus one.
+     * <p>
+     * Processing of placeholders stops at the first offset that is zero or
+     * that does not fit the text (out of range or not increasing); the
+     * remaining text is appended unchanged.
+     *
+     * @param numberText    number text; {@code null} is treated as empty
+     * @param numberOffsets 1-based offsets of the placeholders in the text;
+     *                      {@code null} is treated as no placeholders
+     * @return the converted text, never {@code null}
+     */
+    private String convertToNewNumberText(String numberText, byte[] numberOffsets) {
+        if (numberText == null) {
+            return "";
+        }
+        if (numberOffsets == null) {
+            return numberText;
+        }
+
+        StringBuilder sb = new StringBuilder();
+        int last = 0;
+        for (int i = 0; i < numberOffsets.length; i++) {
+            int offset = (int) numberOffsets[i];
+
+            if (offset == 0) {
+                break;
+            }
+            if (offset - 1 < last || offset > numberText.length()) {
+                //malformed offset: stop here instead of failing
+                //with an index out of bounds exception
+                break;
+            }
+            sb.append(numberText.substring(last, offset - 1));
+            //need to add one because newer format
+            //adds one.  In .doc, this was the array index;
+            //but in .docx, this is the level number
+            int lvlNum = (int) numberText.charAt(offset - 1) + 1;
+            sb.append('%').append(lvlNum);
+            last = offset;
+        }
+        if (last < numberText.length()) {
+            sb.append(numberText.substring(last));
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Maps the numeric number format code to its name.
+     *
+     * @param numberFormat number format code of a list level
+     * @return the format name; {@code "decimal"} for codes not listed here
+     */
+    private String convertToNewNumFormat(int numberFormat) {
+        switch (numberFormat) {
+            case -1:
+                return "none";
+            case 0:
+                return "decimal";
+            case 1:
+                return "upperRoman";
+            case 2:
+                return "lowerRoman";
+            case 3:
+                return "upperLetter";
+            case 4:
+                return "lowerLetter";
+            case 5:
+                return "ordinal";
+            case 22:
+                return "decimalZero";
+            case 23:
+                return "bullet";
+            case 47:
+                return "none";
+            default:
+                //do we really want to silently swallow these uncovered cases?
+                //throw new RuntimeException("NOT COVERED: " + numberFormat);
+                return "decimal";
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.text.NumberFormat;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Number cell.
+ */
+public class NumberCell implements Cell {
+
+    private final double number;
+
+    private final NumberFormat format;
+
+    public NumberCell(double number, NumberFormat format) {
+        this.number = number;
+        this.format = format;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.characters(format.format(number));
+    }
+
+    public String toString() {
+        return "Numeric Cell: " + format.format(number);
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.GeneralSecurityException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.crypt.EncryptionInfo;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Defines a Microsoft document content extractor.
+ */
+public class OfficeParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7393462244028653479L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    POIFSDocumentType.WORKBOOK.type,
+                    POIFSDocumentType.OLE10_NATIVE.type,
+                    POIFSDocumentType.WORDDOCUMENT.type,
+                    POIFSDocumentType.UNKNOWN.type,
+                    POIFSDocumentType.ENCRYPTED.type,
+                    POIFSDocumentType.POWERPOINT.type,
+                    POIFSDocumentType.PUBLISHER.type,
+                    POIFSDocumentType.PROJECT.type,
+                    POIFSDocumentType.VISIO.type,
+                    // Works isn't supported
+                    POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
+                    POIFSDocumentType.OUTLOOK.type,
+                    POIFSDocumentType.SOLIDWORKS_PART.type,
+                    POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
+                    POIFSDocumentType.SOLIDWORKS_DRAWING.type
+            )));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts properties and text from an MS Document input stream
+     * <p>
+     * If the stream is a {@link TikaInputStream} that already carries an open
+     * POIFS container, that container is reused. A file system newly opened
+     * for a {@link TikaInputStream} is registered on it as its open container.
+     * A file system opened for any other kind of stream belongs to this
+     * method only and is closed before it returns.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        //only set when the file system is not handed over to a TikaInputStream
+        NPOIFSFileSystem mustCloseFs = null;
+        try {
+            final DirectoryNode root;
+            TikaInputStream tstream = TikaInputStream.cast(stream);
+            if (tstream == null) {
+                mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+                root = mustCloseFs.getRoot();
+            } else {
+                final Object container = tstream.getOpenContainer();
+                if (container instanceof NPOIFSFileSystem) {
+                    root = ((NPOIFSFileSystem) container).getRoot();
+                } else if (container instanceof DirectoryNode) {
+                    root = (DirectoryNode) container;
+                } else {
+                    NPOIFSFileSystem fs;
+                    if (tstream.hasFile()) {
+                        fs = new NPOIFSFileSystem(tstream.getFile(), true);
+                    } else {
+                        fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+                    }
+                    tstream.setOpenContainer(fs);
+                    root = fs.getRoot();
+                }
+            }
+            parse(root, context, metadata, xhtml);
+            xhtml.endDocument();
+        } finally {
+            if (mustCloseFs != null) {
+                try {
+                    mustCloseFs.close();
+                } catch (IOException ignored) {
+                    //silent close, so that a failure here cannot
+                    //hide an exception thrown while parsing
+                }
+            }
+        }
+    }
+
+    /**
+     * Extracts the summary metadata of the given directory, detects the
+     * document type and runs the extractor that matches it.
+     *
+     * @param root     root directory of the document
+     * @param context  parse context passed to the extractors
+     * @param metadata receives the summary properties and the content type
+     * @param xhtml    receives the extracted content
+     * @throws EncryptedDocumentException if a protected document cannot be
+     *                                    opened with the available password
+     */
+    protected void parse(
+            DirectoryNode root, ParseContext context, Metadata metadata,
+            XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+
+        // Parse summary entries first, to make metadata available early
+        new SummaryExtractor(metadata).parseSummaries(root);
+
+        // Parse remaining document entries
+        POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+        if (type != POIFSDocumentType.UNKNOWN) {
+            setType(metadata, type.getType());
+        }
+
+        switch (type) {
+            case SOLIDWORKS_PART:
+            case SOLIDWORKS_ASSEMBLY:
+            case SOLIDWORKS_DRAWING:
+                break;
+            case PUBLISHER:
+                PublisherTextExtractor publisherTextExtractor =
+                        new PublisherTextExtractor(root);
+                xhtml.element("p", publisherTextExtractor.getText());
+                break;
+            case WORDDOCUMENT:
+                new WordExtractor(context).parse(root, xhtml);
+                break;
+            case POWERPOINT:
+                new HSLFExtractor(context).parse(root, xhtml);
+                break;
+            case WORKBOOK:
+            case XLR:
+                Locale locale = context.get(Locale.class, Locale.getDefault());
+                new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
+                break;
+            case PROJECT:
+                // We currently can't do anything beyond the metadata
+                break;
+            case VISIO:
+                VisioTextExtractor visioTextExtractor =
+                        new VisioTextExtractor(root);
+                for (String text : visioTextExtractor.getAllText()) {
+                    xhtml.element("p", text);
+                }
+                break;
+            case OUTLOOK:
+                OutlookExtractor extractor =
+                        new OutlookExtractor(root, context);
+
+                extractor.parse(xhtml, metadata);
+                break;
+            case ENCRYPTED:
+                EncryptionInfo info = new EncryptionInfo(root);
+                Decryptor d = Decryptor.getInstance(info);
+
+                try {
+                    // By default, use the default Office Password
+                    String password = Decryptor.DEFAULT_PASSWORD;
+
+                    // If they supplied a Password Provider, ask that for the password,
+                    //  and use the provider given one if available (stick with default if not)
+                    PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+                    if (passwordProvider != null) {
+                        String suppliedPassword = passwordProvider.getPassword(metadata);
+                        if (suppliedPassword != null) {
+                            password = suppliedPassword;
+                        }
+                    }
+
+                    // Check if we've the right password or not
+                    if (!d.verifyPassword(password)) {
+                        throw new EncryptedDocumentException();
+                    }
+
+                    // Decrypt the OLE2 stream, and delegate the resulting OOXML
+                    //  file to the regular OOXML parser for normal handling
+                    OOXMLParser parser = new OOXMLParser();
+
+                    parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+                                    new BodyContentHandler(xhtml)),
+                            metadata, context);
+                } catch (GeneralSecurityException ex) {
+                    throw new EncryptedDocumentException(ex);
+                }
+                // explicit break: do not fall through into the default case
+                break;
+            default:
+                // For unsupported / unhandled types, just the metadata
+                //  is extracted, which happened above
+                break;
+        }
+    }
+
+    private void setType(Metadata metadata, MediaType type) {
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
+    }
+
+    /**
+     * The document types that can be told apart inside a POIFS container,
+     * each with a file extension and a media type.
+     * <p>
+     * Several constants share a media type (for instance the three
+     * SOLIDWORKS ones); {@link #detectType(DirectoryEntry)} returns the
+     * first constant, in declaration order, whose media type matches.
+     */
+    public enum POIFSDocumentType {
+        WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
+        OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
+        COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
+        WORDDOCUMENT("doc", MediaType.application("msword")),
+        UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+        ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
+        POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
+        PUBLISHER("pub", MediaType.application("x-mspublisher")),
+        PROJECT("mpp", MediaType.application("vnd.ms-project")),
+        VISIO("vsd", MediaType.application("vnd.visio")),
+        WORKS("wps", MediaType.application("vnd.ms-works")),
+        XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
+        OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+        SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+        SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+        SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
+
+        private final String extension;
+        private final MediaType type;
+
+        POIFSDocumentType(String extension, MediaType type) {
+            this.extension = extension;
+            this.type = type;
+        }
+
+        public static POIFSDocumentType detectType(POIFSFileSystem fs) {
+            return detectType(fs.getRoot());
+        }
+
+        public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+            return detectType(fs.getRoot());
+        }
+
+        /**
+         * Detects the type of the document stored in the given directory
+         * from the names of its entries.
+         *
+         * @param node directory to inspect
+         * @return the first constant whose media type equals the detected
+         *         one, or {@link #UNKNOWN} if none does
+         */
+        public static POIFSDocumentType detectType(DirectoryEntry node) {
+            Set<String> names = new HashSet<String>();
+            for (Entry entry : node) {
+                names.add(entry.getName());
+            }
+            MediaType type = POIFSContainerDetector.detect(names, node);
+            for (POIFSDocumentType poifsType : values()) {
+                if (type.equals(poifsType.type)) {
+                    return poifsType;
+                }
+            }
+            return UNKNOWN;
+        }
+
+        public String getExtension() {
+            return extension;
+        }
+
+        public MediaType getType() {
+            return type;
+        }
+    }
+
+}

svn commit: r1723223 [9/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ tik...

Reply via email to