Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,633 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import java.awt.*; +import java.io.IOException; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.poi.ddf.EscherBSERecord; +import org.apache.poi.ddf.EscherBlipRecord; +import org.apache.poi.ddf.EscherRecord; +import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; +import org.apache.poi.hssf.eventusermodel.HSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFRequest; +import org.apache.poi.hssf.extractor.OldExcelExtractor; +import org.apache.poi.hssf.record.BOFRecord; +import org.apache.poi.hssf.record.BoundSheetRecord; +import org.apache.poi.hssf.record.CellValueRecordInterface; +import org.apache.poi.hssf.record.CountryRecord; +import org.apache.poi.hssf.record.DateWindow1904Record; +import org.apache.poi.hssf.record.DrawingGroupRecord; +import org.apache.poi.hssf.record.EOFRecord; +import org.apache.poi.hssf.record.ExtendedFormatRecord; +import org.apache.poi.hssf.record.FooterRecord; +import org.apache.poi.hssf.record.FormatRecord; +import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.HeaderRecord; +import org.apache.poi.hssf.record.HyperlinkRecord; +import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.LabelSSTRecord; +import org.apache.poi.hssf.record.NumberRecord; +import org.apache.poi.hssf.record.RKRecord; +import org.apache.poi.hssf.record.Record; +import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.record.StringRecord; +import org.apache.poi.hssf.record.TextObjectRecord; +import org.apache.poi.hssf.record.chart.SeriesTextRecord; +import org.apache.poi.hssf.record.common.UnicodeString; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; +import 
org.apache.poi.hssf.usermodel.HSSFPictureData; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Excel parser implementation which uses POI's Event API + * to handle the contents of a Workbook. + * <p/> + * The Event API uses a much smaller memory footprint than + * <code>HSSFWorkbook</code> when processing excel files + * but at the cost of more complexity. + * <p/> + * With the Event API a <i>listener</i> is registered for + * specific record types and those records are created, + * fired off to the listener and then discarded as the stream + * is being processed. + * + * @see org.apache.poi.hssf.eventusermodel.HSSFListener + * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api"> + * POI Event API How To</a> + */ +public class ExcelExtractor extends AbstractPOIFSExtractor { + + private static final String WORKBOOK_ENTRY = "Workbook"; + private static final String BOOK_ENTRY = "Book"; + /** + * <code>true</code> if the HSSFListener should be registered + * to listen for all records or <code>false</code> (the default) + * if the listener should be configured to only receive specified + * records. + */ + private boolean listenForAllRecords = false; + + public ExcelExtractor(ParseContext context, Metadata metadata) { + super(context, metadata); + } + + /** + * Returns <code>true</code> if this parser is configured to listen + * for all records instead of just the specified few. 
+ */ + public boolean isListenForAllRecords() { + return listenForAllRecords; + } + + /** + * Specifies whether this parser should to listen for all + * records or just for the specified few. + * <p/> + * <strong>Note:</strong> Under normal operation this setting should + * be <code>false</code> (the default), but you can experiment with + * this setting for testing and debugging purposes. + * + * @param listenForAllRecords <code>true</code> if the HSSFListener + * should be registered to listen for all records or <code>false</code> + * if the listener should be configured to only receive specified records. + */ + public void setListenForAllRecords(boolean listenForAllRecords) { + this.listenForAllRecords = listenForAllRecords; + } + + /** + * Extracts text from an Excel Workbook writing the extracted content + * to the specified {@link Appendable}. + * + * @param filesystem POI file system + * @throws IOException if an error occurs processing the workbook + * or writing the extracted content + */ + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, + Locale locale) throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml, locale); + } + + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml, + Locale locale) throws IOException, SAXException, TikaException { + if (!root.hasEntry(WORKBOOK_ENTRY)) { + if (root.hasEntry(BOOK_ENTRY)) { + // Excel 5 / Excel 95 file + // Records are in a different structure so needs a + // different parser to process them + OldExcelExtractor extractor = new OldExcelExtractor(root); + OldExcelParser.parse(extractor, xhtml); + return; + } else { + // Corrupt file / very old file, just skip text extraction + return; + } + } + + // If a password was supplied, use it, otherwise the default + Biff8EncryptionKey.setCurrentUserPassword(getPassword()); + + // Have the file processed in event mode + TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); + 
listener.processFile(root, isListenForAllRecords()); + listener.throwStoredException(); + + for (Entry entry : root) { + if (entry.getName().startsWith("MBD") + && entry instanceof DirectoryEntry) { + try { + handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); + } catch (TikaException e) { + // ignore parse errors from embedded documents + } + } + } + } + + // ====================================================================== + + /** + * HSSF Listener implementation which processes the HSSF records. + */ + private static class TikaHSSFListener implements HSSFListener { + + /** + * XHTML content handler to which the document content is rendered. + */ + private final XHTMLContentHandler handler; + + /** + * The POIFS Extractor, used for embeded resources. + */ + private final AbstractPOIFSExtractor extractor; + /** + * Format for rendering numbers in the worksheet. Currently we just + * use the platform default formatting. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> + */ + private final NumberFormat format; + /** + * Potential exception thrown by the content handler. When set to + * non-<code>null</code>, causes all subsequent HSSF records to be + * ignored and the stored exception to be thrown when + * {@link #throwStoredException()} is invoked. + */ + private Exception exception = null; + private SSTRecord sstRecord; + private FormulaRecord stringFormulaRecord; + private short previousSid; + /** + * Internal <code>FormatTrackingHSSFListener</code> to handle cell + * formatting within the extraction. + */ + private FormatTrackingHSSFListener formatListener; + /** + * List of worksheet names. + */ + private List<String> sheetNames = new ArrayList<String>(); + /** + * Index of the current worksheet within the workbook. + * Used to find the worksheet name in the {@link #sheetNames} list. 
+ */ + private short currentSheetIndex; + /** + * Content of the current worksheet, or <code>null</code> if no + * worksheet is currently active. + */ + private SortedMap<Point, Cell> currentSheet = null; + /** + * Extra text or cells that crops up, typically as part of a + * worksheet but not always. + */ + private List<Cell> extraTextCells = new ArrayList<Cell>(); + /** + * These aren't complete when we first see them, as the + * depend on continue records that aren't always + * contiguous. Collect them for later processing. + */ + private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>(); + + /** + * Construct a new listener instance outputting parsed data to + * the specified XHTML content handler. + * + * @param handler Destination to write the parsed output to + */ + private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) { + this.handler = handler; + this.extractor = extractor; + this.format = NumberFormat.getInstance(locale); + this.formatListener = new FormatTrackingHSSFListener(this, locale); + } + + /** + * Entry point to listener to start the processing of a file. + * + * @param filesystem POI file system. + * @param listenForAllRecords sets whether the listener is configured to listen + * for all records types or not. + * @throws IOException on any IO errors. + * @throws SAXException on any SAX parsing errors. 
+ */ + public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { + processFile(filesystem.getRoot(), listenForAllRecords); + } + + public void processFile(DirectoryNode root, boolean listenForAllRecords) + throws IOException, SAXException, TikaException { + + // Set up listener and register the records we want to process + HSSFRequest hssfRequest = new HSSFRequest(); + if (listenForAllRecords) { + hssfRequest.addListenerForAllRecords(formatListener); + } else { + hssfRequest.addListener(formatListener, BOFRecord.sid); + hssfRequest.addListener(formatListener, EOFRecord.sid); + hssfRequest.addListener(formatListener, DateWindow1904Record.sid); + hssfRequest.addListener(formatListener, CountryRecord.sid); + hssfRequest.addListener(formatListener, BoundSheetRecord.sid); + hssfRequest.addListener(formatListener, SSTRecord.sid); + hssfRequest.addListener(formatListener, FormulaRecord.sid); + hssfRequest.addListener(formatListener, LabelRecord.sid); + hssfRequest.addListener(formatListener, LabelSSTRecord.sid); + hssfRequest.addListener(formatListener, NumberRecord.sid); + hssfRequest.addListener(formatListener, RKRecord.sid); + hssfRequest.addListener(formatListener, StringRecord.sid); + hssfRequest.addListener(formatListener, HyperlinkRecord.sid); + hssfRequest.addListener(formatListener, TextObjectRecord.sid); + hssfRequest.addListener(formatListener, SeriesTextRecord.sid); + hssfRequest.addListener(formatListener, FormatRecord.sid); + hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid); + hssfRequest.addListener(formatListener, DrawingGroupRecord.sid); + hssfRequest.addListener(formatListener, HeaderRecord.sid); + hssfRequest.addListener(formatListener, FooterRecord.sid); + } + + // Create event factory and process Workbook (fire events) + DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY); + HSSFEventFactory eventFactory = new 
HSSFEventFactory(); + try { + eventFactory.processEvents(hssfRequest, documentInputStream); + } catch (org.apache.poi.EncryptedDocumentException e) { + throw new EncryptedDocumentException(e); + } + + // Output any extra text that came after all the sheets + processExtraText(); + + // Look for embeded images, now that the drawing records + // have been fully matched with their continue data + for (DrawingGroupRecord dgr : drawingGroups) { + dgr.decode(); + findPictures(dgr.getEscherRecords()); + } + } + + /** + * Process a HSSF record. + * + * @param record HSSF Record + */ + public void processRecord(Record record) { + if (exception == null) { + try { + internalProcessRecord(record); + } catch (TikaException te) { + exception = te; + } catch (IOException ie) { + exception = ie; + } catch (SAXException se) { + exception = se; + } + } + } + + public void throwStoredException() throws TikaException, SAXException, IOException { + if (exception != null) { + if (exception instanceof IOException) + throw (IOException) exception; + if (exception instanceof SAXException) + throw (SAXException) exception; + if (exception instanceof TikaException) + throw (TikaException) exception; + throw new TikaException(exception.getMessage()); + } + } + + private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { + switch (record.getSid()) { + case BOFRecord.sid: // start of workbook, worksheet etc. records + BOFRecord bof = (BOFRecord) record; + if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { + currentSheetIndex = -1; + } else if (bof.getType() == BOFRecord.TYPE_CHART) { + if (previousSid == EOFRecord.sid) { + // This is a sheet which contains only a chart + newSheet(); + } else { + // This is a chart within a normal sheet + // Handling of this is a bit hacky... 
+ if (currentSheet != null) { + processSheet(); + currentSheetIndex--; + newSheet(); + } + } + } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { + newSheet(); + } + break; + + case EOFRecord.sid: // end of workbook, worksheet etc. records + if (currentSheet != null) { + processSheet(); + } + currentSheet = null; + break; + + case BoundSheetRecord.sid: // Worksheet index record + BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; + sheetNames.add(boundSheetRecord.getSheetname()); + break; + + case SSTRecord.sid: // holds all the strings for LabelSSTRecords + sstRecord = (SSTRecord) record; + break; + + case FormulaRecord.sid: // Cell value from a formula + FormulaRecord formula = (FormulaRecord) record; + if (formula.hasCachedResultString()) { + // The String itself should be the next record + stringFormulaRecord = formula; + } else { + addTextCell(record, formatListener.formatNumberDateCell(formula)); + } + break; + + case StringRecord.sid: + if (previousSid == FormulaRecord.sid) { + // Cached string value of a string formula + StringRecord sr = (StringRecord) record; + addTextCell(stringFormulaRecord, sr.getString()); + } else { + // Some other string not associated with a cell, skip + } + break; + + case LabelRecord.sid: // strings stored directly in the cell + LabelRecord label = (LabelRecord) record; + addTextCell(record, label.getValue()); + break; + + case LabelSSTRecord.sid: // Ref. 
a string in the shared string table + LabelSSTRecord sst = (LabelSSTRecord) record; + UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); + addTextCell(record, unicode.getString()); + break; + + case NumberRecord.sid: // Contains a numeric cell value + NumberRecord number = (NumberRecord) record; + addTextCell(record, formatListener.formatNumberDateCell(number)); + break; + + case RKRecord.sid: // Excel internal number record + RKRecord rk = (RKRecord) record; + addCell(record, new NumberCell(rk.getRKNumber(), format)); + break; + + case HyperlinkRecord.sid: // holds a URL associated with a cell + if (currentSheet != null) { + HyperlinkRecord link = (HyperlinkRecord) record; + Point point = + new Point(link.getFirstColumn(), link.getFirstRow()); + Cell cell = currentSheet.get(point); + if (cell != null) { + String address = link.getAddress(); + if (address != null) { + addCell(record, new LinkedCell(cell, address)); + } else { + addCell(record, cell); + } + } + } + break; + + case TextObjectRecord.sid: + TextObjectRecord tor = (TextObjectRecord) record; + addTextCell(record, tor.getStr().getString()); + break; + + case SeriesTextRecord.sid: // Chart label or title + SeriesTextRecord str = (SeriesTextRecord) record; + addTextCell(record, str.getText()); + break; + + case DrawingGroupRecord.sid: + // Collect this now, we'll process later when all + // the continue records are in + drawingGroups.add((DrawingGroupRecord) record); + break; + + case HeaderRecord.sid: + HeaderRecord headerRecord = (HeaderRecord) record; + addTextCell(record, headerRecord.getText()); + break; + + case FooterRecord.sid: + FooterRecord footerRecord = (FooterRecord) record; + addTextCell(record, footerRecord.getText()); + break; + + } + + previousSid = record.getSid(); + + if (stringFormulaRecord != record) { + stringFormulaRecord = null; + } + } + + private void processExtraText() throws SAXException { + if (extraTextCells.size() > 0) { + for (Cell cell : extraTextCells) { + 
handler.startElement("div", "class", "outside"); + cell.render(handler); + handler.endElement("div"); + } + + // Reset + extraTextCells.clear(); + } + } + + /** + * Adds the given cell (unless <code>null</code>) to the current + * worksheet (if any) at the position (if any) of the given record. + * + * @param record record that holds the cell value + * @param cell cell value (or <code>null</code>) + */ + private void addCell(Record record, Cell cell) throws SAXException { + if (cell == null) { + // Ignore empty cells + } else if (currentSheet != null + && record instanceof CellValueRecordInterface) { + // Normal cell inside a worksheet + CellValueRecordInterface value = + (CellValueRecordInterface) record; + Point point = new Point(value.getColumn(), value.getRow()); + currentSheet.put(point, cell); + } else { + // Cell outside the worksheets + extraTextCells.add(cell); + } + } + + /** + * Adds a text cell with the given text comment. The given text + * is trimmed, and ignored if <code>null</code> or empty. + * + * @param record record that holds the text value + * @param text text content, may be <code>null</code> + * @throws SAXException + */ + private void addTextCell(Record record, String text) throws SAXException { + if (text != null) { + text = text.trim(); + if (text.length() > 0) { + addCell(record, new TextCell(text)); + } + } + } + + private void newSheet() { + currentSheetIndex++; + currentSheet = new TreeMap<Point, Cell>(new PointComparator()); + } + + /** + * Process an excel sheet. 
+ * + * @throws SAXException if an error occurs + */ + private void processSheet() throws SAXException { + // Sheet Start + handler.startElement("div", "class", "page"); + if (currentSheetIndex < sheetNames.size()) { + handler.element("h1", sheetNames.get(currentSheetIndex)); + } + handler.startElement("table"); + handler.startElement("tbody"); + + // Process Rows + int currentRow = 0; + int currentColumn = 0; + handler.startElement("tr"); + handler.startElement("td"); + for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { + while (currentRow < entry.getKey().y) { + handler.endElement("td"); + handler.endElement("tr"); + handler.startElement("tr"); + handler.startElement("td"); + currentRow++; + currentColumn = 0; + } + + while (currentColumn < entry.getKey().x) { + handler.endElement("td"); + handler.startElement("td"); + currentColumn++; + } + + entry.getValue().render(handler); + } + handler.endElement("td"); + handler.endElement("tr"); + + // Sheet End + handler.endElement("tbody"); + handler.endElement("table"); + + // Finish up + processExtraText(); + handler.endElement("div"); + } + + private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException { + for (EscherRecord escherRecord : records) { + if (escherRecord instanceof EscherBSERecord) { + EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); + if (blip != null) { + HSSFPictureData picture = new HSSFPictureData(blip); + String mimeType = picture.getMimeType(); + TikaInputStream stream = TikaInputStream.get(picture.getData()); + + // Handle the embeded resource + extractor.handleEmbeddedResource( + stream, null, null, mimeType, + handler, true + ); + } + } + + // Recursive call. + findPictures(escherRecord.getChildRecords()); + } + } + } + + /** + * Utility comparator for points. 
+ */ + private static class PointComparator implements Comparator<Point> { + + public int compare(Point a, Point b) { + int diff = a.y - b.y; + if (diff == 0) { + diff = a.x - b.x; + } + return diff; + } + + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; + +import org.apache.poi.hslf.model.Comment; +import org.apache.poi.hslf.model.HeadersFooters; +import org.apache.poi.hslf.model.OLEShape; +import org.apache.poi.hslf.usermodel.HSLFMasterSheet; +import org.apache.poi.hslf.usermodel.HSLFNotes; +import org.apache.poi.hslf.usermodel.HSLFObjectData; +import org.apache.poi.hslf.usermodel.HSLFPictureData; +import org.apache.poi.hslf.usermodel.HSLFShape; +import org.apache.poi.hslf.usermodel.HSLFSlide; +import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTable; +import org.apache.poi.hslf.usermodel.HSLFTableCell; +import org.apache.poi.hslf.usermodel.HSLFTextParagraph; +import org.apache.poi.hslf.usermodel.HSLFTextRun; +import org.apache.poi.hslf.usermodel.HSLFTextShape; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class HSLFExtractor extends AbstractPOIFSExtractor { + public HSLFExtractor(ParseContext context) { + super(context); + } + + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml); + } + + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + HSLFSlideShow ss = new HSLFSlideShow(root); + List<HSLFSlide> _slides = ss.getSlides(); + + xhtml.startElement("div", "class", "slideShow"); + + /* Iterate over slides and extract text */ + for (HSLFSlide slide : _slides) { + xhtml.startElement("div", "class", "slide"); + + // Slide 
header, if present + HeadersFooters hf = slide.getHeadersFooters(); + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-header"); + + xhtml.characters(hf.getHeaderText()); + + xhtml.endElement("p"); + } + + // Slide master, if present + extractMaster(xhtml, slide.getMasterSheet()); + + // Slide text + { + xhtml.startElement("div", "class", "slide-content"); + + textRunsToText(xhtml, slide.getTextParagraphs()); + + xhtml.endElement("div"); + } + + // Table text + for (HSLFShape shape : slide.getShapes()) { + if (shape instanceof HSLFTable) { + extractTableText(xhtml, (HSLFTable) shape); + } + } + + // Slide footer, if present + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-footer"); + + xhtml.characters(hf.getFooterText()); + + xhtml.endElement("p"); + } + + // Comments, if present + StringBuilder authorStringBuilder = new StringBuilder(); + for (Comment comment : slide.getComments()) { + authorStringBuilder.setLength(0); + xhtml.startElement("p", "class", "slide-comment"); + + if (comment.getAuthor() != null) { + authorStringBuilder.append(comment.getAuthor()); + } + if (comment.getAuthorInitials() != null) { + if (authorStringBuilder.length() > 0) { + authorStringBuilder.append(" "); + } + authorStringBuilder.append("("+comment.getAuthorInitials()+")"); + } + if (authorStringBuilder.length() > 0) { + if (comment.getText() != null) { + authorStringBuilder.append(" - "); + } + xhtml.startElement("b"); + xhtml.characters(authorStringBuilder.toString()); + xhtml.endElement("b"); + } + if (comment.getText() != null) { + xhtml.characters(comment.getText()); + } + xhtml.endElement("p"); + } + + // Now any embedded resources + handleSlideEmbeddedResources(slide, xhtml); + + // TODO Find the Notes for this slide and extract inline + + // Slide complete + xhtml.endElement("div"); + } + + // All slides done + xhtml.endElement("div"); + + /* 
notes */ + xhtml.startElement("div", "class", "slide-notes"); + HashSet<Integer> seenNotes = new HashSet<>(); + HeadersFooters hf = ss.getNotesHeadersFooters(); + + for (HSLFSlide slide : _slides) { + HSLFNotes notes = slide.getNotes(); + if (notes == null) { + continue; + } + Integer id = notes._getSheetNumber(); + if (seenNotes.contains(id)) { + continue; + } + seenNotes.add(id); + + // Repeat the Notes header, if set + if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { + xhtml.startElement("p", "class", "slide-note-header"); + xhtml.characters(hf.getHeaderText()); + xhtml.endElement("p"); + } + + // Notes text + textRunsToText(xhtml, notes.getTextParagraphs()); + + // Repeat the notes footer, if set + if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { + xhtml.startElement("p", "class", "slide-note-footer"); + xhtml.characters(hf.getFooterText()); + xhtml.endElement("p"); + } + } + + handleSlideEmbeddedPictures(ss, xhtml); + + xhtml.endElement("div"); + } + + private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException { + if (master == null) { + return; + } + List<HSLFShape> shapes = master.getShapes(); + if (shapes == null || shapes.isEmpty()) { + return; + } + + xhtml.startElement("div", "class", "slide-master-content"); + for (HSLFShape shape : shapes) { + if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) { + if (shape instanceof HSLFTextShape) { + HSLFTextShape tsh = (HSLFTextShape) shape; + String text = tsh.getText(); + if (text != null) { + xhtml.element("p", text); + } + } + } + } + xhtml.endElement("div"); + } + + private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException { + xhtml.startElement("table"); + for (int row = 0; row < shape.getNumberOfRows(); row++) { + xhtml.startElement("tr"); + for (int col = 0; col < shape.getNumberOfColumns(); col++) { + HSLFTableCell cell = shape.getCell(row, col); + //insert empty string for 
empty cell if cell is null + String txt = ""; + if (cell != null) { + txt = cell.getText(); + } + xhtml.element("td", txt); + } + xhtml.endElement("tr"); + } + xhtml.endElement("table"); + } + + private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException { + if (paragraphsList == null) { + return; + } + + for (List<HSLFTextParagraph> run : paragraphsList) { + // Leaving in wisdom from TIKA-712 for easy revert. + // Avoid boiler-plate text on the master slide (0 + // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): + //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { + + boolean isBullet = false; + for (HSLFTextParagraph htp : run) { + boolean nextBullet = htp.isBullet(); + // TODO: identify bullet/list type + if (isBullet != nextBullet) { + isBullet = nextBullet; + if (isBullet) { + xhtml.startElement("ul"); + } else { + xhtml.endElement("ul"); + } + } + + List<HSLFTextRun> textRuns = htp.getTextRuns(); + String firstLine = removePBreak(textRuns.get(0).getRawText()); + boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine))); + String paraTag = showBullet ? 
"li" : "p"; + + xhtml.startElement(paraTag); + for (HSLFTextRun htr : textRuns) { + String line = htr.getRawText(); + if (line != null) { + boolean isfirst = true; + for (String fragment : line.split("\\u000b")) { + if (!isfirst) { + xhtml.startElement("br"); + xhtml.endElement("br"); + } + isfirst = false; + xhtml.characters(removePBreak(fragment)); + } + if (line.endsWith("\u000b")) { + xhtml.startElement("br"); + xhtml.endElement("br"); + } + } + } + xhtml.endElement(paraTag); + } + if (isBullet) { + xhtml.endElement("ul"); + } + } + } + + // remove trailing paragraph break + private static String removePBreak(String fragment) { + // the last text run of a text paragraph contains the paragraph break (\r) + // line breaks (\\u000b) can happen more often + return fragment.replaceFirst("\\r$", ""); + } + + private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) + throws TikaException, SAXException, IOException { + for (HSLFPictureData pic : slideshow.getPictureData()) { + String mediaType; + + switch (pic.getType()) { + case EMF: + mediaType = "application/x-emf"; + break; + case WMF: + mediaType = "application/x-msmetafile"; + break; + case DIB: + mediaType = "image/bmp"; + break; + default: + mediaType = pic.getContentType(); + break; + } + + handleEmbeddedResource( + TikaInputStream.get(pic.getData()), null, null, + mediaType, xhtml, false); + } + } + + private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) + throws TikaException, SAXException, IOException { + List<HSLFShape> shapes; + try { + shapes = slide.getShapes(); + } catch (NullPointerException e) { + // Sometimes HSLF hits problems + // Please open POI bugs for any you come across! 
+ return; + } + + for (HSLFShape shape : shapes) { + if (shape instanceof OLEShape) { + OLEShape oleShape = (OLEShape) shape; + HSLFObjectData data = null; + try { + data = oleShape.getObjectData(); + } catch (NullPointerException e) { + /* getObjectData throws NPE some times. */ + } + + if (data != null) { + String objID = Integer.toString(oleShape.getObjectID()); + + // Embedded Object: add a <div + // class="embedded" id="X"/> so consumer can see where + // in the main text each embedded document + // occurred: + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", objID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + + try (TikaInputStream stream = TikaInputStream.get(data.getData())) { + String mediaType = null; + if ("Excel.Chart.8".equals(oleShape.getProgID())) { + mediaType = "application/vnd.ms-excel"; + } + handleEmbeddedResource( + stream, objID, objID, + mediaType, xhtml, false); + } + } + } + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft; + + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.math.BigDecimal; +import java.text.DateFormat; +import java.text.NumberFormat; +import java.util.Date; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import com.healthmarketscience.jackcess.Column; +import com.healthmarketscience.jackcess.DataType; +import com.healthmarketscience.jackcess.Database; +import com.healthmarketscience.jackcess.PropertyMap; +import com.healthmarketscience.jackcess.Row; +import com.healthmarketscience.jackcess.Table; +import com.healthmarketscience.jackcess.query.Query; +import com.healthmarketscience.jackcess.util.OleBlob; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Internal class. 
Needs to be instantiated for each parse because of + * the lack of thread safety with the dateTimeFormatter + */ +class JackcessExtractor extends AbstractPOIFSExtractor { + + final static String TITLE_PROP_KEY = "Title"; + final static String AUTHOR_PROP_KEY = "Author"; + final static String COMPANY_PROP_KEY = "Company"; + + final static String TEXT_FORMAT_KEY = "TextFormat"; + final static String CURRENCY_FORMAT_KEY = "Format"; + final static byte TEXT_FORMAT = 0; + final static byte RICH_TEXT_FORMAT = 1; + final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext(); + + final NumberFormat currencyFormatter; + final DateFormat shortDateTimeFormatter; + + final HtmlParser htmlParser = new HtmlParser(); + + protected JackcessExtractor(ParseContext context, Locale locale) { + super(context); + currencyFormatter = NumberFormat.getCurrencyInstance(locale); + shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale); + } + + public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { + + + String pw = db.getDatabasePassword(); + if (pw != null) { + metadata.set(JackcessParser.MDB_PW, pw); + } + + PropertyMap dbp = db.getDatabaseProperties(); + for (PropertyMap.Property p : dbp) { + metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(), + toString(p.getValue(), p.getType())); + } + + PropertyMap up = db.getUserDefinedProperties(); + for (PropertyMap.Property p : up) { + metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX+ p.getName(), + toString(p.getValue(), p.getType())); + } + + Set<String> found = new HashSet<>(); + PropertyMap summaryProperties = db.getSummaryProperties(); + if (summaryProperties != null) { + //try to get core properties + PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY); + if (title != null) { + metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType())); + found.add(title.getName()); + } + 
PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY); + if (author != null && author.getValue() != null) { + String authorString = toString(author.getValue(), author.getType()); + SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString); + found.add(author.getName()); + } + PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY); + if (company != null) { + metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType())); + found.add(company.getName()); + } + + for (PropertyMap.Property p : db.getSummaryProperties()) { + if (! found.contains(p.getName())) { + metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(), + toString(p.getValue(), p.getType())); + } + } + + } + + Iterator<Table> it = db.newIterable(). + setIncludeLinkedTables(false). + setIncludeSystemTables(false).iterator(); + + while (it.hasNext()) { + Table table = it.next(); + String tableName = table.getName(); + List<? extends Column> columns = table.getColumns(); + xhtml.startElement("table", "name", tableName); + addHeaders(columns, xhtml); + xhtml.startElement("tbody"); + + Row r = table.getNextRow(); + + while (r != null) { + xhtml.startElement("tr"); + for (Column c : columns) { + handleCell(r, c, xhtml); + } + xhtml.endElement("tr"); + r = table.getNextRow(); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + } + + for (Query q : db.getQueries()) { + xhtml.startElement("div", "type", "sqlQuery"); + xhtml.characters(q.toSQLString()); + xhtml.endElement("div"); + } + } + + private void addHeaders(List<? 
extends Column> columns, XHTMLContentHandler xhtml) throws SAXException { + xhtml.startElement("thead"); + xhtml.startElement("tr"); + for (Column c : columns) { + xhtml.startElement("th"); + xhtml.characters(c.getName()); + xhtml.endElement("th"); + } + xhtml.endElement("tr"); + xhtml.endElement("thead"); + + } + + private void handleCell(Row r, Column c, XHTMLContentHandler handler) + throws SAXException, IOException, TikaException { + + handler.startElement("td"); + if (c.getType().equals(DataType.OLE)) { + handleOLE(r, c.getName(), handler); + } else if (c.getType().equals(DataType.BINARY)) { + Object obj = r.get(c.getName()); + if (obj != null) { + byte[] bytes = (byte[])obj; + handleEmbeddedResource( + TikaInputStream.get(bytes), + null,//filename + null,//relationshipId + null,//mediatype + handler, false); + } + } else { + Object obj = r.get(c.getName()); + String v = toString(obj, c.getType()); + if (isRichText(c)) { + BodyContentHandler h = new BodyContentHandler(); + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + try { + htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), + h, + m, EMPTY_PARSE_CONTEXT); + handler.characters(h.toString()); + } catch (SAXException e) { + //if something went wrong in htmlparser, just append the characters + handler.characters(v); + } + } else { + handler.characters(v); + } + } + handler.endElement("td"); + } + + private boolean isRichText(Column c) throws IOException { + + if (c == null) { + return false; + } + + PropertyMap m = c.getProperties(); + if (m == null) { + return false; + } + if (c.getType() == null || ! 
c.getType().equals(DataType.MEMO)) { + return false; + } + Object b = m.getValue(TEXT_FORMAT_KEY); + if (b instanceof Byte) { + if (((Byte)b).byteValue() == RICH_TEXT_FORMAT) { + return true; + } + } + return false; + } + + private String toString(Object value, DataType type) { + if (value == null) { + return ""; + } + if (type == null) { + //this shouldn't happen + return value.toString(); + } + switch (type) { + case LONG: + return Integer.toString((Integer)value); + case TEXT: + return (String)value; + case MONEY: + //TODO: consider getting parsing "Format" field from + //field properties. + return formatCurrency(((BigDecimal)value).doubleValue(), type); + case SHORT_DATE_TIME: + return formatShortDateTime((Date)value); + case BOOLEAN: + return Boolean.toString((Boolean) value); + case MEMO: + return (String)value; + case INT: + return Short.toString((Short)value); + case DOUBLE: + return Double.toString((Double)value); + case FLOAT: + return Float.toString((Float)value); + case NUMERIC: + return value.toString(); + case BYTE: + return Byte.toString((Byte)value); + case GUID: + return value.toString(); + case COMPLEX_TYPE: //skip all these + case UNKNOWN_0D: + case UNKNOWN_11: + case UNSUPPORTED_FIXEDLEN: + case UNSUPPORTED_VARLEN: + default: + return ""; + + } + } + + private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { + OleBlob blob = row.getBlob(cName); + //lifted shamelessly from Jackcess's OleBlobTest + if (blob == null) + return; + + OleBlob.Content content = blob.getContent(); + if (content == null) + return; + + switch (content.getType()) { + case LINK: + xhtml.characters(((OleBlob.LinkContent) content).getLinkPath()); + break; + case SIMPLE_PACKAGE: + OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content; + + handleEmbeddedResource( + TikaInputStream.get(spc.getStream()), + spc.getFileName(),//filename + null,//relationshipId + spc.getTypeName(),//mediatype + 
xhtml, false); + break; + case OTHER: + OleBlob.OtherContent oc = (OleBlob.OtherContent) content; + handleEmbeddedResource( + TikaInputStream.get(oc.getStream()), + null,//filename + null,//relationshipId + oc.getTypeName(),//mediatype + xhtml, false); + break; + case COMPOUND_STORAGE: + OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content; + handleCompoundContent(cc, xhtml); + break; + } + } + + private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { + NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream()); + handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + } + + String formatCurrency(Double d, DataType type) { + if (d == null) { + return ""; + } + return currencyFormatter.format(d); + } + + String formatShortDateTime(Date d) { + if (d == null) { + return ""; + } + return shortDateTimeFormatter.format(d); + } +} + Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft; + + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Locale; +import java.util.Set; + +import com.healthmarketscience.jackcess.CryptCodecProvider; +import com.healthmarketscience.jackcess.Database; +import com.healthmarketscience.jackcess.DatabaseBuilder; +import com.healthmarketscience.jackcess.util.LinkResolver; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser that handles Microsoft Access files via + * <a href="http://jackcess.sourceforge.net/">Jackcess</a> + * <p> + * Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill, + * and James Ahlborn for relicensing Jackcess to Apache v2.0! 
+ */ +public class JackcessParser extends AbstractParser { + + public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER; + public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER; + public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER; + public static Property MDB_PW = Property.externalText("Password"); + private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver(); + + //TODO: figure out how to get this info + // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases"); + + private static final long serialVersionUID = -752276948656079347L; + + private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess"); + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); + + private Locale locale = Locale.ROOT; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + TikaInputStream tis = TikaInputStream.get(stream); + Database db = null; + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + String password = null; + PasswordProvider passwordProvider = context.get(PasswordProvider.class); + if (passwordProvider != null) { + password = passwordProvider.getPassword(metadata); + } + try { + if (password == null) { + //do this to ensure encryption/wrong password exception vs. more generic + //"need right codec" error message. 
+ db = new DatabaseBuilder(tis.getFile()) + .setCodecProvider(new CryptCodecProvider()) + .setReadOnly(true).open(); + } else { + db = new DatabaseBuilder(tis.getFile()) + .setCodecProvider(new CryptCodecProvider(password)) + .setReadOnly(true).open(); + } + db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case + JackcessExtractor ex = new JackcessExtractor(context, locale); + ex.parse(db, xhtml, metadata); + } catch (IllegalStateException e) { + if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) { + throw new EncryptedDocumentException(e); + } + throw e; + } finally { + if (db != null) { + try { + db.close(); + } catch (IOException e) { + //swallow = silent close + } + } + } + xhtml.endDocument(); + } + + private static final class IgnoreLinkResolver implements LinkResolver { + //If links are resolved, Jackcess might try to open and process + //any file on the current system that is specified as a linked db. + //This could be a nasty security issue. + @Override + public Database resolveLinkedDatabase(Database database, String s) throws IOException { + throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!"); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Linked cell. This class decorates another content cell with a hyperlink. + */ +public class LinkedCell extends CellDecorator { + + private final String link; + + public LinkedCell(Cell cell, String link) { + super(cell); + assert link != null; + this.link = link; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.startElement("a", "href", link); + super.render(handler); + handler.endElement("a"); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,190 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.util.NoSuchElementException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.ListData; +import org.apache.poi.hwpf.model.ListFormatOverrideLevel; +import org.apache.poi.hwpf.model.ListLevel; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.Paragraph; + +/** + * Computes the number text which goes at the beginning of each list paragraph + * <p/> + * <p><em>Note:</em> This class only handles the raw number text and does not apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.</p> + * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p> + * <p>Further, this class does not yet handle overrides</p> + */ +public class ListManager extends AbstractListManager { + + private static final Log logger = LogFactory.getLog(ListManager.class); + private final ListTables listTables; + + /** + * Ordinary constructor for a new list reader + * + * 
@param document Document to process + */ + public ListManager(final HWPFDocument document) { + this.listTables = document.getListTables(); + } + + /** + * Get the formatted number for a given paragraph + * <p/> + * <p><em>Note:</em> This only works correctly if called subsequently for <em>all</em> paragraphs in a valid selection (main document, text field, ...) which are part of a list.</p> + * + * @param paragraph list paragraph to process + * @return String which represents the numbering of this list paragraph; never {@code null}, can be empty string, though, + * if something goes wrong in getList() + * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list + */ + public String getFormattedNumber(final Paragraph paragraph) { + if (paragraph == null) throw new IllegalArgumentException("Given paragraph cannot be null."); + if (!paragraph.isInList()) throw new IllegalArgumentException("Can only process list paragraphs."); + //lsid is equivalent to docx's abnum + //ilfo is equivalent to docx's num + int currAbNumId = -1; + try{ + currAbNumId = paragraph.getList().getLsid(); + } catch (NoSuchElementException e) { + //somewhat frequent exception when initializing HWPFList + return ""; + } catch (IllegalArgumentException e) { + return ""; + } catch (NullPointerException e) { + return ""; + } + + int currNumId = paragraph.getIlfo(); + ParagraphLevelCounter lc = listLevelMap.get(currAbNumId); + LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId); + + if (lc == null) { + ListData listData = listTables.getListData(paragraph.getList().getLsid()); + LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length]; + for (int i = 0; i < listData.getLevels().length; i++) { + levelTuples[i] = buildTuple(i, listData.getLevels()[i]); + } + lc = new ParagraphLevelCounter(levelTuples); + } + if (overrideTuples == null) { + overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels()); + } + String formattedString = 
lc.incrementLevel(paragraph.getIlvl(), overrideTuples); + + listLevelMap.put(currAbNumId, lc); + overrideTupleMap.put(currNumId, overrideTuples); + return formattedString; + } + + private LevelTuple buildTuple(int i, ListLevel listLevel) { + boolean isLegal = false; + int start = 1; + int restart = -1; + String lvlText = "%" + i + "."; + String numFmt = "decimal"; + + start = listLevel.getStartAt(); + restart = listLevel.getRestart(); + isLegal = listLevel.isLegalNumbering(); + numFmt = convertToNewNumFormat(listLevel.getNumberFormat()); + lvlText = convertToNewNumberText(listLevel.getNumberText(), listLevel.getLevelNumberingPlaceholderOffsets()); + return new LevelTuple(start, restart, lvlText, numFmt, isLegal); + } + + private LevelTuple[] buildOverrideTuples(Paragraph par, int length) { + ListFormatOverrideLevel overrideLevel; + // find the override for this level + if (listTables.getLfoData(par.getIlfo()).getRgLfoLvl().length == 0) { + return null; + } + overrideLevel = listTables.getLfoData(par.getIlfo()).getRgLfoLvl()[0]; + if (overrideLevel == null) { + return null; + } + LevelTuple[] levelTuples = new LevelTuple[length]; + ListLevel listLevel = overrideLevel.getLevel(); + if (listLevel == null) { + return null; + } + for (int i = 0; i < length; i++) { + levelTuples[i] = buildTuple(i, listLevel); + } + + return levelTuples; + + } + + private String convertToNewNumberText(String numberText, byte[] numberOffsets) { + + StringBuilder sb = new StringBuilder(); + int last = 0; + for (int i = 0; i < numberOffsets.length; i++) { + int offset = (int) numberOffsets[i]; + + if (offset == 0) { + break; + } + sb.append(numberText.substring(last, offset - 1)); + //need to add one because newer format + //adds one. 
In .doc, this was the array index; + //but in .docx, this is the level number + int lvlNum = (int) numberText.charAt(offset - 1) + 1; + sb.append("%" + lvlNum); + last = offset; + } + if (last < numberText.length()) { + sb.append(numberText.substring(last)); + } + return sb.toString(); + } + + private String convertToNewNumFormat(int numberFormat) { + switch (numberFormat) { + case -1: + return "none"; + case 0: + return "decimal"; + case 1: + return "upperRoman"; + case 2: + return "lowerRoman"; + case 3: + return "upperLetter"; + case 4: + return "lowerLetter"; + case 5: + return "ordinal"; + case 22: + return "decimalZero"; + case 23: + return "bullet"; + case 47: + return "none"; + default: + //do we really want to silently swallow these uncovered cases? + //throw new RuntimeException("NOT COVERED: " + numberFormat); + return "decimal"; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.text.NumberFormat; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Number cell. + */ +public class NumberCell implements Cell { + + private final double number; + + private final NumberFormat format; + + public NumberCell(double number, NumberFormat format) { + this.number = number; + this.format = format; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.characters(format.format(number)); + } + + public String toString() { + return "Numeric Cell: " + format.format(number); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.security.GeneralSecurityException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hpbf.extractor.PublisherTextExtractor; +import org.apache.poi.poifs.crypt.Decryptor; +import org.apache.poi.poifs.crypt.EncryptionInfo; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import 
org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Defines a Microsoft document content extractor.
 * <p>
 * The parser opens the input as an OLE2 (POIFS) container, extracts the
 * summary metadata, detects the concrete document type from the entries
 * of the root directory and then hands over to the extractor for that type.
 */
public class OfficeParser extends AbstractParser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 7393462244028653479L;

    /**
     * Media types advertised by {@link #getSupportedTypes(ParseContext)}.
     * {@code COMP_OBJ} and {@code WORKS} are intentionally absent from the list.
     */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    POIFSDocumentType.WORKBOOK.type,
                    POIFSDocumentType.OLE10_NATIVE.type,
                    POIFSDocumentType.WORDDOCUMENT.type,
                    POIFSDocumentType.UNKNOWN.type,
                    POIFSDocumentType.ENCRYPTED.type,
                    POIFSDocumentType.POWERPOINT.type,
                    POIFSDocumentType.PUBLISHER.type,
                    POIFSDocumentType.PROJECT.type,
                    POIFSDocumentType.VISIO.type,
                    // Works isn't supported
                    POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
                    POIFSDocumentType.OUTLOOK.type,
                    POIFSDocumentType.SOLIDWORKS_PART.type,
                    POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
                    POIFSDocumentType.SOLIDWORKS_DRAWING.type
            )));

    /**
     * Returns the fixed, unmodifiable set of media types this parser handles.
     *
     * @param context parse context (not consulted)
     * @return the supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts properties and text from an MS Document input stream.
     * <p>
     * If the stream is a {@link TikaInputStream} that already carries an open
     * container (an {@link NPOIFSFileSystem} or a {@link DirectoryNode}), that
     * container is reused. Otherwise a new filesystem is opened, from the
     * backing file when one exists, else from the stream itself.
     *
     * @param stream   the document stream; it is shielded from being closed
     *                 when a filesystem is built directly on top of it
     * @param handler  receives the XHTML SAX events
     * @param metadata populated with document metadata and content type
     * @param context  parse context, passed on to the type specific extractors
     * @throws IOException   if the container cannot be read
     * @throws SAXException  if the handler fails
     * @throws TikaException if the document cannot be parsed
     */
    @Override
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // Only set for a filesystem created here that nobody else holds a
        // reference to; such a filesystem has to be released by this method.
        NPOIFSFileSystem mustCloseFs = null;
        try {
            final DirectoryNode root;
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream == null) {
                mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
                root = mustCloseFs.getRoot();
            } else {
                final Object container = tstream.getOpenContainer();
                if (container instanceof NPOIFSFileSystem) {
                    root = ((NPOIFSFileSystem) container).getRoot();
                } else if (container instanceof DirectoryNode) {
                    root = (DirectoryNode) container;
                } else {
                    NPOIFSFileSystem fs;
                    if (tstream.hasFile()) {
                        fs = new NPOIFSFileSystem(tstream.getFile(), true);
                    } else {
                        fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
                    }
                    // The filesystem is registered on the stream so later
                    // consumers can reuse it; it is therefore not closed here.
                    // NOTE(review): presumably TikaInputStream releases the
                    // open container when it is closed - confirm.
                    tstream.setOpenContainer(fs);
                    root = fs.getRoot();
                }
            }
            parse(root, context, metadata, xhtml);
        } finally {
            if (mustCloseFs != null) {
                try {
                    mustCloseFs.close();
                } catch (IOException ignored) {
                    // Parsing has already finished (or failed with its own
                    // exception); a failure while releasing the filesystem
                    // must not replace that outcome.
                }
            }
        }
        xhtml.endDocument();
    }

    /**
     * Parses an already opened OLE2 directory.
     * <p>
     * Summary information is extracted first, then the document type is
     * detected and, unless it is {@code UNKNOWN}, written to the metadata as
     * the content type. Finally the matching extractor writes the content to
     * {@code xhtml}. Types without an extractor yield metadata only.
     *
     * @param root     root directory of the OLE2 container
     * @param context  parse context; consulted for a {@link Locale} (workbooks)
     *                 and a {@link PasswordProvider} (encrypted documents)
     * @param metadata populated with summary properties and content type
     * @param xhtml    receives the extracted content
     * @throws IOException   if the container cannot be read
     * @throws SAXException  if the handler fails
     * @throws TikaException if extraction fails; an
     *                       {@link EncryptedDocumentException} is thrown when
     *                       an encrypted document cannot be unlocked
     */
    protected void parse(
            DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {

        // Parse summary entries first, to make metadata available early
        new SummaryExtractor(metadata).parseSummaries(root);

        // Parse remaining document entries
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (type != POIFSDocumentType.UNKNOWN) {
            setType(metadata, type.getType());
        }

        switch (type) {
            case SOLIDWORKS_PART:
            case SOLIDWORKS_ASSEMBLY:
            case SOLIDWORKS_DRAWING:
                // Metadata only
                break;
            case PUBLISHER:
                PublisherTextExtractor publisherTextExtractor =
                        new PublisherTextExtractor(root);
                xhtml.element("p", publisherTextExtractor.getText());
                break;
            case WORDDOCUMENT:
                new WordExtractor(context).parse(root, xhtml);
                break;
            case POWERPOINT:
                new HSLFExtractor(context).parse(root, xhtml);
                break;
            case WORKBOOK:
            case XLR:
                Locale locale = context.get(Locale.class, Locale.getDefault());
                new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
                break;
            case PROJECT:
                // We currently can't do anything beyond the metadata
                break;
            case VISIO:
                VisioTextExtractor visioTextExtractor =
                        new VisioTextExtractor(root);
                for (String text : visioTextExtractor.getAllText()) {
                    xhtml.element("p", text);
                }
                break;
            case OUTLOOK:
                OutlookExtractor extractor =
                        new OutlookExtractor(root, context);

                extractor.parse(xhtml, metadata);
                break;
            case ENCRYPTED:
                EncryptionInfo info = new EncryptionInfo(root);
                Decryptor d = Decryptor.getInstance(info);

                try {
                    // By default, use the default Office Password
                    String password = Decryptor.DEFAULT_PASSWORD;

                    // If they supplied a Password Provider, ask that for the password,
                    //  and use the provider given one if available (stick with default if not)
                    PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                    if (passwordProvider != null) {
                        String suppliedPassword = passwordProvider.getPassword(metadata);
                        if (suppliedPassword != null) {
                            password = suppliedPassword;
                        }
                    }

                    // Check if we've the right password or not
                    if (!d.verifyPassword(password)) {
                        throw new EncryptedDocumentException();
                    }

                    // Decrypt the OLE2 stream, and delegate the resulting OOXML
                    //  file to the regular OOXML parser for normal handling
                    OOXMLParser parser = new OOXMLParser();

                    parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
                                    new BodyContentHandler(xhtml)),
                            metadata, context);
                } catch (GeneralSecurityException ex) {
                    throw new EncryptedDocumentException(ex);
                }
                // Explicit break: do not rely on falling through into default
                break;
            default:
                // For unsupported / unhandled types, just the metadata
                //  is extracted, which happened above
                break;
        }
    }

    /**
     * Stores the given media type as the content type of the document.
     */
    private void setType(Metadata metadata, MediaType type) {
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
    }

    /**
     * The document kinds this parser distinguishes inside an OLE2 container,
     * each paired with a file extension and a media type.
     */
    public enum POIFSDocumentType {
        WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
        OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
        COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
        WORDDOCUMENT("doc", MediaType.application("msword")),
        UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
        ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
        POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
        PUBLISHER("pub", MediaType.application("x-mspublisher")),
        PROJECT("mpp", MediaType.application("vnd.ms-project")),
        VISIO("vsd", MediaType.application("vnd.visio")),
        WORKS("wps", MediaType.application("vnd.ms-works")),
        XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
        OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
        // NOTE(review): the three SolidWorks constants share one media type, so
        // the first-match loop in detectType(DirectoryEntry) can only ever
        // return SOLIDWORKS_PART for them - confirm this is intended.
        SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
        SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
        SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));

        private final String extension;
        private final MediaType type;

        POIFSDocumentType(String extension, MediaType type) {
            this.extension = extension;
            this.type = type;
        }

        /**
         * Detects the document type from the root directory of the filesystem.
         */
        public static POIFSDocumentType detectType(POIFSFileSystem fs) {
            return detectType(fs.getRoot());
        }

        /**
         * Detects the document type from the root directory of the filesystem.
         */
        public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
            return detectType(fs.getRoot());
        }

        /**
         * Detects the document type of a directory. The names of its direct
         * entries are collected and passed, together with the directory, to
         * {@link POIFSContainerDetector}; the first constant (in declaration
         * order) whose media type equals the detected one is returned.
         *
         * @param node the directory to inspect
         * @return the matching type, or {@link #UNKNOWN} if none matches
         */
        public static POIFSDocumentType detectType(DirectoryEntry node) {
            Set<String> names = new HashSet<String>();
            for (Entry entry : node) {
                names.add(entry.getName());
            }
            MediaType type = POIFSContainerDetector.detect(names, node);
            for (POIFSDocumentType poifsType : values()) {
                if (type.equals(poifsType.type)) {
                    return poifsType;
                }
            }
            return UNKNOWN;
        }

        /**
         * @return the file extension associated with this type, without a dot
         */
        public String getExtension() {
            return extension;
        }

        /**
         * @return the media type associated with this type
         */
        public MediaType getType() {
            return type;
        }
    }

}
