Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.hmef.Attachment; +import org.apache.poi.hmef.HMEFMessage; +import org.apache.poi.hmef.attribute.MAPIAttribute; +import org.apache.poi.hmef.attribute.MAPIRtfAttribute; +import org.apache.poi.hsmf.datatypes.MAPIProperty; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A POI-powered Tika Parser for TNEF (Transport Neutral + * Encoding Format) messages, aka winmail.dat + */ +public class TNEFParser extends AbstractParser { + private static final long serialVersionUID = 4611820730372823452L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-tnef"), + MediaType.application("ms-tnef"), + MediaType.application("x-tnef") + ))); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Extracts properties and text from an MS Document input stream + */ + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // We work by recursing, so get the appropriate bits + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + 
EmbeddedDocumentExtractor embeddedExtractor; + if (ex == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } else { + embeddedExtractor = ex; + } + + // Ask POI to process the file for us + HMEFMessage msg = new HMEFMessage(stream); + + // Set the message subject if known + String subject = msg.getSubject(); + if (subject != null && subject.length() > 0) { + // TODO: Move to title in Tika 2.0 + metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject); + } + + // Recurse into the message body RTF + MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED); + if (attr != null && attr instanceof MAPIRtfAttribute) { + MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr; + handleEmbedded( + "message.rtf", "application/rtf", + rtf.getData(), + embeddedExtractor, handler + ); + } + + // Recurse into each attachment in turn + for (Attachment attachment : msg.getAttachments()) { + String name = attachment.getLongFilename(); + if (name == null || name.length() == 0) { + name = attachment.getFilename(); + } + if (name == null || name.length() == 0) { + String ext = attachment.getExtension(); + if (ext != null) { + name = "unknown" + ext; + } + } + handleEmbedded( + name, null, attachment.getContents(), + embeddedExtractor, handler + ); + } + } + + private void handleEmbedded(String name, String type, byte[] contents, + EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler) + throws IOException, SAXException, TikaException { + Metadata metadata = new Metadata(); + if (name != null) + metadata.set(Metadata.RESOURCE_NAME_KEY, name); + if (type != null) + metadata.set(Metadata.CONTENT_TYPE, type); + + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + TikaInputStream.get(contents), + new EmbeddedContentHandler(handler), + metadata, false); + } + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Text cell. 
+ */ +public class TextCell implements Cell { + + private final String text; + + public TextCell(String text) { + this.text = text; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.characters(text); + } + + public String toString() { + return "Text Cell: \"" + text + "\""; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,711 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFOldDocument; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.hwpf.extractor.Word6Extractor; +import org.apache.poi.hwpf.model.FieldsDocumentPart; +import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.model.StyleDescription; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Field; +import org.apache.poi.hwpf.usermodel.HeaderStories; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class WordExtractor extends AbstractPOIFSExtractor { + + private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; + private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; + // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3 + private static final String LIST_DELIMITER = " "; + private 
static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<String, TagAndStyle>(); + private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null); + + static { + fixedParagraphStyles.put("Default", defaultParagraphStyle); + fixedParagraphStyles.put("Normal", defaultParagraphStyle); + fixedParagraphStyles.put("heading", new TagAndStyle("h1", null)); + fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null)); + fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title")); + fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle")); + fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null)); + } + + // True if we are currently in the named style tag: + private boolean curStrikeThrough; + private boolean curBold; + private boolean curItalic; + + public WordExtractor(ParseContext context) { + super(context); + } + + private static int countParagraphs(Range... ranges) { + int count = 0; + for (Range r : ranges) { + if (r != null) { + count += r.numParagraphs(); + } + } + return count; + } + + /** + * Given a style name, return what tag should be used, and + * what style should be applied to it. + */ + public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) { + TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName); + if (tagAndStyle != null) { + return tagAndStyle; + } + + if (styleName.equals("Table Contents") && isTable) { + return defaultParagraphStyle; + } + + String tag = "p"; + String styleClass = null; + + if (styleName.startsWith("heading") || styleName.startsWith("Heading")) { + // "Heading 3" or "Heading2" or "heading 4" + int num = 1; + try { + num = Integer.parseInt( + styleName.substring(styleName.length() - 1) + ); + } catch (NumberFormatException e) { + } + // Turn it into a H1 - H6 (H7+ isn't valid!) 
+ tag = "h" + Math.min(num, 6); + } else { + styleClass = styleName.replace(' ', '_'); + styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) + + styleClass.substring(1); + } + + return new TagAndStyle(tag, styleClass); + } + + protected void parse( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + parse(filesystem.getRoot(), xhtml); + } + + protected void parse( + DirectoryNode root, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + HWPFDocument document; + try { + document = new HWPFDocument(root); + } catch (OldWordFileFormatException e) { + parseWord6(root, xhtml); + return; + } + org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = + new org.apache.poi.hwpf.extractor.WordExtractor(document); + HeaderStories headerFooter = new HeaderStories(document); + + // Grab the list of pictures. As far as we can tell, + // the pictures should be in order, and may be directly + // placed or referenced from an anchor + PicturesTable pictureTable = document.getPicturesTable(); + PicturesSource pictures = new PicturesSource(document); + + // Do any headers, if present + Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(), + headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange()}; + handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); + + // Do the main paragraph text + Range r = document.getRange(); + ListManager listManager = new ListManager(document); + for (int i = 0; i < r.numParagraphs(); i++) { + Paragraph p = r.getParagraph(i); + i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); + } + + // Do everything else + for (String paragraph : wordExtractor.getMainTextboxText()) { + xhtml.element("p", paragraph); + } + + for (String paragraph : wordExtractor.getFootnoteText()) { + xhtml.element("p", paragraph); + } + + for (String paragraph : 
wordExtractor.getCommentsText()) { + xhtml.element("p", paragraph); + } + + for (String paragraph : wordExtractor.getEndnoteText()) { + xhtml.element("p", paragraph); + } + + // Do any footers, if present + Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(), + headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange()}; + handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); + + // Handle any pictures that we haven't output yet + for (Picture p = pictures.nextUnclaimed(); p != null; ) { + handlePictureCharacterRun( + null, p, pictures, xhtml + ); + p = pictures.nextUnclaimed(); + } + + // Handle any embeded office documents + try { + DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); + for (Entry entry : op) { + if (entry.getName().startsWith("_") + && entry instanceof DirectoryEntry) { + handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); + } + } + } catch (FileNotFoundException e) { + } + } + + private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, + PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { + if (countParagraphs(ranges) > 0) { + xhtml.startElement("div", "class", type); + ListManager listManager = new ListManager(document); + for (Range r : ranges) { + if (r != null) { + for (int i = 0; i < r.numParagraphs(); i++) { + Paragraph p = r.getParagraph(i); + + i += handleParagraph(p, 0, r, document, + FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml); + } + } + } + xhtml.endElement("div"); + } + } + + private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, + FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, + XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { + // Note - a poi bug means we can't currently properly recurse + // into 
nested tables, so currently we don't + if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { + Table t = r.getTable(p); + xhtml.startElement("table"); + xhtml.startElement("tbody"); + for (int rn = 0; rn < t.numRows(); rn++) { + TableRow row = t.getRow(rn); + xhtml.startElement("tr"); + for (int cn = 0; cn < row.numCells(); cn++) { + TableCell cell = row.getCell(cn); + xhtml.startElement("td"); + + for (int pn = 0; pn < cell.numParagraphs(); pn++) { + Paragraph cellP = cell.getParagraph(pn); + handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml); + } + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + return (t.numParagraphs() - 1); + } + + String text = p.text(); + if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { + // Skip empty paragraphs + return 0; + } + + TagAndStyle tas; + String numbering = null; + + if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { + StyleDescription style = + document.getStyleSheet().getStyleDescription(p.getStyleIndex()); + if (style != null && style.getName() != null && style.getName().length() > 0) { + if (p.isInList()) { + numbering = listManager.getFormattedNumber(p); + } + tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); + } else { + tas = new TagAndStyle("p", null); + } + } else { + tas = new TagAndStyle("p", null); + } + + if (tas.getStyleClass() != null) { + xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); + } else { + xhtml.startElement(tas.getTag()); + } + + if (numbering != null) { + xhtml.characters(numbering); + } + + for (int j = 0; j < p.numCharacterRuns(); j++) { + CharacterRun cr = p.getCharacterRun(j); + + // FIELD_BEGIN_MARK: + if (cr.text().getBytes(UTF_8)[0] == 0x13) { + Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); + // 58 is an embedded document + // 56 is a 
document link + if (field != null && (field.getType() == 58 || field.getType() == 56)) { + // Embedded Object: add a <div + // class="embedded" id="_X"/> so consumer can see where + // in the main text each embedded document + // occurred: + String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", id); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + + if (cr.text().equals("\u0013")) { + j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); + } else if (cr.text().startsWith("\u0008")) { + // Floating Picture(s) + for (int pn = 0; pn < cr.text().length(); pn++) { + // Assume they're in the order from the unclaimed list... + Picture picture = pictures.nextUnclaimed(); + + // Output + handlePictureCharacterRun(cr, picture, pictures, xhtml); + } + } else if (pictureTable.hasPicture(cr)) { + // Inline Picture + Picture picture = pictures.getFor(cr); + handlePictureCharacterRun(cr, picture, pictures, xhtml); + } else { + handleCharacterRun(cr, tas.isHeading(), xhtml); + } + } + + // Close any still open style tags + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (curItalic) { + xhtml.endElement("i"); + curItalic = false; + } + if (curBold) { + xhtml.endElement("b"); + curBold = false; + } + + xhtml.endElement(tas.getTag()); + + return 0; + } + + private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) + throws SAXException { + // Skip trailing newlines + if (!isRendered(cr) || cr.text().equals("\r")) + return; + + if (!skipStyling) { + if (cr.isBold() != curBold) { + // Enforce nesting -- must close s and i tags + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (curItalic) { + xhtml.endElement("i"); + curItalic = false; + } 
+ if (cr.isBold()) { + xhtml.startElement("b"); + } else { + xhtml.endElement("b"); + } + curBold = cr.isBold(); + } + + if (cr.isItalic() != curItalic) { + // Enforce nesting -- must close s tag + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (cr.isItalic()) { + xhtml.startElement("i"); + } else { + xhtml.endElement("i"); + } + curItalic = cr.isItalic(); + } + + if (cr.isStrikeThrough() != curStrikeThrough) { + if (cr.isStrikeThrough()) { + xhtml.startElement("s"); + } else { + xhtml.endElement("s"); + } + curStrikeThrough = cr.isStrikeThrough(); + } + } + + // Clean up the text + String text = cr.text(); + text = text.replace('\r', '\n'); + if (text.endsWith("\u0007")) { + // Strip the table cell end marker + text = text.substring(0, text.length() - 1); + } + + // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: + + // Non-breaking hyphens are returned as char 30 + text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); + + // Non-required hyphens to zero-width space + text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); + + // Control characters as line break + text = text.replaceAll("[\u0000-\u001f]", "\n"); + xhtml.characters(text); + } + + /** + * Can be \13..text..\15 or \13..control..\14..text..\15 . + * Nesting is allowed + */ + private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, + PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException { + List<CharacterRun> controls = new ArrayList<CharacterRun>(); + List<CharacterRun> texts = new ArrayList<CharacterRun>(); + boolean has14 = false; + + // Split it into before and after the 14 + int i; + for (i = index + 1; i < p.numCharacterRuns(); i++) { + CharacterRun cr = p.getCharacterRun(i); + if (cr.text().equals("\u0013")) { + // Nested, oh joy... 
+ int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml); + i += increment; + } else if (cr.text().equals("\u0014")) { + has14 = true; + } else if (cr.text().equals("\u0015")) { + if (!has14) { + texts = controls; + controls = new ArrayList<CharacterRun>(); + } + break; + } else { + if (has14) { + texts.add(cr); + } else { + controls.add(cr); + } + } + } + + // Do we need to do something special with this? + if (controls.size() > 0) { + String text = controls.get(0).text(); + for (int j = 1; j < controls.size(); j++) { + text += controls.get(j).text(); + } + + if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) + && text.indexOf('"') > -1) { + int start = text.indexOf('"') + 1; + int end = findHyperlinkEnd(text, start); + String url = ""; + if (start >= 0 && start < end && end <= text.length()) { + url = text.substring(start, end); + } + + xhtml.startElement("a", "href", url); + for (CharacterRun cr : texts) { + handleCharacterRun(cr, skipStyling, xhtml); + } + xhtml.endElement("a"); + } else { + // Just output the text ones + for (CharacterRun cr : texts) { + if (pictures.hasPicture(cr)) { + Picture picture = pictures.getFor(cr); + handlePictureCharacterRun(cr, picture, pictures, xhtml); + } else { + handleCharacterRun(cr, skipStyling, xhtml); + } + } + } + } else { + // We only had text + // Output as-is + for (CharacterRun cr : texts) { + handleCharacterRun(cr, skipStyling, xhtml); + } + } + + // Tell them how many to skip over + return i - index; + } + + //temporary work around for TIKA-1512 + private int findHyperlinkEnd(String text, int start) { + int end = text.lastIndexOf('"'); + if (end > start) { + return end; + } + end = text.lastIndexOf('\u201D');//smart right double quote + if (end > start) { + return end; + } + end = text.lastIndexOf('\r'); + if (end > start) { + return end; + } + //if nothing so far, take the full length of the string + //If the full string is > 256 characters, it appears + //that the url 
is truncated in the .doc file. This + //will return the value as it is in the file, which + //may be incorrect; but it is the same behavior as opening + //the link in MSWord. + //This code does not currently check that length is actually >= 256. + //we might want to add that? + return text.length(); + } + + private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { + if (!isRendered(cr) || picture == null) { + // Oh dear, we've run out... + // Probably caused by multiple \u0008 images referencing + // the same real image + return; + } + + // Which one is it? + String extension = picture.suggestFileExtension(); + int pictureNumber = pictures.pictureNumber(picture); + + // Make up a name for the picture + // There isn't one in the file, but we need to be able to reference + // the picture from the img tag and the embedded resource + String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); + + // Grab the mime type for the picture + String mimeType = picture.getMimeType(); + + // Output the img tag + AttributesImpl attr = new AttributesImpl(); + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); + attr.addAttribute("", "alt", "alt", "CDATA", filename); + xhtml.startElement("img", attr); + xhtml.endElement("img"); + + // Have we already output this one? + // (Only expose each individual image once) + if (!pictures.hasOutput(picture)) { + TikaInputStream stream = TikaInputStream.get(picture.getContent()); + handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); + pictures.recordOutput(picture); + } + } + + /** + * Outputs a section of text if the given text is non-empty. 
+ * + * @param xhtml XHTML content handler + * @param section the class of the <div/> section emitted + * @param text text to be emitted, if any + * @throws SAXException if an error occurs + */ + private void addTextIfAny( + XHTMLContentHandler xhtml, String section, String text) + throws SAXException { + if (text != null && text.length() > 0) { + xhtml.startElement("div", "class", section); + xhtml.element("p", text); + xhtml.endElement("div"); + } + } + + protected void parseWord6( + NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + parseWord6(filesystem.getRoot(), xhtml); + } + + protected void parseWord6( + DirectoryNode root, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + HWPFOldDocument doc = new HWPFOldDocument(root); + Word6Extractor extractor = new Word6Extractor(doc); + + for (String p : extractor.getParagraphText()) { + xhtml.element("p", p); + } + } + + /** + * Determines if character run should be included in the extraction. + * + * @param cr character run. + * @return true if character run should be included in extraction. 
+ */ + private boolean isRendered(final CharacterRun cr) { + return cr == null || !cr.isMarkedDeleted(); + } + + public static class TagAndStyle { + private String tag; + private String styleClass; + + public TagAndStyle(String tag, String styleClass) { + this.tag = tag; + this.styleClass = styleClass; + } + + public String getTag() { + return tag; + } + + public String getStyleClass() { + return styleClass; + } + + public boolean isHeading() { + return tag.length() == 2 && tag.startsWith("h"); + } + } + + /** + * Provides access to the pictures both by offset, iteration + * over the un-claimed, and peeking forward + */ + private static class PicturesSource { + private PicturesTable picturesTable; + private Set<Picture> output = new HashSet<Picture>(); + private Map<Integer, Picture> lookup; + private List<Picture> nonU1based; + private List<Picture> all; + private int pn = 0; + + private PicturesSource(HWPFDocument doc) { + picturesTable = doc.getPicturesTable(); + all = picturesTable.getAllPictures(); + + // Build the Offset-Picture lookup map + lookup = new HashMap<Integer, Picture>(); + for (Picture p : all) { + lookup.put(p.getStartOffset(), p); + } + + // Work out which Pictures aren't referenced by + // a \u0001 in the main text + // These are \u0008 escher floating ones, ones + // found outside the normal text, and who + // knows what else... 
+ nonU1based = new ArrayList<Picture>(); + nonU1based.addAll(all); + Range r = doc.getRange(); + for (int i = 0; i < r.numCharacterRuns(); i++) { + CharacterRun cr = r.getCharacterRun(i); + if (picturesTable.hasPicture(cr)) { + Picture p = getFor(cr); + int at = nonU1based.indexOf(p); + nonU1based.set(at, null); + } + } + } + + private boolean hasPicture(CharacterRun cr) { + return picturesTable.hasPicture(cr); + } + + private void recordOutput(Picture picture) { + output.add(picture); + } + + private boolean hasOutput(Picture picture) { + return output.contains(picture); + } + + private int pictureNumber(Picture picture) { + return all.indexOf(picture) + 1; + } + + private Picture getFor(CharacterRun cr) { + return lookup.get(cr.getPicOffset()); + } + + /** + * Return the next unclaimed one, used towards + * the end + */ + private Picture nextUnclaimed() { + Picture p = null; + while (pn < nonU1based.size()) { + p = nonU1based.get(pn); + pn++; + if (p != null) return p; + } + return null; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.util.List; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import 
org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Base class for all Tika OOXML extractors. + * <p/> + * Tika extractors decorate POI extractors so that the parsed content of + * documents is returned as a sequence of XHTML SAX events. Subclasses must + * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that + * populates the {@link XHTMLContentHandler} object received as parameter. + */ +public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { + static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio"; + static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"; + static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject"; + static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package"; + + private static final String TYPE_OLE_OBJECT = + "application/vnd.openxmlformats-officedocument.oleObject"; + private final EmbeddedDocumentExtractor embeddedExtractor; + protected POIXMLTextExtractor extractor; + + public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { + this.extractor = extractor; + + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + + if (ex == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } else { + embeddedExtractor = ex; + } + + } + + /** + * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument() + */ + public POIXMLDocument getDocument() { + return extractor.getDocument(); + } + + /** + * @see 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor() + */ + public MetadataExtractor getMetadataExtractor() { + return new MetadataExtractor(extractor); + } + + /** + * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata) + */ + public void getXHTML( + ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException, XmlException, IOException, TikaException { + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + buildXHTML(xhtml); + + // Now do any embedded parts + handleEmbeddedParts(handler); + + // thumbnail + handleThumbnail(handler); + + xhtml.endDocument(); + } + + protected String getJustFileName(String desc) { + int idx = desc.lastIndexOf('/'); + if (idx != -1) { + desc = desc.substring(idx + 1); + } + idx = desc.lastIndexOf('.'); + if (idx != -1) { + desc = desc.substring(0, idx); + } + + return desc; + } + + private void handleThumbnail(ContentHandler handler) { + try { + OPCPackage opcPackage = extractor.getPackage(); + for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) { + PackagePart tPart = opcPackage.getPart(rel); + InputStream tStream = tPart.getInputStream(); + Metadata thumbnailMetadata = new Metadata(); + String thumbName = tPart.getPartName().getName(); + thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName); + + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded"); + attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName); + handler.startElement(XHTML, "div", "div", attributes); + handler.endElement(XHTML, "div", "div"); + + thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName); + thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType()); + thumbnailMetadata.set(TikaCoreProperties.TITLE, 
tPart.getPartName().getName()); + + if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) { + embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false); + } + + tStream.close(); + } + } catch (Exception ex) { + + } + } + + private void handleEmbeddedParts(ContentHandler handler) + throws TikaException, IOException, SAXException { + try { + for (PackagePart source : getMainDocumentParts()) { + for (PackageRelationship rel : source.getRelationships()) { + + URI sourceURI = rel.getSourceURI(); + String sourceDesc; + if (sourceURI != null) { + sourceDesc = getJustFileName(sourceURI.getPath()); + if (sourceDesc.startsWith("slide")) { + sourceDesc += "_"; + } else { + sourceDesc = ""; + } + } else { + sourceDesc = ""; + } + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePart target; + + try { + target = source.getRelatedPart(rel); + } catch (IllegalArgumentException ex) { + continue; + } + + String type = rel.getRelationshipType(); + if (RELATION_OLE_OBJECT.equals(type) + && TYPE_OLE_OBJECT.equals(target.getContentType())) { + handleEmbeddedOLE(target, handler, sourceDesc + rel.getId()); + } else if (RELATION_AUDIO.equals(type) + || RELATION_IMAGE.equals(type) + || RELATION_PACKAGE.equals(type) + || RELATION_OLE_OBJECT.equals(type)) { + handleEmbeddedFile(target, handler, sourceDesc + rel.getId()); + } + } + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + + /** + * Handles an embedded OLE object in the document + */ + private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) + throws IOException, SAXException { + // A POIFSFileSystem needs to be at least 3 blocks big to be valid + if (part.getSize() >= 0 && part.getSize() < 512 * 3) { + // Too small, skip + return; + } + + // Open the POIFS (OLE2) structure and process + POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); + try { + 
Metadata metadata = new Metadata(); + TikaInputStream stream = null; + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); + + DirectoryNode root = fs.getRoot(); + POIFSDocumentType type = POIFSDocumentType.detectType(root); + + if (root.hasEntry("CONTENTS") + && root.hasEntry("\u0001Ole") + && root.hasEntry("\u0001CompObj") + && root.hasEntry("\u0003ObjInfo")) { + // TIKA-704: OLE 2.0 embedded non-Office document? + stream = TikaInputStream.get( + fs.createDocumentInputStream("CONTENTS")); + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + stream, new EmbeddedContentHandler(handler), + metadata, false); + } + } else if (POIFSDocumentType.OLE10_NATIVE == type) { + // TIKA-704: OLE 1.0 embedded document + Ole10Native ole = + Ole10Native.createFromEmbeddedOleObject(fs); + if (ole.getLabel() != null) { + metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); + } + byte[] data = ole.getDataBuffer(); + if (data != null) { + stream = TikaInputStream.get(data); + } + + if (stream != null + && embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + stream, new EmbeddedContentHandler(handler), + metadata, false); + } + } else { + handleEmbeddedFile(part, handler, rel); + } + } catch (FileNotFoundException e) { + // There was no CONTENTS entry, so skip this part + } catch (Ole10NativeException e) { + // Could not process an OLE 1.0 entry, so skip this part + } + } + + /** + * Handles an embedded file in the document + */ + protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) + throws SAXException, IOException { + Metadata metadata = new Metadata(); + metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); + + // Get the name + String name = part.getPartName().getName(); + metadata.set( + Metadata.RESOURCE_NAME_KEY, + name.substring(name.lastIndexOf('/') + 1)); + + // Get the content type + metadata.set( + Metadata.CONTENT_TYPE, part.getContentType()); + + // Call 
the recursing handler + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + TikaInputStream.get(part.getInputStream()), + new EmbeddedContentHandler(handler), + metadata, false); + } + } + + /** + * Populates the {@link XHTMLContentHandler} object received as parameter. + */ + protected abstract void buildXHTML(XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException; + + /** + * Return a list of the main parts of the document, used + * when searching for embedded resources. + * This should be all the parts of the document that end + * up with things embedded into them. + */ + protected abstract List<PackagePart> getMainDocumentParts() + throws TikaException; +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.math.BigDecimal; +import java.util.Date; + +import org.apache.poi.POIXMLProperties.CoreProperties; +import org.apache.poi.POIXMLProperties.CustomProperties; +import org.apache.poi.POIXMLProperties.ExtendedProperties; +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart; +import org.apache.poi.openxml4j.util.Nullable; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.MSOffice; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.PagedText; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.microsoft.SummaryExtractor; +import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty; +import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; + +/** + * OOXML metadata extractor. + * <p/> + * Currently POI doesn't support metadata extraction for OOXML. 
+ * + * @see OOXMLExtractor#getMetadataExtractor() + */ +public class MetadataExtractor { + + private final POIXMLTextExtractor extractor; + + public MetadataExtractor(POIXMLTextExtractor extractor) { + this.extractor = extractor; + } + + public void extract(Metadata metadata) throws TikaException { + if (extractor.getDocument() != null || + (extractor instanceof XSSFEventBasedExcelExtractor && + extractor.getPackage() != null)) { + extractMetadata(extractor.getCoreProperties(), metadata); + extractMetadata(extractor.getExtendedProperties(), metadata); + extractMetadata(extractor.getCustomProperties(), metadata); + } + } + + private void extractMetadata(CoreProperties properties, Metadata metadata) { + PackagePropertiesPart propsHolder = properties + .getUnderlyingProperties(); + + addProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty()); + addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder + .getContentStatusProperty()); + addProperty(metadata, TikaCoreProperties.CREATED, propsHolder + .getCreatedProperty()); + addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder + .getCreatorProperty()); + addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder + .getDescriptionProperty()); + addProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder + .getIdentifierProperty()); + addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder + .getKeywordsProperty()); + addProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder + .getLanguageProperty()); + addProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder + .getLastModifiedByProperty()); + addProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder + .getLastPrintedProperty()); + addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder + .getModifiedProperty()); + addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder + .getModifiedProperty()); + addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder + 
.getRevisionProperty()); + // TODO: Move to OO subject in Tika 2.0 + addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, + propsHolder.getSubjectProperty()); + addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty()); + addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty()); + + // Legacy Tika-1.0 style stats + // TODO Remove these in Tika 2.0 + addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty()); + addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder + .getContentStatusProperty()); + addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder + .getRevisionProperty()); + addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty()); + } + + private void extractMetadata(ExtendedProperties properties, + Metadata metadata) { + CTProperties propsHolder = properties.getUnderlyingProperties(); + + addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); + addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion()); + addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany()); + addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany()); + SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager()); + addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes()); + addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); + addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate()); + addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime()); + + if (propsHolder.getPages() > 0) { + metadata.set(PagedText.N_PAGES, propsHolder.getPages()); + } else if (propsHolder.getSlides() > 0) { + metadata.set(PagedText.N_PAGES, propsHolder.getSlides()); + } + + // Process the document statistics + addProperty(metadata, Office.PAGE_COUNT, 
propsHolder.getPages()); + addProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides()); + addProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs()); + addProperty(metadata, Office.LINE_COUNT, propsHolder.getLines()); + addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords()); + addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters()); + addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); + + // Legacy Tika-1.0 style stats + // TODO Remove these in Tika 2.0 + addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication()); + addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion()); + addProperty(metadata, Metadata.MANAGER, propsHolder.getManager()); + addProperty(metadata, Metadata.NOTES, propsHolder.getNotes()); + addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); + addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate()); + addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime()); + addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages()); + addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides()); + addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs()); + addProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines()); + addProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords()); + addProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters()); + addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); + } + + private void extractMetadata(CustomProperties properties, + Metadata metadata) { + org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties + props = properties.getUnderlyingProperties(); + for (int i = 0; i < props.sizeOfPropertyArray(); i++) { + CTProperty property = props.getPropertyArray(i); + String val = null; + Date date = 
null; + + if (property.isSetLpwstr()) { + val = property.getLpwstr(); + } else if (property.isSetLpstr()) { + val = property.getLpstr(); + } else if (property.isSetDate()) { + date = property.getDate().getTime(); + } else if (property.isSetFiletime()) { + date = property.getFiletime().getTime(); + } else if (property.isSetBool()) { + val = Boolean.toString(property.getBool()); + } + + // Integers + else if (property.isSetI1()) { + val = Integer.toString(property.getI1()); + } else if (property.isSetI2()) { + val = Integer.toString(property.getI2()); + } else if (property.isSetI4()) { + val = Integer.toString(property.getI4()); + } else if (property.isSetI8()) { + val = Long.toString(property.getI8()); + } else if (property.isSetInt()) { + val = Integer.toString(property.getInt()); + } + + // Unsigned Integers + else if (property.isSetUi1()) { + val = Integer.toString(property.getUi1()); + } else if (property.isSetUi2()) { + val = Integer.toString(property.getUi2()); + } else if (property.isSetUi4()) { + val = Long.toString(property.getUi4()); + } else if (property.isSetUi8()) { + val = property.getUi8().toString(); + } else if (property.isSetUint()) { + val = Long.toString(property.getUint()); + } + + // Reals + else if (property.isSetR4()) { + val = Float.toString(property.getR4()); + } else if (property.isSetR8()) { + val = Double.toString(property.getR8()); + } else if (property.isSetDecimal()) { + BigDecimal d = property.getDecimal(); + if (d == null) { + val = null; + } else { + val = d.toPlainString(); + } + } else if (property.isSetArray()) { + // TODO Fetch the array values and output + } else if (property.isSetVector()) { + // TODO Fetch the vector values and output + } else if (property.isSetBlob() || property.isSetOblob()) { + // TODO Decode, if possible + } else if (property.isSetStream() || property.isSetOstream() || + property.isSetVstream()) { + // TODO Decode, if possible + } else if (property.isSetStorage() || property.isSetOstorage()) { + // TODO 
Decode, if possible + } else { + // This type isn't currently supported yet, skip the property + } + + String propName = "custom:" + property.getName(); + if (date != null) { + Property tikaProp = Property.externalDate(propName); + metadata.set(tikaProp, date); + } else if (val != null) { + metadata.set(propName, val); + } + } + } + + private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) { + T value = nullableValue.getValue(); + if (value != null) { + if (value instanceof Date) { + metadata.set(property, (Date) value); + } else if (value instanceof String) { + metadata.set(property, (String) value); + } else if (value instanceof Integer) { + metadata.set(property, (Integer) value); + } else if (value instanceof Double) { + metadata.set(property, (Double) value); + } + } + } + + private void addProperty(Metadata metadata, String name, Nullable<?> value) { + if (value.getValue() != null) { + addProperty(metadata, name, value.getValue().toString()); + } + } + + private void addProperty(Metadata metadata, Property property, String value) { + if (value != null) { + metadata.set(property, value); + } + } + + private void addProperty(Metadata metadata, String name, String value) { + if (value != null) { + metadata.set(name, value); + } + } + + private void addProperty(Metadata metadata, Property property, int value) { + if (value > 0) { + metadata.set(property, value); + } + } + + private void addProperty(Metadata metadata, String name, int value) { + if (value > 0) { + metadata.set(name, Integer.toString(value)); + } + } + + private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) { + if (value == null) { + return; + } + SummaryExtractor.addMulti(metadata, property, value.getValue()); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java URL: 
/**
 * Interface implemented by all Tika OOXML extractors.
 * <p/>
 * An implementation exposes the underlying POI document, a
 * {@link MetadataExtractor} for it, and the document content as XHTML SAX
 * events.
 *
 * @see org.apache.poi.POIXMLTextExtractor
 */
public interface OOXMLExtractor {

    /**
     * Returns the opened document.
     *
     * @return the POI document
     *         (NOTE(review): appears to be null for event based extractors,
     *         since callers in this package null-check it; confirm)
     * @see POIXMLTextExtractor#getDocument()
     */
    POIXMLDocument getDocument();

    /**
     * Returns the Tika-side metadata extractor for the document.
     * <p/>
     * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported
     * for OOXML by POI.
     *
     * @return the extractor used to populate Tika metadata
     */
    MetadataExtractor getMetadataExtractor();

    /**
     * Parses the document into a sequence of XHTML SAX events sent to the
     * given content handler.
     *
     * @param handler  receives the XHTML SAX events
     * @param metadata document metadata
     * @param context  the parse context
     * @throws SAXException  declared for failures reported by the handler
     * @throws XmlException  declared for failures reading the underlying XML
     * @throws IOException   declared for failures reading the document
     * @throws TikaException declared for other parse failures
     */
    void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException;
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Locale; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.poi.POIXMLDocument; +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackageAccess; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pkg.ZipContainerDetector; +import org.apache.xmlbeans.XmlException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Figures out the correct {@link OOXMLExtractor} for the supplied document and + * returns it. 
+ */ +public class OOXMLExtractorFactory { + + public static void parse( + InputStream stream, ContentHandler baseHandler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + Locale locale = context.get(Locale.class, Locale.getDefault()); + ExtractorFactory.setThreadPrefersEventExtractors(true); + + try { + OOXMLExtractor extractor; + OPCPackage pkg; + + // Locate or Open the OPCPackage for the file + TikaInputStream tis = TikaInputStream.cast(stream); + if (tis != null && tis.getOpenContainer() instanceof OPCPackage) { + pkg = (OPCPackage) tis.getOpenContainer(); + } else if (tis != null && tis.hasFile()) { + pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); + tis.setOpenContainer(pkg); + } else { + InputStream shield = new CloseShieldInputStream(stream); + pkg = OPCPackage.open(shield); + } + + // Get the type, and ensure it's one we handle + MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); + if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { + // Not a supported type, delegate to Empty Parser + EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); + return; + } + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + + // Have the appropriate OOXML text extractor picked + POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg); + + POIXMLDocument document = poiExtractor.getDocument(); + if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { + extractor = new XSSFExcelExtractorDecorator( + context, (XSSFEventBasedExcelExtractor) poiExtractor, locale); + } else if (document == null) { + throw new TikaException( + "Expecting UserModel based POI OOXML extractor with a document, but none found. 
" + + "The extractor returned was a " + poiExtractor + ); + } else if (document instanceof XMLSlideShow) { + extractor = new XSLFPowerPointExtractorDecorator( + context, (XSLFPowerPointExtractor) poiExtractor); + } else if (document instanceof XWPFDocument) { + extractor = new XWPFWordExtractorDecorator( + context, (XWPFWordExtractor) poiExtractor); + } else { + extractor = new POIXMLTextExtractorDecorator(context, poiExtractor); + } + + // Get the bulk of the metadata first, so that it's accessible during + // parsing if desired by the client (see TIKA-1109) + extractor.getMetadataExtractor().extract(metadata); + + // Extract the text, along with any in-document metadata + extractor.getXHTML(baseHandler, metadata, context); + } catch (IllegalArgumentException e) { + if (e.getMessage() != null && + e.getMessage().startsWith("No supported documents found")) { + throw new TikaException( + "TIKA-418: RuntimeException while getting content" + + " for thmx and xps file types", e); + } else { + throw new TikaException("Error creating OOXML extractor", e); + } + } catch (InvalidFormatException e) { + throw new TikaException("Error creating OOXML extractor", e); + } catch (OpenXML4JException e) { + throw new TikaException("Error creating OOXML extractor", e); + } catch (XmlException e) { + throw new TikaException("Error creating OOXML extractor", e); + + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.openxml4j.util.ZipSecureFile; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Office Open XML (OOXML) parser. 
+ */
public class OOXMLParser extends AbstractParser {
    static {
        // Turn off POI's zip bomb detection because we have our own.
        // This is a static setting on ZipSecureFile, applied once when this
        // class is initialized.
        ZipSecureFile.setMinInflateRatio(-1.0d);
    }

    /**
     * The media types returned by {@link #getSupportedTypes(ParseContext)}.
     * The set is wrapped so that it cannot be modified.
     */
    protected static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.application("x-tika-ooxml"),
                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
                    MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
                    MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
                    MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                    MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
                    MediaType.application("vnd.ms-excel.template.macroenabled.12"),
                    MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
                    MediaType.application("vnd.ms-word.document.macroenabled.12"),
                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
                    MediaType.application("vnd.ms-word.template.macroenabled.12"))));
    /**
     * We claim to support all OOXML files, but we actually don't support a small
     * number of them.
     * This list is used to decline certain formats that are not yet supported
     * by Tika and/or POI.
     * <p>
     * Note: the set is only declared here; nothing inside this class reads it.
     */
    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
                    MediaType.application("vnd.ms-xpsdocument")
            )));
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 6535995710857776481L;

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context; not consulted by this implementation
     * @return the unmodifiable {@link #SUPPORTED_TYPES} set
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an OOXML document by handing all four arguments, unchanged, to
     * {@code OOXMLExtractorFactory.parse}.
     *
     * @param stream   the document input stream
     * @param handler  the content handler passed on to the factory
     * @param metadata the metadata object passed on to the factory
     * @param context  the parse context passed on to the factory
     * @throws IOException   declared by the parser contract; propagated from the factory call
     * @throws SAXException  declared by the parser contract; propagated from the factory call
     * @throws TikaException declared by the parser contract; propagated from the factory call
     */
    @Override
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Have the OOXML file processed
        OOXMLExtractorFactory.parse(stream, handler, metadata, context);
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +
/**
 * Fallback OOXML extractor decorator that emits the document text as one
 * unstructured paragraph and reports no main document parts.
 */
public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {

    /**
     * Creates the decorator by forwarding both arguments to the superclass
     * constructor.
     *
     * @param context   the parse context, passed through unchanged
     * @param extractor the POI text extractor, passed through unchanged
     */
    public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
        super(context, extractor);
    }

    /**
     * Writes the complete document text as a single {@code p} element; no
     * document structure is reproduced.
     *
     * @param xhtml the handler that receives the paragraph
     * @throws SAXException if the handler fails while writing the element
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
        // "extractor" is not declared in this class; presumably it is the
        // extractor handed to the superclass constructor -- confirm there.
        final String wholeText = extractor.getText();
        xhtml.element("p", wholeText);
    }

    /**
     * Supplies the main document parts, of which this decorator has none.
     *
     * @return a newly created, empty list on every call
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() {
        final List<PackagePart> noParts = new ArrayList<PackagePart>();
        return noParts;
    }
}
