This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3968 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 325fda7c89e4e5cfe2eeb750069d0a98cb2d5662 Author: tballison <[email protected]> AuthorDate: Tue Feb 7 11:46:39 2023 -0500 TIKA-3968 -- extract actual embedded file names from associated EMF files in docx --- .../main/java/org/apache/tika/metadata/Office.java | 6 + .../apache/tika/parser/microsoft/EMFParser.java | 113 +++++++++++---- .../tika/parser/microsoft/HSLFExtractor.java | 1 + .../microsoft/ooxml/AbstractOOXMLExtractor.java | 91 +++++++++--- .../microsoft/ooxml/EmbeddedPartMetadata.java | 69 +++++++++ .../ooxml/XWPFWordExtractorDecorator.java | 158 ++++++++++++++++----- .../ooxml/OOXMLContainerExtractionTest.java | 2 +- .../test-documents/testWORD_EMFAndAttachments.docx | Bin 0 -> 61769 bytes .../parser/microsoft/ooxml/OOXMLParserTest.java | 56 ++++++++ 9 files changed, 417 insertions(+), 79 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 8c9243f94..aff57f701 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -168,4 +168,10 @@ public interface Office { Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate( PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-msg-client-submit-time"); + + /** + * Embedded files may have a "progID" associated with them, such as + * Word.Document.12 or AcroExch.Document.DC + */ + Property PROG_ID = Property.internalText("msoffice:progID"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java index f69975685..c82cc3e8a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java @@ -28,6 +28,7 @@ import org.apache.poi.hemf.record.emf.HemfRecordType; import org.apache.poi.hemf.record.emf.HemfText; import org.apache.poi.hemf.usermodel.HemfPicture; import org.apache.poi.util.RecordFormatException; +import org.apache.poi.util.StringUtil; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -36,6 +37,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; @@ -54,6 +56,11 @@ import org.apache.tika.sax.XHTMLContentHandler; */ public class EMFParser extends AbstractParser { + public static Property EMF_ICON_ONLY = Property.internalBoolean("emf:iconOnly"); + public static Property EMF_ICON_STRING = Property.internalText("emf:iconString"); + + private static String ICON_ONLY = "IconOnly"; + private static final MediaType MEDIA_TYPE = MediaType.image("emf"); private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf"); @@ -88,50 +95,48 @@ public class EMFParser extends AbstractParser { xhtml.startDocument(); try { HemfPicture ex = new HemfPicture(stream); - double lastY = -1; - double lastX = -1; + ParseState parseState = new ParseState(); long fudgeFactorX = 1000;//derive this from the font or frame/bounds information StringBuilder buffer = new StringBuilder(); + //iterate through the records. if you hit IconOnly in a comment + //and it is the first IconOnly, grab the string in the next comment record + //and that'll be the full name of the file. for (HemfRecord record : ex) { + parseState.isIconOnly = false; if (record.getEmfRecordType() == HemfRecordType.comment) { - HemfComment.EmfCommentData commentData = - ((HemfComment.EmfComment) record).getCommentData(); - if (commentData instanceof HemfComment.EmfCommentDataMultiformats) { - if (embeddedDocumentExtractor == null) { - embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); - } - handleMultiFormats((HemfComment.EmfCommentDataMultiformats) commentData, - xhtml, embeddedDocumentExtractor); - } else if (commentData instanceof HemfComment.EmfCommentDataWMF) { - if (embeddedDocumentExtractor == null) { - embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); - } - handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), xhtml, - embeddedDocumentExtractor); - } + handleCommentData( + ((HemfComment.EmfComment) record).getCommentData(), parseState, xhtml, context); } else if (record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) { - HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record; //change equality to delta diff; - if (lastY > -1 && lastY != extTextOutW.getReference().getY()) { + if (parseState.lastY > -1 && + parseState.lastY != extTextOutW.getReference().getY()) { xhtml.startElement("p"); xhtml.characters(buffer.toString()); xhtml.endElement("p"); buffer.setLength(0); - lastX = -1; + parseState.lastX = -1; } - if (lastX > -1 && extTextOutW.getReference().getX() - lastX > fudgeFactorX) { + if (parseState.lastX > -1 && extTextOutW.getReference().getX() - + parseState.lastX > fudgeFactorX) { buffer.append(" "); } String txt = extTextOutW.getText(); buffer.append(txt); - lastY = extTextOutW.getReference().getY(); - lastX = extTextOutW.getReference().getX(); + parseState.lastY = extTextOutW.getReference().getY(); + parseState.lastX = extTextOutW.getReference().getX(); + } + if (parseState.isIconOnly) { + parseState.lastWasIconOnly = true; + } else { + parseState.lastWasIconOnly = false; } } + if (parseState.iconOnlyString != null) { + metadata.set(EMF_ICON_ONLY, true); + metadata.set(EMF_ICON_STRING, parseState.iconOnlyString); + } if (buffer.length() > 0) { xhtml.startElement("p"); xhtml.characters(buffer.toString()); @@ -146,6 +151,53 @@ public class EMFParser extends AbstractParser { xhtml.endDocument(); } + private void handleCommentData( + HemfComment.EmfCommentData commentData, ParseState parseState, + XHTMLContentHandler xhtml, ParseContext context) + throws IOException, TikaException, SAXException { + + if (commentData instanceof HemfComment.EmfCommentDataMultiformats) { + if (parseState.extractor == null) { + parseState.extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + } + handleMultiFormats((HemfComment.EmfCommentDataMultiformats) commentData, + xhtml, parseState.extractor); + } else if (commentData instanceof HemfComment.EmfCommentDataWMF) { + if (parseState.extractor == null) { + parseState.extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + } + handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), xhtml, + parseState.extractor); + } else if (commentData instanceof HemfComment.EmfCommentDataGeneric) { + String val = + tryToReadAsString((((HemfComment.EmfCommentDataGeneric) commentData).getPrivateData())); + if (ICON_ONLY.equals(val) && parseState.hitIconOnly == false) { + parseState.hitIconOnly = true; + parseState.isIconOnly = true; + } else if (parseState.lastWasIconOnly && parseState.iconOnlyString == null) { + parseState.iconOnlyString = val; + } + } + } + + private String tryToReadAsString(byte[] bytes) { + if (bytes.length < 2) { + return null; + } + //act like this is a null terminated unicode le + int stringLen = (bytes.length - 2) / 2; + try { + return StringUtil.getFromUnicodeLE0Terminated(bytes, 0, stringLen); + } catch (SecurityException e) { + throw e; + } catch (Exception e) { + //didn't work out...oh, well + } + return null; + } + private void handleWMF(byte[] bytes, ContentHandler contentHandler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException { @@ -173,4 +225,15 @@ public class EMFParser extends AbstractParser { handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler); } } + + private static class ParseState { + double lastY = -1; + double lastX = -1; + boolean hitIconOnly = false; + boolean lastWasIconOnly = false; + boolean isIconOnly = false; + String iconOnlyString = null; + + EmbeddedDocumentExtractor extractor; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index ec64874bc..8a442383b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -573,6 +573,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { private void handleDataStream(InputStream dataStream, String objID, String progId, XHTMLContentHandler xhtml) { + //TODO -- inject progId into the metadata of the embedded file try (TikaInputStream stream = TikaInputStream.get(dataStream)) { String mediaType = null; if ("Excel.Chart.8".equals(progId)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 5a85d02cb..0493a2bd8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -23,6 +23,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.URI; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -56,6 +57,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.OfficeParser; @@ -64,6 +66,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; /** @@ -135,7 +138,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { buildXHTML(xhtml); // Now do any embedded parts - handleEmbeddedParts(xhtml, metadata); + handleEmbeddedParts(xhtml, metadata, getEmbeddedPartMetadataMap()); // thumbnail handleThumbnail(xhtml, metadata); @@ -143,6 +146,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { xhtml.endDocument(); } + protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() { + return Collections.emptyMap(); + } + protected String getJustFileName(String desc) { int idx = desc.lastIndexOf('/'); if (idx != -1) { @@ -199,7 +206,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { } } - private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata metadata) + private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata metadata, + Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap) throws TikaException, IOException, SAXException { //keep track of media items that have been handled //there can be multiple relationships pointing to the @@ -214,7 +222,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { } for (PackageRelationship rel : source.getRelationships()) { try { - handleEmbeddedPart(source, rel, xhtml, metadata, handledTarget); + handleEmbeddedPart(source, rel, xhtml, metadata, + embeddedPartMetadataMap, handledTarget); } catch (SAXException | SecurityException e) { throw e; } catch (Exception e) { @@ -229,6 +238,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { private void handleEmbeddedPart(PackagePart source, PackageRelationship rel, XHTMLContentHandler xhtml, Metadata parentMetadata, + Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap, Set<String> handledTarget) throws IOException, SAXException, TikaException, InvalidFormatException { URI targetURI = rel.getTargetURI(); @@ -260,19 +270,28 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { } catch (IllegalArgumentException ex) { return; } - + EmbeddedPartMetadata embeddedPartMetadata = embeddedPartMetadataMap.get(rel.getId()); String type = rel.getRelationshipType(); if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) && TYPE_OLE_OBJECT.equals(target.getContentType())) { - handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata); + handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata, + embeddedPartMetadata); + if (targetURI != null) { + handledTarget.add(targetURI.toString()); + } + } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) { + handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(), + embeddedPartMetadata, TikaCoreProperties.EmbeddedResourceType.INLINE); if (targetURI != null) { handledTarget.add(targetURI.toString()); } } else if (RELATION_MEDIA.equals(type) || RELATION_VIDEO.equals(type) || - RELATION_AUDIO.equals(type) || PackageRelationshipTypes.IMAGE_PART.equals(type) || + RELATION_AUDIO.equals(type) || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) { - handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId()); + handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(), + embeddedPartMetadata, + TikaCoreProperties.EmbeddedResourceType.ATTACHMENT); if (targetURI != null) { handledTarget.add(targetURI.toString()); } @@ -289,7 +308,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { * Handles an embedded OLE object in the document */ private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler xhtml, String rel, - Metadata parentMetadata) throws IOException, SAXException { + Metadata parentMetadata, + EmbeddedPartMetadata embeddedPartMetadata) throws IOException, + SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip @@ -308,6 +329,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { TikaInputStream stream = null; try { Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); @@ -315,10 +338,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { String packageEntryName = getPackageEntryName(root); if (packageEntryName != null) { - // TIKA-704: OLE 2.0 embedded non-Office document? - //TODO: figure out if the equivalent of OLE 1.0's - //getCommand() and getFileName() exist for OLE 2.0 to populate - //TikaCoreProperties.ORIGINAL_RESOURCE_NAME + //OLE 2.0 + updateMetadata(metadata, embeddedPartMetadata); stream = TikaInputStream.get(fs.createDocumentInputStream(packageEntryName)); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor @@ -348,7 +369,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { true); } } else { - handleEmbeddedFile(part, xhtml, rel); + handleEmbeddedFile(part, xhtml, rel, embeddedPartMetadata, + TikaCoreProperties.EmbeddedResourceType.ATTACHMENT); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part @@ -366,6 +388,16 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { } } + private void updateMetadata(Metadata metadata, EmbeddedPartMetadata embeddedPartMetadata) { + if (embeddedPartMetadata == null) { + return; + } + if (! StringUtils.isBlank(embeddedPartMetadata.getProgId())) { + metadata.set(Office.PROG_ID, embeddedPartMetadata.getProgId()); + } + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, embeddedPartMetadata.getFullName()); + } + private String getPackageEntryName(DirectoryNode root) { if (root.hasEntry("\u0001Ole")) { //we used to require this too: root.hasEntry("\u0001CompObj") before TIKA-3526 @@ -386,15 +418,18 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { /** * Handles an embedded file in the document */ - protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler xhtml, String rel) + protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler xhtml, + String rel, + EmbeddedPartMetadata embeddedPartMetadata, + TikaCoreProperties.EmbeddedResourceType embeddedResourceType) throws SAXException, IOException { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + embeddedResourceType.name()); // Get the name - String name = part.getPartName().getName(); - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, - name.substring(name.lastIndexOf('/') + 1)); + updateResourceName(part, embeddedPartMetadata, metadata); // Get the content type metadata.set(Metadata.CONTENT_TYPE, part.getContentType()); @@ -408,6 +443,28 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { } } + private void updateResourceName(PackagePart part, EmbeddedPartMetadata embeddedPartMetadata, + Metadata metadata) { + + if (embeddedPartMetadata != null) { + if (! StringUtils.isBlank(embeddedPartMetadata.getProgId())) { + metadata.set(Office.PROG_ID, embeddedPartMetadata.getProgId()); + } + String fullName = embeddedPartMetadata.getFullName(); + if (!StringUtils.isBlank(fullName)) { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullName); + return; + } + } + //TODO -- should we record the literal name of the embedded file? + String name = part.getPartName().getName(); + int lastSlash = name.lastIndexOf('/'); + if (lastSlash > -1) { + name = name.substring(lastSlash + 1); + } + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + } + /** * Populates the {@link XHTMLContentHandler} object received as parameter. */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java new file mode 100644 index 000000000..1e26aa1f9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +/** + * This class records metadata about embedded parts that exists in the xml + * of the main document. + */ +public class EmbeddedPartMetadata { + + private final String emfRelationshipId; + private String renderedName; + private String fullName; + + private String progId; + + //This is the rId of the EMF file that is associated with + //the embedded object + + /** + * + * @param emfRelationshipId relationship id of the EMF file + */ + public EmbeddedPartMetadata(String emfRelationshipId) { + this.emfRelationshipId = emfRelationshipId; + } + + public String getEmfRelationshipId() { + return emfRelationshipId; + } + + public String getRenderedName() { + return renderedName; + } + + public String getFullName() { + return fullName; + } + + public String getProgId() { + return progId; + } + + public void setRenderedName(String renderedName) { + this.renderedName = renderedName; + } + + public void setFullName(String fullName) { + this.fullName = fullName; + } + + public void setProgId(String progId) { + this.progId = progId; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index e5aacee15..8ad4953d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -17,13 +17,17 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; +import java.io.InputStream; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; import java.util.HashMap; import java.util.List; +import java.util.Map; import javax.xml.namespace.QName; +import com.microsoft.schemas.vml.impl.CTShapeImpl; +import org.apache.poi.ooxml.POIXMLDocumentPart; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; @@ -65,10 +69,13 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.FormattingUtils; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle; +import org.apache.tika.sax.ToTextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { @@ -90,6 +97,8 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { private XWPFStyles styles; private Metadata metadata; + private Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = new HashMap<>(); + public XWPFWordExtractorDecorator(Metadata metadata, ParseContext context, XWPFWordExtractor extractor) { super(context, extractor); @@ -144,6 +153,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } + @Override + protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() { + return embeddedPartMetadataMap; + } + private void extractIBodyText(IBody bodyElement, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { @@ -209,43 +223,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } writeParagraphNumber(paragraph, listManager, xhtml); - // Output placeholder for any embedded docs: - - // TODO: replace w/ XPath/XQuery: - for (XWPFRun run : paragraph.getRuns()) { - XmlCursor c = run.getCTR().newCursor(); - c.selectPath("./*"); - while (c.toNextSelection()) { - XmlObject o = c.getObject(); - if (o instanceof CTObject) { - XmlCursor c2 = o.newCursor(); - c2.selectPath("./*"); - while (c2.toNextSelection()) { - XmlObject o2 = c2.getObject(); - - XmlObject embedAtt = o2.selectAttribute(new QName("Type")); - if (embedAtt != null && - embedAtt.getDomNode().getNodeValue().equals("Embed")) { - // Type is "Embed" - XmlObject relIDAtt = o2.selectAttribute(new QName( - "http://schemas.openxmlformats.org/officeDocument/2006/relationships", - "id")); - if (relIDAtt != null) { - String relID = relIDAtt.getDomNode().getNodeValue(); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", relID); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - } - } - } - c2.dispose(); - } - } - c.dispose(); - } + // Output placeholder for any embedded docs: + processEmbeddedObjects(paragraph.getRuns(), xhtml); // Attach bookmarks for the paragraph // (In future, we might put them in the right place, for now @@ -339,6 +319,112 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } + private void processEmbeddedObjects(List<XWPFRun> runs, XHTMLContentHandler xhtml) + throws SAXException { + // TODO: replace w/ XPath/XQuery: + for (XWPFRun run : runs) { + try (XmlCursor c = run.getCTR().newCursor()) { + c.selectPath("./*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if (o instanceof CTObject) { + try (XmlCursor objectCursor = o.newCursor()) { + processObject(objectCursor, xhtml); + } + } + } + } + } + } + + private void processObject(XmlCursor cursor, XHTMLContentHandler xhtml) throws SAXException { + + cursor.selectPath("./*"); + String objectRelId = null; + String progId = null; + EmbeddedPartMetadata embeddedPartMetadata = null; + while (cursor.toNextSelection()) { + XmlObject o2 = cursor.getObject(); + XmlObject embedAtt = o2.selectAttribute(new QName("Type")); + if (embedAtt != null && + embedAtt.getDomNode().getNodeValue().equals("Embed")) { + //TODO: get ProgID, while we're here? + // Type is "Embed" + XmlObject relIDAtt = o2.selectAttribute(new QName( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "id")); + if (relIDAtt != null) { + objectRelId = relIDAtt.getDomNode().getNodeValue(); + } + + XmlObject progIDAtt = o2.selectAttribute(new QName("ProgID")); + if (progIDAtt != null) { + progId = progIDAtt.getDomNode().getNodeValue(); + } + } else if (o2 instanceof CTShapeImpl) { + XmlObject[] imagedata = o2.selectChildren( + new QName("urn:schemas" + + "-microsoft-com:vml","imagedata")); + if (imagedata.length > 0) { + XmlObject relIDAtt = imagedata[0].selectAttribute(new QName( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "id")); + if (relIDAtt != null) { + String rid = relIDAtt.getDomNode().getNodeValue(); + embeddedPartMetadata = new EmbeddedPartMetadata(rid); + tryToParseEmbeddedName(rid, embeddedPartMetadata); + } + } + } + } + if (objectRelId == null) { + return; + } + if (! StringUtils.isBlank(progId)) { + embeddedPartMetadata.setProgId(progId); + } + + if (embeddedPartMetadata != null) { + embeddedPartMetadataMap.put(objectRelId, embeddedPartMetadata); + } + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", objectRelId); + if (!StringUtils.isBlank(embeddedPartMetadata.getFullName())) { + attributes.addAttribute("", "name", "name", "CDATA", + embeddedPartMetadata.getFullName()); + } + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + + private String tryToParseEmbeddedName(String rid, EmbeddedPartMetadata embeddedPartMetadata) { + //This tries to parse the embedded name out of a comment + //field in an emf + POIXMLDocumentPart part = document.getRelationById(rid); + if (part == null || part.getPackagePart() == null + || part.getPackagePart().getContentType() == null) { + return null; + } + PackagePart packagePart = part.getPackagePart(); + if ("image/x-emf".equals(packagePart.getContentType())) { + try (InputStream is = packagePart.getInputStream()) { + EMFParser p = new EMFParser(); + Metadata m = new Metadata(); + ParseContext pc = new ParseContext(); + ToTextContentHandler toTextContentHandler = new ToTextContentHandler(); + p.parse(is, toTextContentHandler, m, pc); + embeddedPartMetadata.setRenderedName(toTextContentHandler.toString().trim()); + embeddedPartMetadata.setFullName(m.get(EMFParser.EMF_ICON_STRING)); + } catch (SecurityException e) { + throw e; + } catch (Exception e) { + //we tried + } + } + return null; + } + private void writeParagraphNumber(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException { if (paragraph.getNumIlvl() == null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java index 21e87b3bd..dfe86f204 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java @@ -300,7 +300,7 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction assertEquals("image1.emf", handler.filenames.get(0)); assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); - assertNull(handler.filenames.get(1)); + assertEquals("Acrobat Document", handler.filenames.get(1)); assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx new file mode 100644 index 000000000..86e4b4541 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index b1c63cd72..65f14f169 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.microsoft.ooxml; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + import java.util.List; import org.junit.jupiter.api.Disabled; @@ -23,8 +26,10 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; public class OOXMLParserTest extends TikaTest { @@ -60,6 +65,57 @@ public class OOXMLParserTest extends TikaTest { assertContains("Hello World", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testEMFAssociatedWithAttachments() throws Exception { + //TIKA-3968 + List<Metadata> metadataList = getRecursiveMetadata("testWORD_EMFAndAttachments.docx"); + + assertEquals("true", metadataList.get(1).get(EMFParser.EMF_ICON_ONLY)); + assertEquals("true", metadataList.get(3).get(EMFParser.EMF_ICON_ONLY)); + assertEquals("true", metadataList.get(5).get(EMFParser.EMF_ICON_ONLY)); + assertEquals("TestText.txt", metadataList.get(1).get(EMFParser.EMF_ICON_STRING)); + assertEquals("TestPdf.pdf", metadataList.get(3).get(EMFParser.EMF_ICON_STRING)); + assertEquals("testWORD123.docx", metadataList.get(5).get(EMFParser.EMF_ICON_STRING)); + + assertNull(metadataList.get(2).get(Office.PROG_ID)); + assertEquals("AcroExch.Document.DC", metadataList.get(4).get(Office.PROG_ID)); + assertEquals("Word.Document.12", metadataList.get(6).get(Office.PROG_ID)); + + assertEquals("TestText.txt", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("TestPdf.pdf", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("testWORD123.docx", metadataList.get(6).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + + assertEquals("/TestText.txt", + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/TestPdf.pdf", + metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/testWORD123.docx", + metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + + assertContains("This is Text File", + metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); + + assertContains("This is test PDF document for parser.", + metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT)); + + assertContains("This is test word document for parser.", + metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT)); + + assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), + metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(), + metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + + assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), + metadataList.get(3).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(), + metadataList.get(5).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + @Disabled("TODO figure out why this doesn't work") @Test//(expected = org.apache.tika.exception.TikaException.class) public void testCorruptedZip() throws Exception {
