This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 82f26b63dc7aa6ccc23a25e57377ab9d851db448 Author: Tim Allison <[email protected]> AuthorDate: Tue Feb 3 06:32:51 2026 -0500 TIKA-4646 -- extract hyperlinks from instrText and other areas in ooxml(#2578) (cherry picked from commit bef2d336b1e4e52e3ca262d656f93ee4d3145b5f) --- .../main/java/org/apache/tika/metadata/Office.java | 51 +++ .../microsoft/ooxml/AbstractOOXMLExtractor.java | 26 ++ .../microsoft/ooxml/FieldHyperlinkTracker.java | 168 +++++++++ .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 25 ++ .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 187 +++++++++- .../ooxml/SXWPFWordExtractorDecorator.java | 179 +++++++++- .../ooxml/XSSFExcelExtractorDecorator.java | 390 +++++++++++++++++++++ .../ooxml/XWPFWordExtractorDecorator.java | 95 ++++- .../xslf/XSLFEventBasedPowerPointExtractor.java | 5 + .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 5 + .../tika/parser/microsoft/ExcelParserTest.java | 43 +++ .../parser/microsoft/ooxml/OOXMLParserTest.java | 39 +++ .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 109 ++++++ .../parser/microsoft/pst/OutlookPSTParserTest.java | 3 + .../test-documents/testAttachedTemplate.docx | Bin 0 -> 2284 bytes .../test-documents/testDataConnections.xlsx | Bin 0 -> 2967 bytes .../test/resources/test-documents/testDdeLink.xlsx | Bin 0 -> 3030 bytes .../resources/test-documents/testExternalRefs.docx | Bin 0 -> 2125 bytes .../resources/test-documents/testFrameset.docx | Bin 0 -> 2328 bytes .../resources/test-documents/testHoverAndVml.docx | Bin 0 -> 2270 bytes .../resources/test-documents/testInstrLink.docx | Bin 0 -> 14464 bytes .../resources/test-documents/testMailMerge.docx | Bin 0 -> 2306 bytes .../resources/test-documents/testSubdocument.docx | Bin 0 -> 1980 bytes 23 files changed, 1322 insertions(+), 3 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 39607445f6..6e9a20e70b 100644 --- 
a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -201,8 +201,59 @@ public interface Office { Property WORKBOOK_CODENAME = Property.internalText("msoffice:excel:workbook-codename"); + Property HAS_DATA_CONNECTIONS = Property.internalBoolean("msoffice:excel:has-data-connections"); + + Property HAS_EXTERNAL_LINKS = Property.internalBoolean("msoffice:excel:has-external-links"); + + Property HAS_WEB_QUERIES = Property.internalBoolean("msoffice:excel:has-web-queries"); + + Property HAS_EXTERNAL_OLE_OBJECTS = Property.internalBoolean("msoffice:has-external-ole-objects"); + + Property HAS_FIELD_HYPERLINKS = Property.internalBoolean("msoffice:has-field-hyperlinks"); + + Property HAS_HOVER_HYPERLINKS = Property.internalBoolean("msoffice:has-hover-hyperlinks"); + + Property HAS_VML_HYPERLINKS = Property.internalBoolean("msoffice:has-vml-hyperlinks"); + Property HAS_COMMENTS = Property.internalBoolean("msoffice:has-comments"); Property COMMENT_PERSONS = Property.internalTextBag("msoffice:comment-person-display-name"); + Property HAS_HIDDEN_SLIDES = Property.internalBoolean("msoffice:ppt:has-hidden-slides"); + + Property NUM_HIDDEN_SLIDES = Property.internalInteger("msoffice:ppt:num-hidden-slides"); + + Property HAS_ANIMATIONS = Property.internalBoolean("msoffice:ppt:has-animations"); + + //w:vanish or isVanish or isFldVanish + Property HAS_HIDDEN_TEXT = Property.internalBoolean("msoffice:doc:has-hidden-text"); + + Property HAS_TRACK_CHANGES = Property.internalBoolean("msoffice:has-track-changes"); + + // Security-relevant: DDE (Dynamic Data Exchange) links can execute commands + Property HAS_DDE_LINKS = Property.internalBoolean("msoffice:excel:has-dde-links"); + + // Security-relevant: Mail merge can reference external data sources + Property HAS_MAIL_MERGE = Property.internalBoolean("msoffice:doc:has-mail-merge"); + + // Security-relevant: Attached templates can be fetched from external URLs + 
Property HAS_ATTACHED_TEMPLATE = Property.internalBoolean("msoffice:doc:has-attached-template"); + + // Security-relevant: SubDocuments reference external documents in master docs + Property HAS_SUBDOCUMENTS = Property.internalBoolean("msoffice:doc:has-subdocuments"); + + // Security-relevant: Pivot tables can reference external OLAP/database sources + Property HAS_EXTERNAL_PIVOT_DATA = Property.internalBoolean("msoffice:excel:has-external-pivot-data"); + + // Security-relevant: Power Query can contain URLs and connection strings + Property HAS_POWER_QUERY = Property.internalBoolean("msoffice:excel:has-power-query"); + + // Security-relevant: OLE objects can link to external files (vs embedded) + Property HAS_LINKED_OLE_OBJECTS = Property.internalBoolean("msoffice:has-linked-ole-objects"); + + // Security-relevant: Charts can reference external workbook data + Property HAS_EXTERNAL_CHART_DATA = Property.internalBoolean("msoffice:has-external-chart-data"); + + // Security-relevant: Framesets can load external URLs + Property HAS_FRAMESETS = Property.internalBoolean("msoffice:doc:has-framesets"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index c7cfecfa49..a8d65cd895 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -265,6 +265,16 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { sourceDesc = ""; } if 
(rel.getTargetMode() != TargetMode.INTERNAL) { + // External target - emit as external reference for security analysis + String type = rel.getRelationshipType(); + if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) { + emitExternalRef(xhtml, "externalOleObject", targetURI.toString()); + parentMetadata.set(Office.HAS_EXTERNAL_OLE_OBJECTS, true); + } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) { + emitExternalRef(xhtml, "externalImage", targetURI.toString()); + } else { + emitExternalRef(xhtml, "externalResource", targetURI.toString()); + } return; } PackagePart target; @@ -489,6 +499,22 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); } + /** + * Emits an external reference as an anchor element with appropriate class. + * Used for detecting external resources that could be security risks. + */ + private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url) + throws SAXException { + if (url == null || url.isEmpty()) { + return; + } + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType); + attrs.addAttribute("", "href", "href", "CDATA", url); + xhtml.startElement("a", attrs); + xhtml.endElement("a"); + } + /** * Populates the {@link XHTMLContentHandler} object received as parameter. 
*/ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java new file mode 100644 index 0000000000..951711d99f --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Tracks field hyperlink state across multiple runs within a paragraph. + * Field codes span multiple runs: begin -> instrText -> separate -> text runs -> end + * <p> + * This class handles HYPERLINK field codes as well as other external references + * like INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK. 
+ */ +class FieldHyperlinkTracker { + + // Patterns for extracting URLs from field codes + private static final Pattern HYPERLINK_PATTERN = + Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDEPICTURE_PATTERN = + Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDETEXT_PATTERN = + Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern IMPORT_PATTERN = + Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern LINK_PATTERN = + Pattern.compile("LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", + Pattern.CASE_INSENSITIVE); + + private boolean inField = false; + private boolean inFieldHyperlink = false; + private final StringBuilder instrTextBuffer = new StringBuilder(); + private String lastExternalRefType = null; + private String lastExternalRefUrl = null; + + void startField() { + inField = true; + instrTextBuffer.setLength(0); + lastExternalRefType = null; + lastExternalRefUrl = null; + } + + void addInstrText(String text) { + if (inField && text != null) { + instrTextBuffer.append(text); + } + } + + /** + * Called when fldChar separate is encountered. 
+ * + * @return the hyperlink URL if this is a HYPERLINK field, null otherwise + */ + String separate() { + if (inField) { + String url = parseHyperlinkFromInstrText(instrTextBuffer.toString()); + if (url != null) { + inFieldHyperlink = true; + return url; + } + // Check for other external refs (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK) + StringBuilder fieldType = new StringBuilder(); + String extUrl = parseExternalRefFromInstrText(instrTextBuffer.toString(), fieldType); + if (extUrl != null) { + lastExternalRefType = fieldType.toString(); + lastExternalRefUrl = extUrl; + } + } + return null; + } + + void endField() { + inField = false; + inFieldHyperlink = false; + instrTextBuffer.setLength(0); + lastExternalRefType = null; + lastExternalRefUrl = null; + } + + boolean isInFieldHyperlink() { + return inFieldHyperlink; + } + + String getLastExternalRefType() { + return lastExternalRefType; + } + + String getLastExternalRefUrl() { + return lastExternalRefUrl; + } + + void clearExternalRef() { + lastExternalRefType = null; + lastExternalRefUrl = null; + } + + /** + * Parses a HYPERLINK URL from instrText field code content. + * + * @param instrText the accumulated instrText content + * @return the URL if found, or null + */ + private static String parseHyperlinkFromInstrText(String instrText) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); + if (m.find()) { + return m.group(1); + } + return null; + } + + /** + * Parses external reference URLs from instrText field codes + * (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK). 
+ * + * @param instrText the accumulated instrText content + * @param fieldType output parameter - will contain the field type if found + * @return the URL if found, or null + */ + private static String parseExternalRefFromInstrText(String instrText, StringBuilder fieldType) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + String trimmed = instrText.trim(); + + Matcher m = INCLUDEPICTURE_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDEPICTURE"); + return m.group(1); + } + + m = INCLUDETEXT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDETEXT"); + return m.group(1); + } + + m = IMPORT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("IMPORT"); + return m.group(1); + } + + m = LINK_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("LINK"); + return m.group(1); + } + + return null; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index 8ff630da14..4bc445fb5e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -321,6 +321,19 @@ public class OOXMLTikaBodyPartHandler xhtml.endElement("div"); } + @Override + public void linkedOLERef(String relId) throws SAXException { + if (relId == null) { + return; + } + // Emit as an external reference anchor - linked OLE objects reference external files + AttributesImpl attributes = new 
AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "external-ref-linkedOle"); + attributes.addAttribute("", "id", "id", "CDATA", relId); + xhtml.startElement("a", attributes); + xhtml.endElement("a"); + } + @Override public void embeddedPicRef(String picFileName, String picDescription) throws SAXException { @@ -338,6 +351,18 @@ public class OOXMLTikaBodyPartHandler } + @Override + public void externalRef(String fieldType, String url) throws SAXException { + if (url == null || url.isEmpty()) { + return; + } + AttributesImpl attr = new AttributesImpl(); + attr.addAttribute("", "class", "class", "CDATA", "external-ref-" + fieldType); + attr.addAttribute("", "href", "href", "CDATA", url); + xhtml.startElement("a", attr); + xhtml.endElement("a"); + } + @Override public void startBookmark(String id, String name) throws SAXException { //skip bookmarks within hyperlinks diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index 6e355b8ff9..9e7110f773 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -19,12 +19,16 @@ package org.apache.tika.parser.microsoft.ooxml; import java.util.Date; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.Attributes; import 
org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.utils.DateUtils; /** @@ -106,12 +110,34 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String MOVE_TO = "moveTo"; private final static String ENDNOTE_REFERENCE = "endnoteReference"; private static final String TEXTBOX = "textbox"; + private final static String FLD_CHAR = "fldChar"; + private final static String INSTR_TEXT = "instrText"; + private final static String FLD_CHAR_TYPE = "fldCharType"; + // DrawingML hyperlinks on shapes/pictures + private final static String HLINK_HOVER = "hlinkHover"; + private final static String C_NV_PR = "cNvPr"; + // VML shape hyperlinks + private final static String SHAPE = "shape"; + private final static String HREF = "href"; + + // Patterns for extracting URLs from field codes + private static final Pattern HYPERLINK_PATTERN = + Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDEPICTURE_PATTERN = + Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern INCLUDETEXT_PATTERN = + Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern IMPORT_PATTERN = + Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); + private static final Pattern LINK_PATTERN = + Pattern.compile("LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); private final XWPFBodyContentsHandler bodyContentsHandler; private final Map<String, String> linkedRelationships; private final RunProperties currRunProperties = new RunProperties(); private final ParagraphProperties currPProperties = new ParagraphProperties(); private final boolean includeTextBox; private final boolean 
concatenatePhoneticRuns; + private final Metadata metadata; private final StringBuilder runBuffer = new StringBuilder(); private final StringBuilder rubyBuffer = new StringBuilder(); private boolean inR = false; @@ -143,22 +169,34 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private boolean inHlinkClick = false; private boolean inTextBox = false; private boolean inV = false; //in c:v in chart file + // Field code tracking for instrText-based hyperlinks + private boolean inField = false; + private boolean inInstrText = false; + private boolean inFieldHyperlink = false; + private final StringBuilder instrTextBuffer = new StringBuilder(); private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE; private DateUtils dateUtils = new DateUtils(); public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks) { - this(bodyContentsHandler, hyperlinks, true, true); + this(bodyContentsHandler, hyperlinks, true, true, null); } public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, Map<String, String> hyperlinks, boolean includeTextBox, boolean concatenatePhoneticRuns) { + this(bodyContentsHandler, hyperlinks, includeTextBox, concatenatePhoneticRuns, null); + } + + public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler, + Map<String, String> hyperlinks, boolean includeTextBox, + boolean concatenatePhoneticRuns, Metadata metadata) { this.bodyContentsHandler = bodyContentsHandler; this.linkedRelationships = hyperlinks; this.includeTextBox = includeTextBox; this.concatenatePhoneticRuns = concatenatePhoneticRuns; + this.metadata = metadata; } @Override @@ -322,6 +360,12 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } if ("Embed".equals(type)) { bodyContentsHandler.embeddedOLERef(refId); + } else if ("Link".equals(type)) { + // Linked OLE object - references 
external file + bodyContentsHandler.linkedOLERef(refId); + if (metadata != null) { + metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true); + } } } else if (CR.equals(localName)) { runBuffer.append(NEWLINE); @@ -332,6 +376,65 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { inV = true; } else if (RT.equals(localName)) { inRt = true; + } else if (FLD_CHAR.equals(localName)) { + String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE); + if ("begin".equals(fldCharType)) { + inField = true; + instrTextBuffer.setLength(0); + } else if ("separate".equals(fldCharType)) { + // Parse instrText for HYPERLINK + String url = parseHyperlinkFromInstrText(instrTextBuffer.toString()); + if (url != null) { + bodyContentsHandler.hyperlinkStart(url); + inFieldHyperlink = true; + if (metadata != null) { + metadata.set(Office.HAS_FIELD_HYPERLINKS, true); + } + } else { + // Check for external reference fields (INCLUDEPICTURE, INCLUDETEXT, etc.) + StringBuilder fieldType = new StringBuilder(); + String extUrl = parseExternalRefFromInstrText(instrTextBuffer.toString(), fieldType); + if (extUrl != null) { + bodyContentsHandler.externalRef(fieldType.toString(), extUrl); + if (metadata != null) { + metadata.set(Office.HAS_FIELD_HYPERLINKS, true); + } + } + } + } else if ("end".equals(fldCharType)) { + if (inFieldHyperlink) { + bodyContentsHandler.hyperlinkEnd(); + inFieldHyperlink = false; + } + inField = false; + instrTextBuffer.setLength(0); + } + } else if (INSTR_TEXT.equals(localName)) { + inInstrText = true; + } else if (HLINK_HOVER.equals(localName)) { + // DrawingML hover hyperlink on shapes/pictures + String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id"); + if (hyperlinkId != null) { + String hyperlink = linkedRelationships.get(hyperlinkId); + if (hyperlink != null) { + bodyContentsHandler.externalRef("hlinkHover", hyperlink); + if (metadata != null) { + metadata.set(Office.HAS_HOVER_HYPERLINKS, true); + } + } + } + } else if 
(SHAPE.equals(localName) && V_NS.equals(uri)) { + // VML shape with href attribute + String href = atts.getValue(HREF); + if (href == null) { + href = atts.getValue(O_NS, HREF); + } + if (href != null && !href.isEmpty()) { + bodyContentsHandler.externalRef("vml-shape-href", href); + if (metadata != null) { + metadata.set(Office.HAS_VML_HYPERLINKS, true); + } + } } } @@ -367,6 +470,65 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { return -1; } + /** + * Parses a HYPERLINK URL from instrText field code content. + * Field codes like: HYPERLINK "https://example.com" + * + * @param instrText the accumulated instrText content + * @return the URL if found, or null + */ + private String parseHyperlinkFromInstrText(String instrText) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); + if (m.find()) { + return m.group(1); + } + return null; + } + + /** + * Parses URLs from instrText field codes that reference external resources. + * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields. 
+ * + * @param instrText the accumulated instrText content + * @param fieldType output parameter - will contain the field type if found + * @return the URL if found, or null + */ + private String parseExternalRefFromInstrText(String instrText, StringBuilder fieldType) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + String trimmed = instrText.trim(); + + Matcher m = INCLUDEPICTURE_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDEPICTURE"); + return m.group(1); + } + + m = INCLUDETEXT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("INCLUDETEXT"); + return m.group(1); + } + + m = IMPORT_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("IMPORT"); + return m.group(1); + } + + m = LINK_PATTERN.matcher(trimmed); + if (m.find()) { + fieldType.append("LINK"); + return m.group(1); + } + + return null; + } + @Override public void endElement(String uri, String localName, String qName) throws SAXException { @@ -432,6 +594,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { inRt = false; } else if (RUBY.equals(localName)) { handleEndOfRuby(); + } else if (INSTR_TEXT.equals(localName)) { + inInstrText = false; } } @@ -489,6 +653,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (inV) { appendToBuffer(ch, start, length); appendToBuffer(TAB_CHAR, 0, 1); + } else if (inInstrText && inField) { + // Accumulate instrText content for field code parsing (e.g., HYPERLINK) + instrTextBuffer.append(ch, start, length); } } @@ -564,10 +731,28 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { void embeddedOLERef(String refId) throws SAXException; + /** + * Called when a linked (vs embedded) OLE object is found. + * These reference external files and are a security concern. 
+ */ + void linkedOLERef(String refId) throws SAXException; + void embeddedPicRef(String picFileName, String picDescription) throws SAXException; void startBookmark(String id, String name) throws SAXException; void endBookmark(String id) throws SAXException; + + /** + * Called when an external reference URL is found in a field code. + * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields, + * and DrawingML/VML hyperlinks on shapes. + * + * @param fieldType the type of field (e.g., "INCLUDEPICTURE", "hlinkHover", "vml-href") + * @param url the external URL + */ + default void externalRef(String fieldType, String url) throws SAXException { + // Default no-op implementation for backward compatibility + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 2b4c52748c..fbe16d51a2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -30,14 +30,18 @@ import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; +import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.usermodel.XWPFNumbering; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.xmlbeans.XmlException; 
+import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; @@ -69,6 +73,16 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"}; + // Relationship types for Word settings + private static final String SETTINGS_RELATION = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings"; + private static final String WEB_SETTINGS_RELATION = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings"; + private static final String ATTACHED_TEMPLATE_RELATION = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/attachedTemplate"; + private static final String SUBDOCUMENT_RELATION = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/subDocument"; + //a docx file should have one of these "main story" parts private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{XWPFRelation.DOCUMENT.getContentType(), @@ -116,6 +130,106 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { xhtml.endElement("div"); } } + + // Detect security-relevant features in main document + pps = getStoryDocumentParts(); + if (pps != null && !pps.isEmpty()) { + PackagePart mainDoc = pps.get(0); + detectSecurityFeatures(mainDoc, xhtml); + } + } + + /** + * Detects security-relevant features like mail merge, attached templates, + * subdocuments, and framesets. 
+ */ + private void detectSecurityFeatures(PackagePart documentPart, XHTMLContentHandler xhtml) { + // Check for attached template (external template reference) + try { + PackageRelationshipCollection templateRels = + documentPart.getRelationshipsByType(ATTACHED_TEMPLATE_RELATION); + if (templateRels != null && templateRels.size() > 0) { + metadata.set(Office.HAS_ATTACHED_TEMPLATE, true); + for (PackageRelationship rel : templateRels) { + if (rel.getTargetMode() == TargetMode.EXTERNAL) { + emitExternalRef(xhtml, "attachedTemplate", rel.getTargetURI().toString()); + } + } + } + } catch (InvalidFormatException | SAXException e) { + // swallow + } + + // Check for subdocuments (master document with external subdocs) + try { + PackageRelationshipCollection subDocRels = + documentPart.getRelationshipsByType(SUBDOCUMENT_RELATION); + if (subDocRels != null && subDocRels.size() > 0) { + metadata.set(Office.HAS_SUBDOCUMENTS, true); + for (PackageRelationship rel : subDocRels) { + if (rel.getTargetMode() == TargetMode.EXTERNAL) { + emitExternalRef(xhtml, "subDocument", rel.getTargetURI().toString()); + } + } + } + } catch (InvalidFormatException | SAXException e) { + // swallow + } + + // Check settings.xml for mail merge + try { + PackageRelationshipCollection settingsRels = + documentPart.getRelationshipsByType(SETTINGS_RELATION); + if (settingsRels != null && settingsRels.size() > 0) { + PackagePart settingsPart = documentPart.getRelatedPart(settingsRels.getRelationship(0)); + if (settingsPart != null) { + try (InputStream is = settingsPart.getInputStream()) { + WordSettingsHandler handler = new WordSettingsHandler(xhtml); + XMLReaderUtils.parseSAX(is, handler, context); + if (handler.hasMailMerge()) { + metadata.set(Office.HAS_MAIL_MERGE, true); + } + } + } + } + } catch (InvalidFormatException | IOException | TikaException | SAXException e) { + // swallow + } + + // Check webSettings.xml for framesets + try { + PackageRelationshipCollection webSettingsRels = + 
documentPart.getRelationshipsByType(WEB_SETTINGS_RELATION); + if (webSettingsRels != null && webSettingsRels.size() > 0) { + PackagePart webSettingsPart = documentPart.getRelatedPart(webSettingsRels.getRelationship(0)); + if (webSettingsPart != null) { + try (InputStream is = webSettingsPart.getInputStream()) { + WebSettingsHandler handler = new WebSettingsHandler(xhtml); + XMLReaderUtils.parseSAX(is, handler, context); + if (handler.hasFrameset()) { + metadata.set(Office.HAS_FRAMESETS, true); + } + } + } + } + } catch (InvalidFormatException | IOException | TikaException | SAXException e) { + // swallow + } + } + + /** + * Emits an external reference as an anchor element. + */ + private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url) + throws SAXException { + if (url == null || url.isEmpty()) { + return; + } + org.xml.sax.helpers.AttributesImpl attrs = new org.xml.sax.helpers.AttributesImpl(); + attrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType); + attrs.addAttribute("", "href", "href", "CDATA", url); + xhtml.startElement("a", attrs); + xhtml.endElement("a"); } private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) @@ -195,7 +309,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler( new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config), linkedRelationships, config.isIncludeShapeBasedContent(), - config.isConcatenatePhoneticRuns())), context); + config.isConcatenatePhoneticRuns(), metadata)), context); } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); @@ -299,4 +413,67 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } return new ArrayList<>(); } + + /** + * Handler for parsing Word settings.xml to detect mail merge and other features. 
+     */
+    private static class WordSettingsHandler extends DefaultHandler {
+        // Output handler supplied by the caller; this handler does not write to it.
+        private final XHTMLContentHandler xhtml;
+        // Set once a mailMerge element has been seen.
+        private boolean hasMailMerge = false;
+
+        WordSettingsHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Records the presence of a {@code mailMerge} element.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // Mail merge element indicates document has mail merge data source
+            if ("mailMerge".equals(localName)) {
+                hasMailMerge = true;
+            }
+            // NOTE: dataSource/query elements carry an r:id; the actual data source
+            // location is in that relationship, which is not resolved here.
+        }
+
+        /**
+         * @return true if a {@code mailMerge} element was encountered
+         */
+        boolean hasMailMerge() {
+            return hasMailMerge;
+        }
+    }
+
+    /**
+     * Handler for parsing Word webSettings.xml to detect framesets.
+     */
+    private static class WebSettingsHandler extends DefaultHandler {
+        // Output handler supplied by the caller; this handler does not write to it.
+        private final XHTMLContentHandler xhtml;
+        // Set when a frameset element, or a frame element with a non-empty src, is seen.
+        private boolean hasFrameset = false;
+
+        WebSettingsHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Flags the document as containing framesets when a {@code frameset} element,
+         * or a {@code frame} element with a non-empty {@code src} attribute, is seen.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // Frameset element indicates document contains frames
+            if ("frameset".equals(localName)) {
+                hasFrameset = true;
+            }
+            // Frame with src attribute contains URL
+            if ("frame".equals(localName)) {
+                String src = atts.getValue("src");
+                if (src != null && !src.isEmpty()) {
+                    // Frame references an external URL
+                    hasFrameset = true;
+                }
+            }
+        }
+
+        /**
+         * @return true if a frameset (or a frame with a source) was encountered
+         */
+        boolean hasFrameset() {
+            return hasFrameset;
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 873242927f..8b41630f32 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -79,6 +79,20 @@ import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+
+    // Relationship types for external data sources.
+    // Each is looked up via getRelationshipsByType: the external link, connections
+    // and pivot cache types on the workbook part, the query table type on sheet parts.
+    private static final String EXTERNAL_LINK_RELATION =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink";
+    private static final String CONNECTIONS_RELATION =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/connections";
+    private static final String QUERY_TABLE_RELATION =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/queryTable";
+    private static final String PIVOT_CACHE_DEFINITION_RELATION =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition";
+    // Power Query stores data in customData parts; this content type is what
+    // detectPowerQuery asks the package for.
+    private static final String POWER_QUERY_CONTENT_TYPE =
+            "application/vnd.ms-excel.customDataProperties+xml";
+
     /**
      * Allows access to headers/footers from raw xml strings
      */
@@ -223,6 +237,382 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             //swallow
         }
 
+        // Extract external data sources (HIGH security risk - can hide malicious URLs)
+        // Best-effort: any failure here is deliberately ignored.
+        try {
+            extractExternalDataSources(container, xhtml);
+        } catch (InvalidFormatException | TikaException | IOException | SAXException e) {
+            //swallow
+        }
+
+    }
+
+    /**
+     * Extracts
external data sources from the workbook including:
+     * - External workbook links
+     * - Data connections (database, web queries)
+     * - Query tables
+     * - Pivot cache definitions that use external data
+     * - Power Query / Data Mashup parts (detection only)
+     *
+     * Returns without doing anything if the package has no core document
+     * relationship or the workbook part cannot be obtained.
+     *
+     * @param container the package being parsed
+     * @param xhtml     handler passed on to the individual extraction steps
+     */
+    private void extractExternalDataSources(OPCPackage container, XHTMLContentHandler xhtml)
+            throws InvalidFormatException, TikaException, IOException, SAXException {
+
+        // Locate the workbook part through the package's core document relationship
+        PackageRelationship coreDocRelationship = container.getRelationshipsByType(
+                PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
+        if (coreDocRelationship == null) {
+            return;
+        }
+        PackagePart workbookPart = container.getPart(coreDocRelationship);
+        if (workbookPart == null) {
+            return;
+        }
+
+        // NOTE: the steps run in sequence; an exception thrown by one of them
+        // propagates to the caller and the remaining steps are not executed.
+
+        // Extract external workbook links
+        extractExternalLinks(workbookPart, xhtml);
+
+        // Extract connections (database, ODBC, web queries)
+        extractConnections(workbookPart, xhtml);
+
+        // Extract query tables from each sheet
+        for (PackagePart sheetPart : sheetParts) {
+            extractQueryTables(sheetPart, xhtml);
+        }
+
+        // Detect pivot cache with external data sources
+        extractPivotCacheExternalData(workbookPart, xhtml);
+
+        // Detect Power Query / Data Mashup
+        detectPowerQuery(container);
+    }
+
+    /**
+     * Detects pivot cache definitions with external data sources (OLAP, databases).
+     */
+    private void extractPivotCacheExternalData(PackagePart workbookPart, XHTMLContentHandler xhtml)
+            throws InvalidFormatException {
+        PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(PIVOT_CACHE_DEFINITION_RELATION);
+        if (coll == null || coll.isEmpty()) {
+            return;
+        }
+        // Each pivot cache part is parsed independently; a failure on one part
+        // is ignored so the remaining parts are still examined.
+        for (PackageRelationship rel : coll) {
+            try {
+                PackagePart pivotCachePart = workbookPart.getRelatedPart(rel);
+                if (pivotCachePart != null) {
+                    PivotCacheHandler handler = new PivotCacheHandler(xhtml);
+                    try (InputStream is = pivotCachePart.getInputStream()) {
+                        XMLReaderUtils.parseSAX(is, handler, parseContext);
+                    }
+                    if (handler.hasExternalData()) {
+                        metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
+                    }
+                }
+            } catch (IOException | TikaException | SAXException e) {
+                // swallow
+            }
+        }
+    }
+
+    /**
+     * Detects Power Query / Data Mashup presence.
+     * Sets {@code Office.HAS_POWER_QUERY} when the package has a part with the
+     * Power Query content type, or any part whose name contains
+     * {@code /customData/} or {@code /dataMashup}. Nothing is written to the output.
+     *
+     * @param container the package whose parts are inspected
+     */
+    private void detectPowerQuery(OPCPackage container) {
+        // Power Query data is stored in customData parts with specific content type
+        // or in xl/customData/ folder
+        try {
+            List<PackagePart> customDataParts = container.getPartsByContentType(POWER_QUERY_CONTENT_TYPE);
+            if (customDataParts != null && !customDataParts.isEmpty()) {
+                metadata.set(Office.HAS_POWER_QUERY, true);
+            }
+            // Also check for customData folder parts
+            // (runs even when the content-type check above already set the flag)
+            for (PackagePart part : container.getParts()) {
+                String partName = part.getPartName().getName();
+                if (partName.contains("/customData/") || partName.contains("/dataMashup")) {
+                    metadata.set(Office.HAS_POWER_QUERY, true);
+                    break;
+                }
+            }
+        } catch (InvalidFormatException e) {
+            // swallow
+        }
+    }
+
+    /**
+     * Extracts external workbook links from externalLink parts.
+     */
+    private void extractExternalLinks(PackagePart workbookPart, XHTMLContentHandler xhtml)
+            throws InvalidFormatException, SAXException {
+        PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(EXTERNAL_LINK_RELATION);
+        if (coll == null || coll.isEmpty()) {
+            return;
+        }
+        // The guard above guarantees at least one external link relationship,
+        // so the metadata flag can be set unconditionally.
+        metadata.set(Office.HAS_EXTERNAL_LINKS, true);
+        for (PackageRelationship rel : coll) {
+            if (rel.getTargetMode() == TargetMode.EXTERNAL) {
+                // Direct external reference
+                emitExternalRef(xhtml, "externalLink", rel.getTargetURI().toString());
+            } else {
+                // Internal part that contains external reference - parse it
+                try {
+                    PackagePart externalLinkPart = workbookPart.getRelatedPart(rel);
+                    if (externalLinkPart != null) {
+                        ExternalLinkHandler handler = new ExternalLinkHandler(xhtml);
+                        try (InputStream is = externalLinkPart.getInputStream()) {
+                            XMLReaderUtils.parseSAX(is, handler, parseContext);
+                        }
+                        if (handler.hasDdeLink()) {
+                            metadata.set(Office.HAS_DDE_LINKS, true);
+                        }
+                    }
+                } catch (IOException | TikaException e) {
+                    // swallow: a part that cannot be read or parsed is skipped.
+                    // SAXException is not caught here and propagates to the caller.
+                }
+            }
+        }
+    }
+
+    /**
+     * Extracts data connections from connections.xml.
+     */
+    private void extractConnections(PackagePart workbookPart, XHTMLContentHandler xhtml)
+            throws InvalidFormatException, SAXException {
+        PackageRelationshipCollection coll = workbookPart.getRelationshipsByType(CONNECTIONS_RELATION);
+        if (coll == null || coll.isEmpty()) {
+            return;
+        }
+        for (PackageRelationship rel : coll) {
+            try {
+                PackagePart connectionsPart = workbookPart.getRelatedPart(rel);
+                if (connectionsPart != null) {
+                    // The handler emits anchors while parsing and remembers what it saw
+                    ConnectionsHandler handler = new ConnectionsHandler(xhtml);
+                    try (InputStream is = connectionsPart.getInputStream()) {
+                        XMLReaderUtils.parseSAX(is, handler, parseContext);
+                    }
+                    if (handler.hasConnections()) {
+                        metadata.set(Office.HAS_DATA_CONNECTIONS, true);
+                    }
+                    if (handler.hasWebQueries()) {
+                        metadata.set(Office.HAS_WEB_QUERIES, true);
+                    }
+                }
+            } catch (IOException | TikaException e) {
+                // swallow: an unreadable part is skipped; SAXException propagates
+            }
+        }
+    }
+
+    /**
+     * Extracts query table external sources.
+     * Each query table part related to the sheet is parsed with a
+     * {@code QueryTableHandler}.
+     *
+     * @param sheetPart the sheet whose query table relationships are followed
+     * @param xhtml     handler passed to the query table handler
+     */
+    private void extractQueryTables(PackagePart sheetPart, XHTMLContentHandler xhtml)
+            throws InvalidFormatException, SAXException {
+        PackageRelationshipCollection coll = sheetPart.getRelationshipsByType(QUERY_TABLE_RELATION);
+        if (coll == null || coll.isEmpty()) {
+            return;
+        }
+        for (PackageRelationship rel : coll) {
+            try {
+                PackagePart queryTablePart = sheetPart.getRelatedPart(rel);
+                if (queryTablePart != null) {
+                    try (InputStream is = queryTablePart.getInputStream()) {
+                        XMLReaderUtils.parseSAX(is, new QueryTableHandler(xhtml), parseContext);
+                    }
+                }
+            } catch (IOException | TikaException e) {
+                // swallow: an unreadable part is skipped; SAXException propagates
+            }
+        }
+    }
+
+    /**
+     * Emits an external reference as an anchor element with appropriate class.
+     */
+    private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
+            throws SAXException {
+        // Nothing to emit without a target
+        if (url == null || url.isEmpty()) {
+            return;
+        }
+        org.xml.sax.helpers.AttributesImpl attrs = new org.xml.sax.helpers.AttributesImpl();
+        // class="external-ref-<refType>" lets consumers tell the reference kinds apart
+        attrs.addAttribute("", "class", "class", "CDATA", "external-ref-" + refType);
+        attrs.addAttribute("", "href", "href", "CDATA", url);
+        xhtml.startElement("a", attrs);
+        xhtml.endElement("a");
+    }
+
+    /**
+     * Handler for parsing externalLink XML to extract external workbook references.
+     * Non-static because it calls the enclosing instance's emitExternalRef.
+     */
+    private class ExternalLinkHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+        // Set once a ddeLink element has been seen.
+        private boolean foundDdeLink = false;
+
+        ExternalLinkHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Emits anchors for file, oleLink and ddeLink elements and records
+         * whether a DDE link was present.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // NOTE: externalBook elements carry an r:id; the actual URL is in that
+            // relationship, not directly in the XML, and is not resolved here.
+
+            // Look for file element with href attribute (older format)
+            if ("file".equals(localName)) {
+                String href = atts.getValue("href");
+                if (href != null && !href.isEmpty()) {
+                    emitExternalRef(xhtml, "externalWorkbook", href);
+                }
+            }
+            // Look for oleLink with r:id (OLE links to external files)
+            // The relationship id itself is emitted, prefixed with "relationship:".
+            if ("oleLink".equals(localName)) {
+                String rId = atts.getValue("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id");
+                if (rId != null) {
+                    emitExternalRef(xhtml, "oleLink", "relationship:" + rId);
+                }
+            }
+            // DDE links - security risk: can execute commands
+            if ("ddeLink".equals(localName)) {
+                foundDdeLink = true;
+                String ddeService = atts.getValue("ddeService");
+                String ddeTopic = atts.getValue("ddeTopic");
+                if (ddeService != null || ddeTopic != null) {
+                    // Emitted as "service|topic"; a missing half becomes the empty string
+                    String ddeRef = (ddeService != null ? ddeService : "") + "|" +
+                            (ddeTopic != null ? ddeTopic : "");
+                    emitExternalRef(xhtml, "ddeLink", ddeRef);
+                }
+            }
+        }
+
+        /**
+         * @return true if a {@code ddeLink} element was encountered
+         */
+        boolean hasDdeLink() {
+            return foundDdeLink;
+        }
+    }
+
+    /**
+     * Handler for parsing connections.xml to extract external data connections.
+     * Non-static because it calls the enclosing instance's emitExternalRef.
+     */
+    private class ConnectionsHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+        // Set once a connection element has been seen.
+        private boolean foundConnection = false;
+        // Set once a webPr element has been seen.
+        private boolean foundWebQuery = false;
+
+        ConnectionsHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Emits an anchor for each connection string, URL or source file found on
+         * dbPr, webPr, olapPr and textPr elements.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            if ("connection".equals(localName)) {
+                foundConnection = true;
+            }
+            // Database connection string
+            if ("dbPr".equals(localName)) {
+                String connection = atts.getValue("connection");
+                if (connection != null && !connection.isEmpty()) {
+                    emitExternalRef(xhtml, "dbConnection", connection);
+                }
+            }
+            // Web query
+            if ("webPr".equals(localName)) {
+                foundWebQuery = true;
+                String url = atts.getValue("url");
+                if (url != null && !url.isEmpty()) {
+                    emitExternalRef(xhtml, "webQuery", url);
+                }
+            }
+            // OLAP connection
+            if ("olapPr".equals(localName)) {
+                String connection = atts.getValue("connection");
+                if (connection != null && !connection.isEmpty()) {
+                    emitExternalRef(xhtml, "olapConnection", connection);
+                }
+            }
+            // Text file import
+            if ("textPr".equals(localName)) {
+                String sourceFile = atts.getValue("sourceFile");
+                if (sourceFile != null && !sourceFile.isEmpty()) {
+                    emitExternalRef(xhtml, "textFileImport", sourceFile);
+                }
+            }
+        }
+
+        /**
+         * @return true if a {@code connection} element was encountered
+         */
+        boolean hasConnections() {
+            return foundConnection;
+        }
+
+        /**
+         * @return true if a {@code webPr} element was encountered
+         */
+        boolean hasWebQueries() {
+            return foundWebQuery;
+        }
+    }
+
+    /**
+     * Handler for parsing queryTable XML to extract web query sources.
+     */
+    private class QueryTableHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+
+        QueryTableHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Currently a placeholder: nothing is extracted from query table parts.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // queryTable carries a connectionId; the connection details are in
+            // connections.xml. queryTableRefresh contains refresh settings.
+            // Neither is emitted here.
+        }
+    }
+
+    /**
+     * Handler for parsing pivotCacheDefinition XML to detect external data sources.
+     */
+    private class PivotCacheHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+        // Set once any of the indicators checked in startElement has been seen.
+        private boolean hasExternalData = false;
+
+        PivotCacheHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Flags external data when a cacheSource of type "external" or
+         * "consolidation", a worksheetSource with a relationship id, or a
+         * consolidation/rangeSets element is seen.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // cacheSource with type="external" indicates external data
+            if ("cacheSource".equals(localName)) {
+                String type = atts.getValue("type");
+                if ("external".equals(type) || "consolidation".equals(type)) {
+                    hasExternalData = true;
+                }
+            }
+            // worksheetSource can have external references
+            if ("worksheetSource".equals(localName)) {
+                String rId = atts.getValue("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id");
+                // If there's a relationship ID, it likely points to external workbook
+                if (rId != null) {
+                    hasExternalData = true;
+                }
+            }
+            // consolidation source (multiple ranges, possibly external)
+            if ("consolidation".equals(localName) || "rangeSets".equals(localName)) {
+                hasExternalData = true;
+            }
+        }
+
+        /**
+         * @return true if the parsed part showed any sign of external data
+         */
+        boolean hasExternalData() {
+            return hasExternalData;
+        }
     }
 
     private void getThreadedComments(OPCPackage container, PackagePart sheetPart,
XHTMLContentHandler xhtml) throws TikaException, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index 922cdbd01d..a44f4525d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -61,13 +61,18 @@ import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.EMFParser; import org.apache.tika.parser.microsoft.FormattingUtils; @@ -83,7 +88,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // Part 3, Step 3 private static final String 
LIST_DELIMITER = " "; - //include all parts that might have embedded objects private final static String[] MAIN_PART_RELATIONS = new String[]{XWPFRelation.HEADER.getRelation(), XWPFRelation.FOOTER.getRelation(), @@ -233,8 +237,54 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { //hyperlinks may or may not have hyperlink ids String lastHyperlinkId = null; boolean inHyperlink = false; + // Track field-based hyperlinks (using instrText/fldChar) + FieldHyperlinkTracker fieldTracker = new FieldHyperlinkTracker(); + boolean inFieldHyperlink = false; + // Do the iruns for (IRunElement run : paragraph.getIRuns()) { + // Check for field-based hyperlinks first (instrText HYPERLINK) + if (run instanceof XWPFRun) { + XWPFRun xwpfRun = (XWPFRun) run; + boolean wasInFieldHyperlink = fieldTracker.isInFieldHyperlink(); + String fieldUrl = extractFieldLinks(xwpfRun, fieldTracker); + + // If we just entered a field hyperlink, open the anchor tag + if (fieldUrl != null && !inFieldHyperlink) { + // Close any existing relationship-based hyperlink first + if (inHyperlink) { + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.endElement("a"); + inHyperlink = false; + lastHyperlinkId = null; + } + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.startElement("a", "href", fieldUrl); + inFieldHyperlink = true; + metadata.set(Office.HAS_FIELD_HYPERLINKS, true); + } + + // If we just exited a field hyperlink, close the anchor tag + if (wasInFieldHyperlink && !fieldTracker.isInFieldHyperlink() && inFieldHyperlink) { + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.endElement("a"); + inFieldHyperlink = false; + } + + // Emit any external refs (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK) as anchors + if (fieldTracker.getLastExternalRefUrl() != null) { + AttributesImpl extRefAtts = new AttributesImpl(); + extRefAtts.addAttribute("", "class", "class", "CDATA", + "external-ref-" + fieldTracker.getLastExternalRefType()); + 
extRefAtts.addAttribute("", "href", "href", "CDATA",
+                            fieldTracker.getLastExternalRefUrl());
+                    xhtml.startElement("a", extRefAtts);
+                    xhtml.endElement("a");
+                    metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+                    fieldTracker.clearExternalRef();
+                }
+            }
+
             if (run instanceof XWPFHyperlinkRun) {
                 XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
                 if (hyperlinkRun.getHyperlinkId() == null ||
@@ -278,6 +328,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         if (inHyperlink) {
             xhtml.endElement("a");
         }
+        if (inFieldHyperlink) {
+            xhtml.endElement("a");
+        }
 
         // Now do any comments for the paragraph
@@ -463,6 +516,46 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         xhtml.characters(run.getContent().getText());
     }
 
+    /**
+     * Extracts field-based hyperlinks from a run by examining fldChar and instrText elements.
+     * This handles HYPERLINK field codes that are not relationship-based.
+     * <p>
+     * The run's direct children are visited in order: a begin/end fldChar is
+     * reported to the tracker, instrText content is appended to it, and a
+     * separate fldChar ends the scan by returning {@code tracker.separate()}.
+     * <p>
+     * NOTE(review): because of that early return, any children following a
+     * "separate" fldChar in the same run are not visited - confirm that is intended.
+     *
+     * @param run the run to examine
+     * @param tracker the field hyperlink tracker maintaining state across runs
+     * @return the hyperlink URL if this run starts a hyperlink, null otherwise
+     */
+    private String extractFieldLinks(XWPFRun run, FieldHyperlinkTracker tracker) {
+        CTR ctr = run.getCTR();
+        // The cursor is closed by try-with-resources
+        try (XmlCursor cursor = ctr.newCursor()) {
+            if (cursor.toFirstChild()) {
+                do {
+                    // Match on local name only; the namespace is not checked
+                    String localName = cursor.getName().getLocalPart();
+                    if ("fldChar".equals(localName)) {
+                        XmlObject obj = cursor.getObject();
+                        if (obj instanceof CTFldChar) {
+                            CTFldChar fldChar = (CTFldChar) obj;
+                            STFldCharType.Enum fldType = fldChar.getFldCharType();
+                            if (fldType == STFldCharType.BEGIN) {
+                                tracker.startField();
+                            } else if (fldType == STFldCharType.SEPARATE) {
+                                return tracker.separate();
+                            } else if (fldType == STFldCharType.END) {
+                                tracker.endField();
+                            }
+                        }
+                    } else if ("instrText".equals(localName)) {
+                        XmlObject obj = cursor.getObject();
+                        if (obj instanceof CTText) {
+                            CTText text = (CTText) obj;
+                            tracker.addInstrText(text.getStringValue());
+                        }
+                    }
+                } while (cursor.toNextSibling());
+            }
+        }
+        // No "separate" fldChar was found in this run
+        return null;
+    }
+
     private void extractTable(XWPFTable table, XWPFListManager listManager,
                               XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 72767fa15c..2950e46be3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -201,6 +201,11 @@ public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
             //no-op
         }
 
+        @Override
+        public void linkedOLERef(String refId) {
+            //no-op
+        }
+
         @Override
         public void embeddedPicRef(String picFileName, String picDescription) {
             //no-op
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 26cda41385..2fb45ca7fd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -363,6 +363,11 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
             //no-op
         }
 
+        @Override
+        public void linkedOLERef(String refId) {
+            //no-op: linked OLE references are ignored here
+        }
+
         @Override
         public void embeddedPicRef(String picFileName, String picDescription) {
             //no-op
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index a90d79445f..1cced50eeb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -592,4 +592,47 @@ public class ExcelParserTest extends TikaTest {
         assertEquals("true", m.get(Office.HAS_COMMENTS));
         assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
     }
+
+    /**
+     * Test extraction of external data connections from XLSX files.
+     * These can be used to exfiltrate data or load malicious content.
+     */
+    @Test
+    public void testDataConnections() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testDataConnections.xlsx");
+        // First entry of the recursive metadata list (presumably the container document)
+        Metadata m = metadataList.get(0);
+        // Check metadata flags are set
+        assertEquals("true", m.get(Office.HAS_DATA_CONNECTIONS));
+        assertEquals("true", m.get(Office.HAS_WEB_QUERIES));
+
+        // Each connection kind must appear as an anchor with its own
+        // external-ref-* class and the expected target text
+        String xml = getXML("testDataConnections.xlsx").xml;
+        // Test web query extraction
+        assertContains("class=\"external-ref-webQuery\"", xml);
+        assertContains("http://example.com/data.html", xml);
+        // Test database connection extraction
+        assertContains("class=\"external-ref-dbConnection\"", xml);
+        assertContains("db.example.org", xml);
+        // Test text file import
+        assertContains("class=\"external-ref-textFileImport\"", xml);
+        assertContains("http://example.net/data.csv", xml);
+    }
+
+    /**
+     * Test detection of DDE links in Excel files.
+     * DDE (Dynamic Data Exchange) links are a security risk as they can execute commands.
+     */
+    @Test
+    public void testDdeLinks() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testDdeLink.xlsx");
+        // First entry of the recursive metadata list (presumably the container document)
+        Metadata m = metadataList.get(0);
+        // Check DDE link metadata flag is set
+        assertEquals("true", m.get(Office.HAS_DDE_LINKS));
+        // Also check external links flag since DDE is in externalLinks
+        assertEquals("true", m.get(Office.HAS_EXTERNAL_LINKS));
+
+        String xml = getXML("testDdeLink.xlsx").xml;
+        // Test DDE link extraction (service|topic format)
+        assertContains("class=\"external-ref-ddeLink\"", xml);
+        // Only the service half and the separator are checked, not the topic
+        assertContains("cmd|", xml);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 718c0e07c9..d75c7a20a8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1812,4 +1812,43 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
         String content = getText("testRecordSizeExceeded.xlsx");
         assertContains("Repetitive content pattern 3 for compression test row 1", content);
     }
+
+    /**
+     * Test extraction of field-based hyperlinks using instrText/fldChar.
+     * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks.
+     * Uses the DOM-based XWPFWordExtractorDecorator.
+     */
+    @Test
+    public void testInstrTextHyperlink() throws Exception {
+        String xml = getXML("testInstrLink.docx").xml;
+        // The document contains a HYPERLINK field code in instrText
+        // NOTE(review): the host is spelled "exmaple.com"; presumably this matches the
+        // URL stored in the test document - check the fixture before "correcting" it.
+        assertContains("<a href=\"https://exmaple.com/file\">", xml);
+        // The visible text of the field must still be extracted
+        assertContains("Access Document(s)", xml);
+    }
+
+    /**
+     * Test extraction of external reference field codes (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK).
+     * These can be used to hide malicious URLs in documents.
+     */
+    @Test
+    public void testExternalRefFieldCodes() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testExternalRefs.docx");
+        // First entry of the recursive metadata list (presumably the container document)
+        Metadata m = metadataList.get(0);
+        // Check metadata flag is set
+        assertEquals("true", m.get(Office.HAS_FIELD_HYPERLINKS));
+
+        // Each field code must appear as an anchor whose class ends in the
+        // field name, together with the URL it references
+        String xml = getXML("testExternalRefs.docx").xml;
+        // Test INCLUDEPICTURE field code
+        assertContains("class=\"external-ref-INCLUDEPICTURE\"", xml);
+        assertContains("http://example.com/tracking.png", xml);
+        // Test INCLUDETEXT field code
+        assertContains("class=\"external-ref-INCLUDETEXT\"", xml);
+        assertContains("http://example.org/payload.txt", xml);
+        // Test IMPORT field code
+        assertContains("class=\"external-ref-IMPORT\"", xml);
+        assertContains("http://example.net/exploit.wmf", xml);
+        // Test LINK field code
+        assertContains("class=\"external-ref-LINK\"", xml);
+        assertContains("http://test.invalid/cmd.docx", xml);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 91cb801ed1..c0482bd304 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -850,4 +850,113 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContainsCount("inside-text", xml, 1);
     }
 
+    /**
+     * Test extraction of field-based hyperlinks using instrText/fldChar.
+     * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks.
+     */
+    @Test
+    public void testInstrTextHyperlink() throws Exception {
+        // parseContext is the test class's shared context (presumably configured to
+        // select the SAX-based extractor - confirm in the class setup)
+        String xml = getXML("testInstrLink.docx", parseContext).xml;
+        // The document contains a HYPERLINK field code in instrText
+        // NOTE(review): the host is spelled "exmaple.com"; presumably this matches the
+        // URL stored in the test document - check the fixture before "correcting" it.
+        assertContains("<a href=\"https://exmaple.com/file\">", xml);
+        assertContains("Access Document(s)", xml);
+    }
+
+    /**
+     * Test extraction of external reference field codes (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK).
+     * These can be used to hide malicious URLs in documents.
+     */
+    @Test
+    public void testExternalRefFieldCodes() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testExternalRefs.docx", parseContext);
+        // First entry of the recursive metadata list (presumably the container document)
+        Metadata m = metadataList.get(0);
+        // Check metadata flag is set
+        assertEquals("true", m.get(Office.HAS_FIELD_HYPERLINKS));
+
+        // Each field code must appear as an anchor whose class ends in the
+        // field name, together with the URL it references
+        String xml = getXML("testExternalRefs.docx", parseContext).xml;
+        // Test INCLUDEPICTURE field code
+        assertContains("class=\"external-ref-INCLUDEPICTURE\"", xml);
+        assertContains("http://example.com/tracking.png", xml);
+        // Test INCLUDETEXT field code
+        assertContains("class=\"external-ref-INCLUDETEXT\"", xml);
+        assertContains("http://example.org/payload.txt", xml);
+        // Test IMPORT field code
+        assertContains("class=\"external-ref-IMPORT\"", xml);
+        assertContains("http://example.net/exploit.wmf", xml);
+        // Test LINK field code
+        assertContains("class=\"external-ref-LINK\"", xml);
+        assertContains("http://test.invalid/cmd.docx", xml);
+    }
+
+    /**
+     * Test extraction of hlinkHover (hover hyperlinks) and VML shape hrefs.
+     * These are sneaky ways to hide malicious URLs.
+     */
+    @Test
+    public void testHoverAndVmlHyperlinks() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testHoverAndVml.docx", parseContext);
+        // First entry of the recursive metadata list (presumably the container document)
+        Metadata m = metadataList.get(0);
+        // Check metadata flags are set
+        assertEquals("true", m.get(Office.HAS_HOVER_HYPERLINKS));
+        assertEquals("true", m.get(Office.HAS_VML_HYPERLINKS));
+
+        String xml = getXML("testHoverAndVml.docx", parseContext).xml;
+        // Test hlinkHover (activates on mouse hover, not click)
+        assertContains("class=\"external-ref-hlinkHover\"", xml);
+        assertContains("http://hover.example.com/phishing", xml);
+        // Test VML shape href
+        assertContains("class=\"external-ref-vml-shape-href\"", xml);
+        assertContains("http://vml.example.org/shape-link", xml);
+    }
+
+    /**
+     * Test detection of mail merge in Word documents.
+     * Mail merge can reference external data sources.
+     */
+    @Test
+    public void testMailMerge() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testMailMerge.docx", parseContext);
+        Metadata m = metadataList.get(0);
+        // Only the metadata flag is checked; no anchor is expected for mail merge
+        assertEquals("true", m.get(Office.HAS_MAIL_MERGE));
+    }
+
+    /**
+     * Test detection of attached external template.
+     * Templates can be fetched from malicious URLs.
+     */
+    @Test
+    public void testAttachedTemplate() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testAttachedTemplate.docx", parseContext);
+        Metadata m = metadataList.get(0);
+        assertEquals("true", m.get(Office.HAS_ATTACHED_TEMPLATE));
+
+        // The external template target must also be emitted as an anchor
+        String xml = getXML("testAttachedTemplate.docx", parseContext).xml;
+        assertContains("class=\"external-ref-attachedTemplate\"", xml);
+        assertContains("example.com/templates", xml);
+    }
+
+    /**
+     * Test detection of subdocuments (master document linking external docs).
+ */ + @Test + public void testSubdocument() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testSubdocument.docx", parseContext); + Metadata m = metadataList.get(0); + assertEquals("true", m.get(Office.HAS_SUBDOCUMENTS)); + + String xml = getXML("testSubdocument.docx", parseContext).xml; + assertContains("class=\"external-ref-subDocument\"", xml); + assertContains("example.org/chapters", xml); + } + + /** + * Test detection of framesets (HTML frames loading external URLs). + */ + @Test + public void testFrameset() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testFrameset.docx", parseContext); + Metadata m = metadataList.get(0); + assertEquals("true", m.get(Office.HAS_FRAMESETS)); + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index 6e9a6d6d1b..7cff052cf8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -67,6 +67,9 @@ public class OutlookPSTParserTest extends TikaTest { assertEquals(10, metadataList.size()); Metadata m1 = metadataList.get(1); + assertEquals("application/x-tika-pst-mail-item", m1.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)); + assertEquals("application/x-tika-pst-mail-item", m1.get(Metadata.CONTENT_TYPE)); + assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME)); assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR)); assertEquals("Re: Feature 
Generators", m1.get(TikaCoreProperties.TITLE)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx new file mode 100644 index 0000000000..768258ad11 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx new file mode 100644 index 0000000000..af76b99347 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx new file mode 100644 index 0000000000..be4912b4b5 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx new file mode 100644 index 0000000000..8b8d3c1adc Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx new file mode 100644 index 0000000000..d19070fe07 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx new file mode 100644 index 0000000000..2b43e1e047 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx new file mode 100644 index 0000000000..3b2fc9257b Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx new file mode 100644 index 0000000000..e0c8f00b03 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx new file mode 100644 index 0000000000..7bf396e35b Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx differ
