This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bef2d336b1 TIKA-4646 -- extract hyperlinks from instrText and other
areas in ooxml(#2578)
bef2d336b1 is described below
commit bef2d336b1e4e52e3ca262d656f93ee4d3145b5f
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 3 06:32:51 2026 -0500
TIKA-4646 -- extract hyperlinks from instrText and other areas in
ooxml(#2578)
---
.../main/java/org/apache/tika/metadata/Office.java | 41 +++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 26 ++
.../microsoft/ooxml/FieldHyperlinkTracker.java | 168 +++++++++
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 25 ++
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 187 +++++++++-
.../ooxml/SXWPFWordExtractorDecorator.java | 179 +++++++++-
.../ooxml/XSSFExcelExtractorDecorator.java | 390 +++++++++++++++++++++
.../ooxml/XWPFWordExtractorDecorator.java | 95 ++++-
.../xslf/XSLFEventBasedPowerPointExtractor.java | 5 +
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 5 +
.../tika/parser/microsoft/ExcelParserTest.java | 43 +++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 39 +++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 109 ++++++
.../test-documents/testAttachedTemplate.docx | Bin 0 -> 2284 bytes
.../test-documents/testDataConnections.xlsx | Bin 0 -> 2967 bytes
.../test/resources/test-documents/testDdeLink.xlsx | Bin 0 -> 3030 bytes
.../resources/test-documents/testExternalRefs.docx | Bin 0 -> 2125 bytes
.../resources/test-documents/testFrameset.docx | Bin 0 -> 2328 bytes
.../resources/test-documents/testHoverAndVml.docx | Bin 0 -> 2270 bytes
.../resources/test-documents/testInstrLink.docx | Bin 0 -> 14464 bytes
.../resources/test-documents/testMailMerge.docx | Bin 0 -> 2306 bytes
.../resources/test-documents/testSubdocument.docx | Bin 0 -> 1980 bytes
22 files changed, 1309 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 477ffef140..cf717b38cc 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -171,6 +171,20 @@ public interface Office {
Property WORKBOOK_CODENAME =
Property.internalText("msoffice:excel:workbook-codename");
+ // Boolean flag for workbook data connections. Not set in the code visible in this
+ // patch excerpt -- presumably populated by the Excel extractor (TODO confirm).
+ Property HAS_DATA_CONNECTIONS =
Property.internalBoolean("msoffice:excel:has-data-connections");
+
+ // Boolean flag for external workbook links; setter not visible here (TODO confirm).
+ Property HAS_EXTERNAL_LINKS =
Property.internalBoolean("msoffice:excel:has-external-links");
+
+ // Boolean flag for web queries; setter not visible here (TODO confirm).
+ Property HAS_WEB_QUERIES =
Property.internalBoolean("msoffice:excel:has-web-queries");
+
+ // Set by AbstractOOXMLExtractor when an OLE-object relationship has a non-internal target.
+ Property HAS_EXTERNAL_OLE_OBJECTS =
Property.internalBoolean("msoffice:has-external-ole-objects");
+
+ // Set by OOXMLWordAndPowerPointTextHandler when a field's instrText yields a HYPERLINK
+ // url or an INCLUDEPICTURE/INCLUDETEXT/IMPORT/LINK target.
+ Property HAS_FIELD_HYPERLINKS =
Property.internalBoolean("msoffice:has-field-hyperlinks");
+
+ // Set when an hlinkHover element resolves to an entry in the linked relationships.
+ Property HAS_HOVER_HYPERLINKS =
Property.internalBoolean("msoffice:has-hover-hyperlinks");
+
+ // Set when a VML shape element carries a non-empty href attribute.
+ Property HAS_VML_HYPERLINKS =
Property.internalBoolean("msoffice:has-vml-hyperlinks");
+
Property HAS_COMMENTS = Property.internalBoolean("msoffice:has-comments");
Property COMMENT_PERSONS =
Property.internalTextBag("msoffice:comment-person-display-name");
@@ -185,4 +199,31 @@ public interface Office {
Property HAS_HIDDEN_TEXT =
Property.internalBoolean("msoffice:doc:has-hidden-text");
Property HAS_TRACK_CHANGES =
Property.internalBoolean("msoffice:has-track-changes");
+
+ // Security-relevant: DDE (Dynamic Data Exchange) links can execute commands.
+ // Setter not visible in this patch excerpt (TODO confirm where it is populated).
+ Property HAS_DDE_LINKS =
Property.internalBoolean("msoffice:excel:has-dde-links");
+
+ // Security-relevant: Mail merge can reference external data sources.
+ // Set by SXWPFWordExtractorDecorator when settings.xml contains a mailMerge element.
+ Property HAS_MAIL_MERGE =
Property.internalBoolean("msoffice:doc:has-mail-merge");
+
+ // Security-relevant: Attached templates can be fetched from external URLs.
+ // Set when the main document part has any attachedTemplate relationship.
+ Property HAS_ATTACHED_TEMPLATE =
Property.internalBoolean("msoffice:doc:has-attached-template");
+
+ // Security-relevant: SubDocuments reference external documents in master docs.
+ // Set when the main document part has any subDocument relationship.
+ Property HAS_SUBDOCUMENTS =
Property.internalBoolean("msoffice:doc:has-subdocuments");
+
+ // Security-relevant: Pivot tables can reference external OLAP/database sources.
+ // Setter not visible in this patch excerpt (TODO confirm).
+ Property HAS_EXTERNAL_PIVOT_DATA =
Property.internalBoolean("msoffice:excel:has-external-pivot-data");
+
+ // Security-relevant: Power Query can contain URLs and connection strings.
+ // Setter not visible in this patch excerpt (TODO confirm).
+ Property HAS_POWER_QUERY =
Property.internalBoolean("msoffice:excel:has-power-query");
+
+ // Security-relevant: OLE objects can link to external files (vs embedded).
+ // Set by OOXMLWordAndPowerPointTextHandler when an OLE object's type is "Link".
+ Property HAS_LINKED_OLE_OBJECTS =
Property.internalBoolean("msoffice:has-linked-ole-objects");
+
+ // Security-relevant: Charts can reference external workbook data.
+ // Setter not visible in this patch excerpt (TODO confirm).
+ Property HAS_EXTERNAL_CHART_DATA =
Property.internalBoolean("msoffice:has-external-chart-data");
+
+ // Security-relevant: Framesets can load external URLs.
+ // Set by SXWPFWordExtractorDecorator when the webSettings handler reports a frameset.
+ Property HAS_FRAMESETS =
Property.internalBoolean("msoffice:doc:has-framesets");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index a6a0b20c34..70d5920800 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -273,6 +273,16 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
sourceDesc = "";
}
if (rel.getTargetMode() != TargetMode.INTERNAL) {
+ // External target - emit as external reference for security
analysis
+ String type = rel.getRelationshipType();
+ if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
+ emitExternalRef(xhtml, "externalOleObject",
targetURI.toString());
+ parentMetadata.set(Office.HAS_EXTERNAL_OLE_OBJECTS, true);
+ } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) {
+ emitExternalRef(xhtml, "externalImage", targetURI.toString());
+ } else {
+ emitExternalRef(xhtml, "externalResource",
targetURI.toString());
+ }
return;
}
PackagePart target;
@@ -497,6 +507,22 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
}
+    /**
+     * Writes an empty {@code <a>} element describing an external reference.
+     * The element's {@code class} is {@code "external-ref-"} followed by {@code refType}
+     * and its {@code href} is {@code url}. Nothing is written for a null or empty url.
+     *
+     * @param xhtml   handler that receives the anchor
+     * @param refType suffix appended to the {@code external-ref-} class name
+     * @param url     target of the external reference
+     * @throws SAXException if the handler rejects the element
+     */
+    private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
+            throws SAXException {
+        boolean hasUrl = url != null && !url.isEmpty();
+        if (hasUrl) {
+            String cssClass = "external-ref-" + refType;
+            AttributesImpl anchorAttrs = new AttributesImpl();
+            anchorAttrs.addAttribute("", "class", "class", "CDATA", cssClass);
+            anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
+            xhtml.startElement("a", anchorAttrs);
+            xhtml.endElement("a");
+        }
+    }
+
/**
* Populates the {@link XHTMLContentHandler} object received as parameter.
*/
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java
new file mode 100644
index 0000000000..951711d99f
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Tracks field-code state across the runs of a paragraph.
+ * <p>
+ * A complex field is spread over several runs:
+ * {@code begin -> instrText -> separate -> result runs -> end}. Callers report those
+ * events through {@link #startField()}, {@link #addInstrText(String)},
+ * {@link #separate()} and {@link #endField()}; this class buffers the instruction
+ * text and extracts the quoted target from it.
+ * <p>
+ * HYPERLINK targets are returned directly by {@link #separate()}. Targets of
+ * INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK fields are exposed through
+ * {@link #getLastExternalRefType()} and {@link #getLastExternalRefUrl()}.
+ * <p>
+ * Nesting is not tracked: {@link #startField()} always clears the buffered
+ * instruction text, so a field that begins inside another field's instruction
+ * discards what had been collected for the outer one.
+ */
+class FieldHyperlinkTracker {
+
+    // Shared tail of every pattern: whitespace followed by a double-quoted target.
+    // The quantifiers are bounded so that matching stays cheap on hostile input.
+    private static final String QUOTED_TARGET = "\\s{1,100}\"([^\"]{1,10000})\"";
+
+    // The leading \b keeps a keyword from matching inside a longer word; without it
+    // LINK would match the tail of HYPERLINK when find() scans the instruction text.
+    private static final Pattern HYPERLINK_PATTERN =
+            Pattern.compile("\\bHYPERLINK" + QUOTED_TARGET, Pattern.CASE_INSENSITIVE);
+    private static final Pattern INCLUDEPICTURE_PATTERN =
+            Pattern.compile("\\bINCLUDEPICTURE" + QUOTED_TARGET, Pattern.CASE_INSENSITIVE);
+    private static final Pattern INCLUDETEXT_PATTERN =
+            Pattern.compile("\\bINCLUDETEXT" + QUOTED_TARGET, Pattern.CASE_INSENSITIVE);
+    private static final Pattern IMPORT_PATTERN =
+            Pattern.compile("\\bIMPORT" + QUOTED_TARGET, Pattern.CASE_INSENSITIVE);
+    // LINK carries one unquoted token (word characters and dots) before the quoted target
+    private static final Pattern LINK_PATTERN =
+            Pattern.compile("\\bLINK\\s{1,100}[\\w.]{1,50}" + QUOTED_TARGET,
+                    Pattern.CASE_INSENSITIVE);
+
+    // Non-hyperlink external reference fields, tried in this order; the two arrays are parallel.
+    private static final String[] EXTERNAL_REF_TYPES =
+            {"INCLUDEPICTURE", "INCLUDETEXT", "IMPORT", "LINK"};
+    private static final Pattern[] EXTERNAL_REF_PATTERNS =
+            {INCLUDEPICTURE_PATTERN, INCLUDETEXT_PATTERN, IMPORT_PATTERN, LINK_PATTERN};
+
+    private boolean inField = false;
+    private boolean inFieldHyperlink = false;
+    private final StringBuilder instrTextBuffer = new StringBuilder();
+    private String lastExternalRefType = null;
+    private String lastExternalRefUrl = null;
+
+    /**
+     * Marks the start of a field and discards any previously buffered instruction
+     * text and external reference.
+     */
+    void startField() {
+        inField = true;
+        instrTextBuffer.setLength(0);
+        clearExternalRef();
+    }
+
+    /**
+     * Appends a piece of instruction text. Ignored outside a field or when
+     * {@code text} is null.
+     *
+     * @param text the content of one instrText element
+     */
+    void addInstrText(String text) {
+        if (inField && text != null) {
+            instrTextBuffer.append(text);
+        }
+    }
+
+    /**
+     * Called when fldChar separate is encountered.
+     * <p>
+     * If the buffered instruction is a HYPERLINK, the tracker enters the
+     * "in field hyperlink" state and the url is returned. Otherwise the instruction
+     * is checked for INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK (in that order)
+     * and the first hit is stored as the last external reference.
+     *
+     * @return the hyperlink URL if this is a HYPERLINK field, null otherwise
+     */
+    String separate() {
+        if (!inField) {
+            return null;
+        }
+        String instruction = instrTextBuffer.toString();
+        String hyperlink = firstQuotedTarget(HYPERLINK_PATTERN, instruction);
+        if (hyperlink != null) {
+            inFieldHyperlink = true;
+            return hyperlink;
+        }
+        for (int i = 0; i < EXTERNAL_REF_PATTERNS.length; i++) {
+            String target = firstQuotedTarget(EXTERNAL_REF_PATTERNS[i], instruction);
+            if (target != null) {
+                lastExternalRefType = EXTERNAL_REF_TYPES[i];
+                lastExternalRefUrl = target;
+                break;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Marks the end of a field and resets all state.
+     */
+    void endField() {
+        inField = false;
+        inFieldHyperlink = false;
+        instrTextBuffer.setLength(0);
+        clearExternalRef();
+    }
+
+    /**
+     * @return true between a {@link #separate()} that returned a hyperlink url and
+     * the following {@link #endField()}
+     */
+    boolean isInFieldHyperlink() {
+        return inFieldHyperlink;
+    }
+
+    /**
+     * @return the field keyword of the last external reference found by
+     * {@link #separate()}, or null if there is none
+     */
+    String getLastExternalRefType() {
+        return lastExternalRefType;
+    }
+
+    /**
+     * @return the target of the last external reference found by
+     * {@link #separate()}, or null if there is none
+     */
+    String getLastExternalRefUrl() {
+        return lastExternalRefUrl;
+    }
+
+    /**
+     * Forgets the last external reference without touching the field state.
+     */
+    void clearExternalRef() {
+        lastExternalRefType = null;
+        lastExternalRefUrl = null;
+    }
+
+    /**
+     * Returns capture group 1 of the first match of {@code pattern} in
+     * {@code instrText}.
+     *
+     * @param pattern   one of the field patterns above
+     * @param instrText the accumulated instrText content (never null here)
+     * @return the quoted target if found, or null
+     */
+    private static String firstQuotedTarget(Pattern pattern, String instrText) {
+        Matcher m = pattern.matcher(instrText);
+        return m.find() ? m.group(1) : null;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 8ff630da14..4bc445fb5e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -321,6 +321,19 @@ public class OOXMLTikaBodyPartHandler
xhtml.endElement("div");
}
+    /**
+     * Writes an empty anchor with class {@code external-ref-linkedOle} whose
+     * {@code id} is the relationship id of a linked OLE object. A null id is ignored.
+     */
+    @Override
+    public void linkedOLERef(String relId) throws SAXException {
+        if (relId != null) {
+            // Linked (as opposed to embedded) OLE objects point at external files
+            AttributesImpl anchorAttrs = new AttributesImpl();
+            anchorAttrs.addAttribute("", "class", "class", "CDATA", "external-ref-linkedOle");
+            anchorAttrs.addAttribute("", "id", "id", "CDATA", relId);
+            xhtml.startElement("a", anchorAttrs);
+            xhtml.endElement("a");
+        }
+    }
+
@Override
public void embeddedPicRef(String picFileName, String picDescription)
throws SAXException {
@@ -338,6 +351,18 @@ public class OOXMLTikaBodyPartHandler
}
+    /**
+     * Writes an empty anchor whose class is {@code "external-ref-"} followed by
+     * {@code fieldType} and whose {@code href} is {@code url}. Nothing is written
+     * for a null or empty url.
+     */
+    @Override
+    public void externalRef(String fieldType, String url) throws SAXException {
+        boolean hasUrl = url != null && !url.isEmpty();
+        if (hasUrl) {
+            String cssClass = "external-ref-" + fieldType;
+            AttributesImpl anchorAttrs = new AttributesImpl();
+            anchorAttrs.addAttribute("", "class", "class", "CDATA", cssClass);
+            anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
+            xhtml.startElement("a", anchorAttrs);
+            xhtml.endElement("a");
+        }
+    }
+
@Override
public void startBookmark(String id, String name) throws SAXException {
//skip bookmarks within hyperlinks
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index a2e940b587..3569398a28 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -19,12 +19,16 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.util.Date;
import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.utils.DateUtils;
/**
@@ -108,12 +112,34 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private final static String MOVE_TO = "moveTo";
private final static String ENDNOTE_REFERENCE = "endnoteReference";
private static final String TEXTBOX = "textbox";
+    // Field-code elements and the attribute that carries begin/separate/end
+    private final static String FLD_CHAR = "fldChar";
+    private final static String INSTR_TEXT = "instrText";
+    private final static String FLD_CHAR_TYPE = "fldCharType";
+    // DrawingML hyperlinks on shapes/pictures
+    private final static String HLINK_HOVER = "hlinkHover";
+    private final static String C_NV_PR = "cNvPr";
+    // VML shape hyperlinks
+    private final static String SHAPE = "shape";
+    private final static String HREF = "href";
+
+    // Patterns for extracting the quoted target from field codes. They are applied
+    // with find(), so each keyword is anchored with \b: without it LINK would match
+    // the tail of HYPERLINK. Quantifiers are bounded to keep matching cheap on
+    // hostile input.
+    private static final Pattern HYPERLINK_PATTERN =
+            Pattern.compile("\\bHYPERLINK\\s{1,100}\"([^\"]{1,10000})\"",
+                    Pattern.CASE_INSENSITIVE);
+    private static final Pattern INCLUDEPICTURE_PATTERN =
+            Pattern.compile("\\bINCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"",
+                    Pattern.CASE_INSENSITIVE);
+    private static final Pattern INCLUDETEXT_PATTERN =
+            Pattern.compile("\\bINCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"",
+                    Pattern.CASE_INSENSITIVE);
+    private static final Pattern IMPORT_PATTERN =
+            Pattern.compile("\\bIMPORT\\s{1,100}\"([^\"]{1,10000})\"",
+                    Pattern.CASE_INSENSITIVE);
+    // LINK carries one unquoted token (word characters and dots) before the quoted target
+    private static final Pattern LINK_PATTERN =
+            Pattern.compile("\\bLINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"",
+                    Pattern.CASE_INSENSITIVE);
private final XWPFBodyContentsHandler bodyContentsHandler;
private final Map<String, String> linkedRelationships;
private final RunProperties currRunProperties = new RunProperties();
private final ParagraphProperties currPProperties = new
ParagraphProperties();
private final boolean includeTextBox;
private final boolean concatenatePhoneticRuns;
+ private final Metadata metadata;
private final StringBuilder runBuffer = new StringBuilder();
private final StringBuilder rubyBuffer = new StringBuilder();
private boolean inR = false;
@@ -145,6 +171,11 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private boolean inHlinkClick = false;
private boolean inTextBox = false;
private boolean inV = false; //in c:v in chart file
+ // Field code tracking for instrText-based hyperlinks
+ // inField: true from fldChar "begin" until fldChar "end"
+ private boolean inField = false;
+ // inInstrText: true between the start and end of an instrText element
+ private boolean inInstrText = false;
+ // inFieldHyperlink: true once "separate" produced a HYPERLINK url, until "end" closes it
+ private boolean inFieldHyperlink = false;
+ // Accumulates instrText characters; cleared on "begin" and on "end"
+ private final StringBuilder instrTextBuffer = new StringBuilder();
private OOXMLWordAndPowerPointTextHandler.EditType editType =
OOXMLWordAndPowerPointTextHandler.EditType.NONE;
private DateUtils dateUtils = new DateUtils();
@@ -153,16 +184,23 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
Map<String, String> hyperlinks) {
- this(bodyContentsHandler, hyperlinks, true, true);
+ this(bodyContentsHandler, hyperlinks, true, true, null);
}
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
Map<String, String> hyperlinks,
boolean includeTextBox,
boolean concatenatePhoneticRuns) {
+ this(bodyContentsHandler, hyperlinks, includeTextBox,
concatenatePhoneticRuns, null);
+ }
+
+ public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
+ Map<String, String> hyperlinks,
boolean includeTextBox,
+ boolean concatenatePhoneticRuns,
Metadata metadata) {
this.bodyContentsHandler = bodyContentsHandler;
this.linkedRelationships = hyperlinks;
this.includeTextBox = includeTextBox;
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
+ this.metadata = metadata;
}
@Override
@@ -326,6 +364,12 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
if ("Embed".equals(type)) {
bodyContentsHandler.embeddedOLERef(refId);
+ } else if ("Link".equals(type)) {
+ // Linked OLE object - references external file
+ bodyContentsHandler.linkedOLERef(refId);
+ if (metadata != null) {
+ metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true);
+ }
}
} else if (CR.equals(localName)) {
runBuffer.append(NEWLINE);
@@ -341,6 +385,65 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
if ("0".equals(val) || "false".equals(val)) {
hiddenSlide = true;
}
+ } else if (FLD_CHAR.equals(localName)) {
+ String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE);
+ if ("begin".equals(fldCharType)) {
+ inField = true;
+ instrTextBuffer.setLength(0);
+ } else if ("separate".equals(fldCharType)) {
+ // Parse instrText for HYPERLINK
+ String url =
parseHyperlinkFromInstrText(instrTextBuffer.toString());
+ if (url != null) {
+ bodyContentsHandler.hyperlinkStart(url);
+ inFieldHyperlink = true;
+ if (metadata != null) {
+ metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+ }
+ } else {
+ // Check for external reference fields (INCLUDEPICTURE,
INCLUDETEXT, etc.)
+ StringBuilder fieldType = new StringBuilder();
+ String extUrl =
parseExternalRefFromInstrText(instrTextBuffer.toString(), fieldType);
+ if (extUrl != null) {
+ bodyContentsHandler.externalRef(fieldType.toString(),
extUrl);
+ if (metadata != null) {
+ metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+ }
+ }
+ }
+ } else if ("end".equals(fldCharType)) {
+ if (inFieldHyperlink) {
+ bodyContentsHandler.hyperlinkEnd();
+ inFieldHyperlink = false;
+ }
+ inField = false;
+ instrTextBuffer.setLength(0);
+ }
+ } else if (INSTR_TEXT.equals(localName)) {
+ inInstrText = true;
+ } else if (HLINK_HOVER.equals(localName)) {
+ // DrawingML hover hyperlink on shapes/pictures
+ String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS,
"id");
+ if (hyperlinkId != null) {
+ String hyperlink = linkedRelationships.get(hyperlinkId);
+ if (hyperlink != null) {
+ bodyContentsHandler.externalRef("hlinkHover", hyperlink);
+ if (metadata != null) {
+ metadata.set(Office.HAS_HOVER_HYPERLINKS, true);
+ }
+ }
+ }
+ } else if (SHAPE.equals(localName) && V_NS.equals(uri)) {
+ // VML shape with href attribute
+ String href = atts.getValue(HREF);
+ if (href == null) {
+ href = atts.getValue(O_NS, HREF);
+ }
+ if (href != null && !href.isEmpty()) {
+ bodyContentsHandler.externalRef("vml-shape-href", href);
+ if (metadata != null) {
+ metadata.set(Office.HAS_VML_HYPERLINKS, true);
+ }
+ }
}
}
@@ -376,6 +479,65 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
return -1;
}
+    /**
+     * Extracts the quoted target of a HYPERLINK field code, for example
+     * {@code HYPERLINK "https://example.com"}.
+     *
+     * @param instrText the accumulated instrText content; may be null or empty
+     * @return the text between the quotes, or null when there is no HYPERLINK match
+     */
+    private String parseHyperlinkFromInstrText(String instrText) {
+        boolean hasText = instrText != null && !instrText.isEmpty();
+        if (hasText) {
+            Matcher urlMatcher = HYPERLINK_PATTERN.matcher(instrText.trim());
+            if (urlMatcher.find()) {
+                return urlMatcher.group(1);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Extracts the quoted target of a field code that references an external
+     * resource. INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK are tried in that
+     * order and the first match wins.
+     *
+     * @param instrText the accumulated instrText content; may be null or empty
+     * @param fieldType output parameter - the matching field keyword is appended to it
+     * @return the text between the quotes, or null when no field pattern matches
+     */
+    private String parseExternalRefFromInstrText(String instrText, StringBuilder fieldType) {
+        if (instrText == null || instrText.isEmpty()) {
+            return null;
+        }
+        String candidate = instrText.trim();
+        // Parallel arrays: keyword reported to the caller and the pattern that detects it
+        String[] keywords = {"INCLUDEPICTURE", "INCLUDETEXT", "IMPORT", "LINK"};
+        Pattern[] detectors =
+                {INCLUDEPICTURE_PATTERN, INCLUDETEXT_PATTERN, IMPORT_PATTERN, LINK_PATTERN};
+        for (int i = 0; i < detectors.length; i++) {
+            Matcher targetMatcher = detectors[i].matcher(candidate);
+            if (targetMatcher.find()) {
+                fieldType.append(keywords[i]);
+                return targetMatcher.group(1);
+            }
+        }
+        return null;
+    }
+
@Override
public void endElement(String uri, String localName, String qName) throws
SAXException {
@@ -441,6 +603,8 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
inRt = false;
} else if (RUBY.equals(localName)) {
handleEndOfRuby();
+ } else if (INSTR_TEXT.equals(localName)) {
+ inInstrText = false;
}
}
@@ -498,6 +662,9 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
} else if (inV) {
appendToBuffer(ch, start, length);
appendToBuffer(TAB_CHAR, 0, 1);
+ } else if (inInstrText && inField) {
+ // Accumulate instrText content for field code parsing (e.g.,
HYPERLINK)
+ instrTextBuffer.append(ch, start, length);
}
}
@@ -573,11 +740,29 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
void embeddedOLERef(String refId) throws SAXException;
+        /**
+         * Called when a linked (vs embedded) OLE object is found.
+         * These reference external files and are a security concern.
+         * <p>
+         * Declared as a default no-op, like {@code externalRef}, so that existing
+         * implementations of this interface keep compiling; override it to emit
+         * the reference.
+         *
+         * @param refId relationship id of the linked OLE object
+         * @throws SAXException if the implementation fails to write the reference
+         */
+        default void linkedOLERef(String refId) throws SAXException {
+            // Default no-op implementation for backward compatibility
+        }
+
void embeddedPicRef(String picFileName, String picDescription) throws
SAXException;
void startBookmark(String id, String name) throws SAXException;
void endBookmark(String id) throws SAXException;
+
+ /**
+ * Called when an external reference URL is found in a field code.
+ * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields,
+ * and DrawingML/VML hyperlinks on shapes.
+ *
+ * @param fieldType the type of reference: the field keyword (e.g., "INCLUDEPICTURE"),
+ * "hlinkHover" for DrawingML hover links, or "vml-shape-href" for VML shapes
+ * @param url the external URL
+ * @throws SAXException if the implementation fails to write the reference
+ */
+ default void externalRef(String fieldType, String url) throws
SAXException {
+ // Default no-op implementation for backward compatibility
+ }
}
public boolean isHiddenSlide() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 3c41a0024b..60eb91ec94 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -29,14 +29,18 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
@@ -68,6 +72,16 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"};
+ // Relationship types looked up on the main document part when scanning for
+ // security-relevant features.
+ // settings.xml -- parsed to detect a mailMerge element
+ private static final String SETTINGS_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings";
+ // webSettings.xml -- parsed to detect framesets
+ private static final String WEB_SETTINGS_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings";
+ // attached template; external targets are emitted as "attachedTemplate" references
+ private static final String ATTACHED_TEMPLATE_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/attachedTemplate";
+ // subdocument of a master document; external targets are emitted as "subDocument" references
+ private static final String SUBDOCUMENT_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/subDocument";
+
//a docx file should have one of these "main story" parts
private final static String[] MAIN_STORY_PART_RELATIONS =
new String[]{XWPFRelation.DOCUMENT.getContentType(),
@@ -115,6 +129,106 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.endElement("div");
}
}
+
+ // Detect security-relevant features in main document
+ pps = getStoryDocumentParts();
+ if (pps != null && !pps.isEmpty()) {
+ PackagePart mainDoc = pps.get(0);
+ detectSecurityFeatures(mainDoc, xhtml);
+ }
+ }
+
+    /**
+     * Detects security-relevant features like mail merge, attached templates,
+     * subdocuments, and framesets.
+     * <p>
+     * The four checks are independent and best-effort: a failure in one is recorded
+     * under {@link TikaCoreProperties#TIKA_META_EXCEPTION_WARNING} instead of being
+     * silently dropped, and the remaining checks still run.
+     * <p>
+     * NOTE(review): a SAXException raised while writing to {@code xhtml} is also only
+     * recorded, because this method does not declare it -- confirm that is acceptable
+     * for handlers that use SAXException to abort parsing.
+     *
+     * @param documentPart the main document part whose relationships are inspected
+     * @param xhtml        receives an anchor for every external template/subdocument target
+     */
+    private void detectSecurityFeatures(PackagePart documentPart, XHTMLContentHandler xhtml) {
+        // Attached template (external template reference)
+        flagExternalRelationships(documentPart, ATTACHED_TEMPLATE_RELATION,
+                Office.HAS_ATTACHED_TEMPLATE, "attachedTemplate", xhtml);
+
+        // Subdocuments (master document with external subdocs)
+        flagExternalRelationships(documentPart, SUBDOCUMENT_RELATION,
+                Office.HAS_SUBDOCUMENTS, "subDocument", xhtml);
+
+        // settings.xml: mail merge
+        try {
+            PackageRelationshipCollection settingsRels =
+                    documentPart.getRelationshipsByType(SETTINGS_RELATION);
+            if (settingsRels != null && settingsRels.size() > 0) {
+                PackagePart settingsPart =
+                        documentPart.getRelatedPart(settingsRels.getRelationship(0));
+                if (settingsPart != null) {
+                    try (InputStream is = settingsPart.getInputStream()) {
+                        WordSettingsHandler handler = new WordSettingsHandler(xhtml);
+                        XMLReaderUtils.parseSAX(is, handler, context);
+                        if (handler.hasMailMerge()) {
+                            metadata.set(Office.HAS_MAIL_MERGE, true);
+                        }
+                    }
+                }
+            }
+        } catch (InvalidFormatException | IOException | TikaException | SAXException e) {
+            recordSecurityDetectionWarning(e);
+        }
+
+        // webSettings.xml: framesets
+        try {
+            PackageRelationshipCollection webSettingsRels =
+                    documentPart.getRelationshipsByType(WEB_SETTINGS_RELATION);
+            if (webSettingsRels != null && webSettingsRels.size() > 0) {
+                PackagePart webSettingsPart =
+                        documentPart.getRelatedPart(webSettingsRels.getRelationship(0));
+                if (webSettingsPart != null) {
+                    try (InputStream is = webSettingsPart.getInputStream()) {
+                        WebSettingsHandler handler = new WebSettingsHandler(xhtml);
+                        XMLReaderUtils.parseSAX(is, handler, context);
+                        if (handler.hasFrameset()) {
+                            metadata.set(Office.HAS_FRAMESETS, true);
+                        }
+                    }
+                }
+            }
+        } catch (InvalidFormatException | IOException | TikaException | SAXException e) {
+            recordSecurityDetectionWarning(e);
+        }
+    }
+
+    /**
+     * Sets {@code flag} when the part has at least one relationship of the given type
+     * and emits an external-reference anchor for each of those whose target is external.
+     */
+    private void flagExternalRelationships(PackagePart documentPart, String relationshipType,
+                                           org.apache.tika.metadata.Property flag,
+                                           String refType, XHTMLContentHandler xhtml) {
+        try {
+            PackageRelationshipCollection rels =
+                    documentPart.getRelationshipsByType(relationshipType);
+            if (rels != null && rels.size() > 0) {
+                metadata.set(flag, true);
+                for (PackageRelationship rel : rels) {
+                    if (rel.getTargetMode() == TargetMode.EXTERNAL) {
+                        emitExternalRef(xhtml, refType, rel.getTargetURI().toString());
+                    }
+                }
+            }
+        } catch (InvalidFormatException | SAXException e) {
+            recordSecurityDetectionWarning(e);
+        }
+    }
+
+    /**
+     * Records a failed security-feature check as a metadata warning, the same way
+     * other recoverable failures in this class are reported.
+     */
+    private void recordSecurityDetectionWarning(Exception e) {
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                ExceptionUtils.getStackTrace(e));
+    }
+
+    /**
+     * Writes an empty {@code <a>} element for an external reference: its
+     * {@code class} is {@code "external-ref-"} followed by {@code refType} and its
+     * {@code href} is {@code url}. A null or empty url produces no output.
+     */
+    private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
+            throws SAXException {
+        boolean nothingToEmit = url == null || url.isEmpty();
+        if (!nothingToEmit) {
+            org.xml.sax.helpers.AttributesImpl anchorAttrs =
+                    new org.xml.sax.helpers.AttributesImpl();
+            String cssClass = "external-ref-" + refType;
+            anchorAttrs.addAttribute("", "class", "class", "CDATA", cssClass);
+            anchorAttrs.addAttribute("", "href", "href", "CDATA", url);
+            xhtml.startElement("a", anchorAttrs);
+            xhtml.endElement("a");
+        }
+    }
private void handleDocumentPart(PackagePart documentPart,
XHTMLContentHandler xhtml)
@@ -194,7 +308,7 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
new EmbeddedContentHandler(new
OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml, styles,
listManager, config),
linkedRelationships,
config.isIncludeShapeBasedContent(),
- config.isConcatenatePhoneticRuns())), context);
+ config.isConcatenatePhoneticRuns(), metadata)),
context);
} catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
@@ -298,4 +412,67 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
return new ArrayList<>();
}
+
+ /**
+  * SAX handler for Word's settings.xml that records whether a {@code mailMerge}
+  * element was seen.
+  * <p>
+  * The {@code xhtml} handler passed to the constructor is stored, but this handler
+  * does not write anything to it.
+  */
+ private static class WordSettingsHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+     private boolean hasMailMerge = false;
+
+     WordSettingsHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     /**
+      * Sets the mail-merge flag when an element whose local name is
+      * {@code mailMerge} starts; all other elements are ignored.
+      */
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts)
+             throws SAXException {
+         if ("mailMerge".equals(localName)) {
+             hasMailMerge = true;
+         }
+         // dataSource/query elements reference their target through an r:id
+         // relationship; that relationship is not resolved by this handler, so
+         // there is nothing to record for them here.
+     }
+
+     /**
+      * @return {@code true} if a {@code mailMerge} element was encountered
+      */
+     boolean hasMailMerge() {
+         return hasMailMerge;
+     }
+ }
+
+ /**
+ * Handler for parsing Word webSettings.xml to detect framesets.
+ */
+ private static class WebSettingsHandler extends DefaultHandler {
+ private final XHTMLContentHandler xhtml;
+ private boolean hasFrameset = false;
+
+ WebSettingsHandler(XHTMLContentHandler xhtml) {
+ this.xhtml = xhtml;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
Attributes atts)
+ throws SAXException {
+ // Frameset element indicates document contains frames
+ if ("frameset".equals(localName)) {
+ hasFrameset = true;
+ }
+ // Frame with src attribute contains URL
+ if ("frame".equals(localName)) {
+ String src = atts.getValue("src");
+ if (src != null && !src.isEmpty()) {
+ // Frame references an external URL
+ hasFrameset = true;
+ }
+ }
+ }
+
+ boolean hasFrameset() {
+ return hasFrameset;
+ }
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index b3a70c09cf..9572e5e27c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -79,6 +79,20 @@ import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+
+ // Relationship types for external data sources
+ // Workbook -> externalLink parts; read by extractExternalLinks
+ private static final String EXTERNAL_LINK_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink";
+ // Workbook -> connections part; read by extractConnections
+ private static final String CONNECTIONS_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/connections";
+ // Sheet -> queryTable parts; read by extractQueryTables
+ private static final String QUERY_TABLE_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/queryTable";
+ // Workbook -> pivotCacheDefinition parts; read by extractPivotCacheExternalData
+ private static final String PIVOT_CACHE_DEFINITION_RELATION =
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition";
+ // Power Query stores data in customData parts
+ // Content type looked up by detectPowerQuery via getPartsByContentType
+ private static final String POWER_QUERY_CONTENT_TYPE =
+ "application/vnd.ms-excel.customDataProperties+xml";
+
/**
* Allows access to headers/footers from raw xml strings
*/
@@ -222,6 +236,382 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
//swallow
}
+ // Extract external data sources (HIGH security risk - can hide
malicious URLs)
+ try {
+ extractExternalDataSources(container, xhtml);
+ } catch (InvalidFormatException | TikaException | IOException |
SAXException e) {
+ //swallow
+ }
+
+ }
+
+ /**
+  * Walks the workbook looking for references to data outside the file.
+  * <p>
+  * In order, this inspects external workbook links, data connections, the
+  * query tables of every sheet part, pivot cache definitions, and finally
+  * Power Query / Data Mashup parts. Returns without doing anything when the
+  * core document relationship or the workbook part cannot be found.
+  *
+  * @param container the OPC package being parsed
+  * @param xhtml     handler that receives the emitted anchors
+  */
+ private void extractExternalDataSources(OPCPackage container, XHTMLContentHandler xhtml)
+         throws InvalidFormatException, TikaException, IOException, SAXException {
+
+     PackageRelationship coreRel = container
+             .getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT)
+             .getRelationship(0);
+     PackagePart workbookPart = (coreRel == null) ? null : container.getPart(coreRel);
+     if (workbookPart == null) {
+         return;
+     }
+
+     // Workbook-level sources: linked workbooks, then database/ODBC/web connections
+     extractExternalLinks(workbookPart, xhtml);
+     extractConnections(workbookPart, xhtml);
+
+     // Sheet-level sources: query tables hang off the individual sheet parts
+     for (PackagePart sheet : sheetParts) {
+         extractQueryTables(sheet, xhtml);
+     }
+
+     // Pivot caches backed by external data
+     extractPivotCacheExternalData(workbookPart, xhtml);
+
+     // Power Query / Data Mashup
+     detectPowerQuery(container);
+ }
+
+ /**
+  * Sets {@link Office#HAS_EXTERNAL_PIVOT_DATA} when any pivot cache definition
+  * related to the workbook reports external data.
+  * <p>
+  * Each pivot cache definition part is parsed with a {@code PivotCacheHandler}.
+  * I/O, Tika and SAX failures on a single part are ignored so the remaining
+  * parts are still examined.
+  *
+  * @param workbookPart the workbook part whose relationships are inspected
+  * @param xhtml        handler handed to the {@code PivotCacheHandler}
+  */
+ private void extractPivotCacheExternalData(PackagePart workbookPart,
+                                            XHTMLContentHandler xhtml)
+         throws InvalidFormatException {
+     PackageRelationshipCollection cacheRels =
+             workbookPart.getRelationshipsByType(PIVOT_CACHE_DEFINITION_RELATION);
+     if (cacheRels == null || cacheRels.isEmpty()) {
+         return;
+     }
+     for (PackageRelationship cacheRel : cacheRels) {
+         try {
+             PackagePart cachePart = workbookPart.getRelatedPart(cacheRel);
+             if (cachePart == null) {
+                 continue;
+             }
+             PivotCacheHandler cacheHandler = new PivotCacheHandler(xhtml);
+             try (InputStream cacheStream = cachePart.getInputStream()) {
+                 XMLReaderUtils.parseSAX(cacheStream, cacheHandler, parseContext);
+             }
+             if (cacheHandler.hasExternalData()) {
+                 metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
+             }
+         } catch (IOException | TikaException | SAXException e) {
+             // swallow
+         }
+     }
+ }
+
+ /**
+  * Flags the workbook as containing Power Query / Data Mashup content.
+  * <p>
+  * {@link Office#HAS_POWER_QUERY} is set when the package has at least one part
+  * with the {@code POWER_QUERY_CONTENT_TYPE} content type, or when any part name
+  * contains {@code /customData/} or {@code /dataMashup}. The part-name scan is
+  * skipped once the content-type lookup has already set the flag.
+  *
+  * @param container the OPC package to inspect
+  */
+ private void detectPowerQuery(OPCPackage container) {
+     try {
+         List<PackagePart> customDataParts =
+                 container.getPartsByContentType(POWER_QUERY_CONTENT_TYPE);
+         if (customDataParts != null && !customDataParts.isEmpty()) {
+             metadata.set(Office.HAS_POWER_QUERY, true);
+             // Already flagged; walking every part name cannot change the result.
+             return;
+         }
+         // Fall back to looking for the customData folder / dataMashup parts by name
+         for (PackagePart part : container.getParts()) {
+             String partName = part.getPartName().getName();
+             if (partName.contains("/customData/") || partName.contains("/dataMashup")) {
+                 metadata.set(Office.HAS_POWER_QUERY, true);
+                 break;
+             }
+         }
+     } catch (InvalidFormatException e) {
+         // Detection is best effort: an unreadable package leaves the flag unset.
+     }
+ }
+
+ /**
+  * Reports external workbook links reachable from the workbook part.
+  * <p>
+  * When at least one externalLink relationship exists,
+  * {@link Office#HAS_EXTERNAL_LINKS} is set. Relationships whose target mode is
+  * external are emitted directly as {@code externalLink} anchors; internal
+  * externalLink parts are parsed with an {@code ExternalLinkHandler}, and
+  * {@link Office#HAS_DDE_LINKS} is set if that handler saw a DDE link. I/O and
+  * Tika failures on a single part are ignored.
+  *
+  * @param workbookPart the workbook part whose relationships are inspected
+  * @param xhtml        handler that receives the emitted anchors
+  */
+ private void extractExternalLinks(PackagePart workbookPart, XHTMLContentHandler xhtml)
+         throws InvalidFormatException, SAXException {
+     PackageRelationshipCollection linkRels =
+             workbookPart.getRelationshipsByType(EXTERNAL_LINK_RELATION);
+     if (linkRels == null || linkRels.isEmpty()) {
+         return;
+     }
+     // Reaching this point means there is at least one external link relationship
+     metadata.set(Office.HAS_EXTERNAL_LINKS, true);
+
+     for (PackageRelationship linkRel : linkRels) {
+         if (rel(linkRel)) {
+             // The relationship itself points outside the package
+             emitExternalRef(xhtml, "externalLink", linkRel.getTargetURI().toString());
+             continue;
+         }
+         // Otherwise the target is a part inside the package that describes the link
+         try {
+             PackagePart linkPart = workbookPart.getRelatedPart(linkRel);
+             if (linkPart == null) {
+                 continue;
+             }
+             ExternalLinkHandler linkHandler = new ExternalLinkHandler(xhtml);
+             try (InputStream linkStream = linkPart.getInputStream()) {
+                 XMLReaderUtils.parseSAX(linkStream, linkHandler, parseContext);
+             }
+             if (linkHandler.hasDdeLink()) {
+                 metadata.set(Office.HAS_DDE_LINKS, true);
+             }
+         } catch (IOException | TikaException e) {
+             // swallow
+         }
+     }
+ }
+
+ /**
+  * @return {@code true} when the relationship's target mode is external
+  */
+ private static boolean rel(PackageRelationship relationship) {
+     return relationship.getTargetMode() == TargetMode.EXTERNAL;
+ }
+
+ /**
+  * Parses every connections part related to the workbook.
+  * <p>
+  * Each part is run through a {@code ConnectionsHandler}, which emits anchors
+  * for the sources it finds. {@link Office#HAS_DATA_CONNECTIONS} and
+  * {@link Office#HAS_WEB_QUERIES} are set according to what the handler saw.
+  * I/O and Tika failures on a single part are ignored.
+  *
+  * @param workbookPart the workbook part whose relationships are inspected
+  * @param xhtml        handler that receives the emitted anchors
+  */
+ private void extractConnections(PackagePart workbookPart, XHTMLContentHandler xhtml)
+         throws InvalidFormatException, SAXException {
+     PackageRelationshipCollection connectionRels =
+             workbookPart.getRelationshipsByType(CONNECTIONS_RELATION);
+     if (connectionRels == null || connectionRels.isEmpty()) {
+         return;
+     }
+     for (PackageRelationship connectionRel : connectionRels) {
+         try {
+             PackagePart part = workbookPart.getRelatedPart(connectionRel);
+             if (part == null) {
+                 continue;
+             }
+             ConnectionsHandler connectionsHandler = new ConnectionsHandler(xhtml);
+             try (InputStream partStream = part.getInputStream()) {
+                 XMLReaderUtils.parseSAX(partStream, connectionsHandler, parseContext);
+             }
+             if (connectionsHandler.hasConnections()) {
+                 metadata.set(Office.HAS_DATA_CONNECTIONS, true);
+             }
+             if (connectionsHandler.hasWebQueries()) {
+                 metadata.set(Office.HAS_WEB_QUERIES, true);
+             }
+         } catch (IOException | TikaException e) {
+             // swallow
+         }
+     }
+ }
+
+ /**
+  * Parses every queryTable part related to the given sheet with a
+  * {@code QueryTableHandler}.
+  * <p>
+  * I/O and Tika failures on a single part are ignored.
+  *
+  * @param sheetPart the sheet part whose relationships are inspected
+  * @param xhtml     handler handed to the {@code QueryTableHandler}
+  */
+ private void extractQueryTables(PackagePart sheetPart, XHTMLContentHandler xhtml)
+         throws InvalidFormatException, SAXException {
+     PackageRelationshipCollection queryTableRels =
+             sheetPart.getRelationshipsByType(QUERY_TABLE_RELATION);
+     if (queryTableRels == null || queryTableRels.isEmpty()) {
+         return;
+     }
+     for (PackageRelationship queryTableRel : queryTableRels) {
+         try {
+             PackagePart part = sheetPart.getRelatedPart(queryTableRel);
+             if (part == null) {
+                 continue;
+             }
+             try (InputStream partStream = part.getInputStream()) {
+                 QueryTableHandler queryTableHandler = new QueryTableHandler(xhtml);
+                 XMLReaderUtils.parseSAX(partStream, queryTableHandler, parseContext);
+             }
+         } catch (IOException | TikaException e) {
+             // swallow
+         }
+     }
+ }
+
+ /**
+  * Writes an empty {@code <a>} element describing an external reference.
+  * <p>
+  * The anchor gets {@code class="external-ref-<refType>"} and {@code href="<url>"}.
+  * Nothing is written when {@code url} is {@code null} or empty.
+  *
+  * @param xhtml   handler that receives the anchor element
+  * @param refType suffix appended to {@code external-ref-} to form the class attribute
+  * @param url     target written to the {@code href} attribute
+  * @throws SAXException if the handler rejects the element
+  */
+ private void emitExternalRef(XHTMLContentHandler xhtml, String refType, String url)
+         throws SAXException {
+     boolean hasTarget = url != null && !url.isEmpty();
+     if (hasTarget) {
+         String cssClass = "external-ref-" + refType;
+         org.xml.sax.helpers.AttributesImpl anchorAtts =
+                 new org.xml.sax.helpers.AttributesImpl();
+         anchorAtts.addAttribute("", "class", "class", "CDATA", cssClass);
+         anchorAtts.addAttribute("", "href", "href", "CDATA", url);
+         xhtml.startElement("a", anchorAtts);
+         xhtml.endElement("a");
+     }
+ }
+
+ /**
+  * SAX handler for an externalLink part.
+  * <p>
+  * Emits anchors for {@code file} elements with an {@code href}, for
+  * {@code oleLink} elements with an {@code r:id}, and for {@code ddeLink}
+  * elements, and remembers whether a DDE link was seen.
+  */
+ private class ExternalLinkHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+     private boolean foundDdeLink = false;
+
+     ExternalLinkHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts)
+             throws SAXException {
+         // externalBook elements carry only an r:id; the actual target lives in the
+         // relationship, which this handler does not resolve, so nothing is emitted
+         // for them here.
+
+         // file element with href attribute (older format)
+         if ("file".equals(localName)) {
+             String href = atts.getValue("href");
+             if (href != null && !href.isEmpty()) {
+                 emitExternalRef(xhtml, "externalWorkbook", href);
+             }
+         }
+         // oleLink with r:id (OLE links to external files); the id itself is
+         // reported, prefixed with "relationship:", rather than a resolved target
+         if ("oleLink".equals(localName)) {
+             String rId = atts.getValue(
+                     "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+                     "id");
+             if (rId != null) {
+                 emitExternalRef(xhtml, "oleLink", "relationship:" + rId);
+             }
+         }
+         // DDE links - security risk: can execute commands
+         if ("ddeLink".equals(localName)) {
+             foundDdeLink = true;
+             String ddeService = atts.getValue("ddeService");
+             String ddeTopic = atts.getValue("ddeTopic");
+             if (ddeService != null || ddeTopic != null) {
+                 // Reported as "service|topic"; a missing half becomes an empty string
+                 String ddeRef = (ddeService != null ? ddeService : "") + "|" +
+                         (ddeTopic != null ? ddeTopic : "");
+                 emitExternalRef(xhtml, "ddeLink", ddeRef);
+             }
+         }
+     }
+
+     /**
+      * @return {@code true} if a {@code ddeLink} element was encountered
+      */
+     boolean hasDdeLink() {
+         return foundDdeLink;
+     }
+ }
+
+ /**
+  * SAX handler for connections.xml.
+  * <p>
+  * Emits an anchor for each database, web query, OLAP and text-import source it
+  * finds, and remembers whether any {@code connection} or {@code webPr} element
+  * was seen.
+  */
+ private class ConnectionsHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+     private boolean foundConnection = false;
+     private boolean foundWebQuery = false;
+
+     ConnectionsHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts)
+             throws SAXException {
+         if ("connection".equals(localName)) {
+             foundConnection = true;
+         }
+         // Database connection string
+         if ("dbPr".equals(localName)) {
+             emitIfPresent("dbConnection", atts.getValue("connection"));
+         }
+         // Web query
+         if ("webPr".equals(localName)) {
+             foundWebQuery = true;
+             emitIfPresent("webQuery", atts.getValue("url"));
+         }
+         // OLAP connection. SpreadsheetML puts the connection string of an olapPr
+         // element in "localConnection"; "connection" is still read so input that
+         // was reported before continues to be reported.
+         if ("olapPr".equals(localName)) {
+             emitIfPresent("olapConnection", atts.getValue("connection"));
+             emitIfPresent("olapConnection", atts.getValue("localConnection"));
+         }
+         // Text file import
+         if ("textPr".equals(localName)) {
+             emitIfPresent("textFileImport", atts.getValue("sourceFile"));
+         }
+     }
+
+     /**
+      * Emits an external-ref anchor for {@code value} unless it is null or empty.
+      */
+     private void emitIfPresent(String refType, String value) throws SAXException {
+         if (value != null && !value.isEmpty()) {
+             emitExternalRef(xhtml, refType, value);
+         }
+     }
+
+     /**
+      * @return {@code true} if a {@code connection} element was encountered
+      */
+     boolean hasConnections() {
+         return foundConnection;
+     }
+
+     /**
+      * @return {@code true} if a {@code webPr} element was encountered
+      */
+     boolean hasWebQueries() {
+         return foundWebQuery;
+     }
+ }
+
+ /**
+  * SAX handler for queryTable parts.
+  * <p>
+  * Currently a placeholder: it records and emits nothing. A queryTable only
+  * names its connection through {@code connectionId}; the connection details
+  * themselves are in connections.xml.
+  */
+ private class QueryTableHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+
+     QueryTableHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts)
+             throws SAXException {
+         // Intentionally empty: neither queryTable (connectionId) nor
+         // queryTableRefresh (refresh settings) yields anything to report.
+     }
+ }
+
+ /**
+  * SAX handler for pivotCacheDefinition parts that records whether the cache
+  * appears to be fed from outside the workbook.
+  * <p>
+  * The {@code xhtml} handler passed to the constructor is stored, but this handler
+  * does not write anything to it.
+  */
+ private class PivotCacheHandler extends DefaultHandler {
+     private final XHTMLContentHandler xhtml;
+     private boolean hasExternalData = false;
+
+     PivotCacheHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts)
+             throws SAXException {
+         // cacheSource with type="external" or type="consolidation" is flagged
+         if ("cacheSource".equals(localName)) {
+             String type = atts.getValue("type");
+             if ("external".equals(type) || "consolidation".equals(type)) {
+                 hasExternalData = true;
+             }
+         }
+         // worksheetSource is flagged only when it carries a relationship id,
+         // which likely points to an external workbook
+         if ("worksheetSource".equals(localName)) {
+             String rId = atts.getValue(
+                     "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+                     "id");
+             if (rId != null) {
+                 hasExternalData = true;
+             }
+         }
+         // consolidation source (multiple ranges, possibly external)
+         if ("consolidation".equals(localName) || "rangeSets".equals(localName)) {
+             hasExternalData = true;
+         }
+     }
+
+     /**
+      * @return {@code true} if any of the flagged elements was encountered
+      */
+     boolean hasExternalData() {
+         return hasExternalData;
+     }
 }
private void getThreadedComments(OPCPackage container, PackagePart
sheetPart, XHTMLContentHandler xhtml) throws TikaException,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index b3b0841588..2488804dbb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -60,14 +60,19 @@ import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.EMFParser;
import org.apache.tika.parser.microsoft.FormattingUtils;
@@ -84,7 +89,6 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
// Part 3, Step 3
private static final String LIST_DELIMITER = " ";
-
//include all parts that might have embedded objects
private final static String[] MAIN_PART_RELATIONS =
new String[]{XWPFRelation.HEADER.getRelation(),
XWPFRelation.FOOTER.getRelation(),
@@ -240,8 +244,54 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
//hyperlinks may or may not have hyperlink ids
String lastHyperlinkId = null;
boolean inHyperlink = false;
+ // Track field-based hyperlinks (using instrText/fldChar)
+ FieldHyperlinkTracker fieldTracker = new FieldHyperlinkTracker();
+ boolean inFieldHyperlink = false;
+
// Do the iruns
for (IRunElement run : paragraph.getIRuns()) {
+ // Check for field-based hyperlinks first (instrText HYPERLINK)
+ if (run instanceof XWPFRun) {
+ XWPFRun xwpfRun = (XWPFRun) run;
+ boolean wasInFieldHyperlink =
fieldTracker.isInFieldHyperlink();
+ String fieldUrl = extractFieldLinks(xwpfRun, fieldTracker);
+
+ // If we just entered a field hyperlink, open the anchor tag
+ if (fieldUrl != null && !inFieldHyperlink) {
+ // Close any existing relationship-based hyperlink first
+ if (inHyperlink) {
+ FormattingUtils.closeStyleTags(xhtml, formattingState);
+ xhtml.endElement("a");
+ inHyperlink = false;
+ lastHyperlinkId = null;
+ }
+ FormattingUtils.closeStyleTags(xhtml, formattingState);
+ xhtml.startElement("a", "href", fieldUrl);
+ inFieldHyperlink = true;
+ metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+ }
+
+ // If we just exited a field hyperlink, close the anchor tag
+ if (wasInFieldHyperlink && !fieldTracker.isInFieldHyperlink()
&& inFieldHyperlink) {
+ FormattingUtils.closeStyleTags(xhtml, formattingState);
+ xhtml.endElement("a");
+ inFieldHyperlink = false;
+ }
+
+ // Emit any external refs (INCLUDEPICTURE, INCLUDETEXT,
IMPORT, LINK) as anchors
+ if (fieldTracker.getLastExternalRefUrl() != null) {
+ AttributesImpl extRefAtts = new AttributesImpl();
+ extRefAtts.addAttribute("", "class", "class", "CDATA",
+ "external-ref-" +
fieldTracker.getLastExternalRefType());
+ extRefAtts.addAttribute("", "href", "href", "CDATA",
+ fieldTracker.getLastExternalRefUrl());
+ xhtml.startElement("a", extRefAtts);
+ xhtml.endElement("a");
+ metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
+ fieldTracker.clearExternalRef();
+ }
+ }
+
if (run instanceof XWPFHyperlinkRun) {
XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
if (hyperlinkRun.getHyperlinkId() == null ||
@@ -285,6 +335,9 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
if (inHyperlink) {
xhtml.endElement("a");
}
+ if (inFieldHyperlink) {
+ xhtml.endElement("a");
+ }
// Now do any comments for the paragraph
@@ -469,6 +522,46 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.characters(run.getContent().getText());
}
+ /**
+  * Feeds the {@code fldChar} and {@code instrText} children of a run to the
+  * field tracker, in document order.
+  * <p>
+  * {@code begin} and {@code end} field characters call
+  * {@code tracker.startField()} and {@code tracker.endField()}; instruction
+  * text is appended with {@code tracker.addInstrText(...)}. On a
+  * {@code separate} field character the method returns immediately with the
+  * result of {@code tracker.separate()}, so any later children of the same run
+  * are not visited.
+  *
+  * @param run     the run to examine
+  * @param tracker the field hyperlink tracker maintaining state across runs
+  * @return the value of {@code tracker.separate()} if the run contains a
+  *         {@code separate} field character (the caller treats a non-null value
+  *         as the hyperlink URL); {@code null} otherwise
+  */
+ private String extractFieldLinks(XWPFRun run, FieldHyperlinkTracker tracker) {
+     CTR runXml = run.getCTR();
+     try (XmlCursor children = runXml.newCursor()) {
+         for (boolean more = children.toFirstChild(); more;
+              more = children.toNextSibling()) {
+             String elementName = children.getName().getLocalPart();
+
+             if ("instrText".equals(elementName)) {
+                 XmlObject textNode = children.getObject();
+                 if (textNode instanceof CTText) {
+                     tracker.addInstrText(((CTText) textNode).getStringValue());
+                 }
+                 continue;
+             }
+             if (!"fldChar".equals(elementName)) {
+                 continue;
+             }
+
+             XmlObject fieldNode = children.getObject();
+             if (!(fieldNode instanceof CTFldChar)) {
+                 continue;
+             }
+             STFldCharType.Enum marker = ((CTFldChar) fieldNode).getFldCharType();
+             if (marker == STFldCharType.BEGIN) {
+                 tracker.startField();
+             } else if (marker == STFldCharType.SEPARATE) {
+                 return tracker.separate();
+             } else if (marker == STFldCharType.END) {
+                 tracker.endField();
+             }
+         }
+     }
+     return null;
+ }
+
private void extractTable(XWPFTable table, XWPFListManager listManager,
XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 72767fa15c..2950e46be3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -201,6 +201,11 @@ public class XSLFEventBasedPowerPointExtractor implements
POIXMLTextExtractor {
//no-op
}
+ // Intentionally empty: this extractor takes no action for linked OLE reference ids.
+ @Override
+ public void linkedOLERef(String refId) {
+ //no-op
+ }
+
@Override
public void embeddedPicRef(String picFileName, String picDescription) {
//no-op
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index e23a3f7e76..8056f26bfb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -362,6 +362,11 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
//no-op
}
+ // Intentionally empty: this extractor takes no action for linked OLE reference ids.
+ @Override
+ public void linkedOLERef(String refId) {
+ //no-op
+ }
+
@Override
public void embeddedPicRef(String picFileName, String picDescription) {
//no-op
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 211d05b27a..18f408d452 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -588,4 +588,47 @@ public class ExcelParserTest extends TikaTest {
assertEquals("true", m.get(Office.HAS_COMMENTS));
assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
}
+
+ /**
+  * Verifies that external data connections in an XLSX file are both flagged in
+  * the metadata and written to the XHTML as {@code external-ref-*} anchors.
+  * Such connections can be used to exfiltrate data or load malicious content.
+  */
+ @Test
+ public void testDataConnections() throws Exception {
+     // Metadata flags on the container document
+     Metadata container = getRecursiveMetadata("testDataConnections.xlsx").get(0);
+     assertEquals("true", container.get(Office.HAS_DATA_CONNECTIONS));
+     assertEquals("true", container.get(Office.HAS_WEB_QUERIES));
+
+     String xhtml = getXML("testDataConnections.xlsx").xml;
+
+     // Web query
+     assertContains("class=\"external-ref-webQuery\"", xhtml);
+     assertContains("http://example.com/data.html", xhtml);
+
+     // Database connection
+     assertContains("class=\"external-ref-dbConnection\"", xhtml);
+     assertContains("db.example.org", xhtml);
+
+     // Text file import
+     assertContains("class=\"external-ref-textFileImport\"", xhtml);
+     assertContains("http://example.net/data.csv", xhtml);
+ }
+
+ /**
+  * Verifies that DDE (Dynamic Data Exchange) links in an XLSX file are flagged
+  * and emitted. DDE links are a security risk because they can execute commands.
+  */
+ @Test
+ public void testDdeLinks() throws Exception {
+     Metadata container = getRecursiveMetadata("testDdeLink.xlsx").get(0);
+     // The DDE flag itself
+     assertEquals("true", container.get(Office.HAS_DDE_LINKS));
+     // DDE links live in externalLink parts, so that flag must be set as well
+     assertEquals("true", container.get(Office.HAS_EXTERNAL_LINKS));
+
+     String xhtml = getXML("testDdeLink.xlsx").xml;
+     // Anchor is written in service|topic form
+     assertContains("class=\"external-ref-ddeLink\"", xhtml);
+     assertContains("cmd|", xhtml);
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 2538f3b7b2..45e2caabcd 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1814,4 +1814,43 @@ public class OOXMLParserTest extends
MultiThreadedTikaTest {
String content = getText("testRecordSizeExceeded.xlsx");
assertContains("Repetitive content pattern 3 for compression test row
1", content);
}
+
+ /**
+  * Verifies that a hyperlink stored as a HYPERLINK field code
+  * (instrText/fldChar) rather than as a relationship is extracted as an anchor.
+  * Uses the DOM-based XWPFWordExtractorDecorator.
+  */
+ @Test
+ public void testInstrTextHyperlink() throws Exception {
+     String xhtml = getXML("testInstrLink.docx").xml;
+     // NOTE(review): the "exmaple" spelling presumably mirrors the URL stored in
+     // the test document - confirm before "correcting" it.
+     assertContains("<a href=\"https://exmaple.com/file\">", xhtml);
+     assertContains("Access Document(s)", xhtml);
+ }
+
+ /**
+  * Verifies that external-reference field codes (INCLUDEPICTURE, INCLUDETEXT,
+  * IMPORT, LINK) are flagged in the metadata and emitted as
+  * {@code external-ref-*} anchors. These can be used to hide malicious URLs in
+  * documents.
+  */
+ @Test
+ public void testExternalRefFieldCodes() throws Exception {
+     Metadata container = getRecursiveMetadata("testExternalRefs.docx").get(0);
+     assertEquals("true", container.get(Office.HAS_FIELD_HYPERLINKS));
+
+     String xhtml = getXML("testExternalRefs.docx").xml;
+
+     // INCLUDEPICTURE
+     assertContains("class=\"external-ref-INCLUDEPICTURE\"", xhtml);
+     assertContains("http://example.com/tracking.png", xhtml);
+
+     // INCLUDETEXT
+     assertContains("class=\"external-ref-INCLUDETEXT\"", xhtml);
+     assertContains("http://example.org/payload.txt", xhtml);
+
+     // IMPORT
+     assertContains("class=\"external-ref-IMPORT\"", xhtml);
+     assertContains("http://example.net/exploit.wmf", xhtml);
+
+     // LINK
+     assertContains("class=\"external-ref-LINK\"", xhtml);
+     assertContains("http://test.invalid/cmd.docx", xhtml);
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 7653840e60..2ae7d2c7f7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -848,4 +848,113 @@ public class SXWPFExtractorTest extends TikaTest {
assertContainsCount("inside-text", xml, 1);
}
+ /**
+ * Verifies that a HYPERLINK field code (instrText/fldChar) is output as an anchor with its text.
+ * Such links are embedded as field codes rather than as relationship-based hyperlinks.
+ */
+ @Test
+ public void testInstrTextHyperlink() throws Exception {
+ String xml = getXML("testInstrLink.docx", parseContext).xml;
+ // HYPERLINK field code in instrText; "exmaple" presumably mirrors the fixture's URL (confirm)
+ assertContains("<a href=\"https://exmaple.com/file\">", xml);
+ assertContains("Access Document(s)", xml);
+ }
+
+ /**
+ * Verifies that INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK field codes set HAS_FIELD_HYPERLINKS
+ * and appear in the XML output with their URLs; such codes can hide malicious URLs in documents.
+ */
+ @Test
+ public void testExternalRefFieldCodes() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testExternalRefs.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ // The first metadata entry must carry the field-hyperlink flag
+ assertEquals("true", m.get(Office.HAS_FIELD_HYPERLINKS));
+
+ String xml = getXML("testExternalRefs.docx", parseContext).xml;
+ // INCLUDEPICTURE: expect the marker class and the referenced URL
+ assertContains("class=\"external-ref-INCLUDEPICTURE\"", xml);
+ assertContains("http://example.com/tracking.png", xml);
+ // INCLUDETEXT: expect the marker class and the referenced URL
+ assertContains("class=\"external-ref-INCLUDETEXT\"", xml);
+ assertContains("http://example.org/payload.txt", xml);
+ // IMPORT: expect the marker class and the referenced URL
+ assertContains("class=\"external-ref-IMPORT\"", xml);
+ assertContains("http://example.net/exploit.wmf", xml);
+ // LINK: expect the marker class and the referenced URL
+ assertContains("class=\"external-ref-LINK\"", xml);
+ assertContains("http://test.invalid/cmd.docx", xml);
+ }
+
+ /**
+ * Verifies that hlinkHover (hover) hyperlinks and VML shape hrefs set their metadata flags and
+ * appear in the XML output; both are sneaky ways to hide malicious URLs.
+ */
+ @Test
+ public void testHoverAndVmlHyperlinks() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testHoverAndVml.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ // Both the hover and the VML flag must be set on the first metadata entry
+ assertEquals("true", m.get(Office.HAS_HOVER_HYPERLINKS));
+ assertEquals("true", m.get(Office.HAS_VML_HYPERLINKS));
+
+ String xml = getXML("testHoverAndVml.docx", parseContext).xml;
+ // hlinkHover (activates on mouse hover, not click): expect marker class and URL
+ assertContains("class=\"external-ref-hlinkHover\"", xml);
+ assertContains("http://hover.example.com/phishing", xml);
+ // VML shape href: expect marker class and URL
+ assertContains("class=\"external-ref-vml-shape-href\"", xml);
+ assertContains("http://vml.example.org/shape-link", xml);
+ }
+
+ /**
+ * Verifies that testMailMerge.docx is flagged with HAS_MAIL_MERGE in its first metadata entry.
+ * Mail merge can reference external data sources.
+ */
+ @Test
+ public void testMailMerge() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testMailMerge.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ assertEquals("true", m.get(Office.HAS_MAIL_MERGE));
+ }
+
+ /**
+ * Verifies that an attached template sets HAS_ATTACHED_TEMPLATE and appears in the XML output.
+ * Templates can be fetched from malicious URLs.
+ */
+ @Test
+ public void testAttachedTemplate() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testAttachedTemplate.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ assertEquals("true", m.get(Office.HAS_ATTACHED_TEMPLATE));
+
+ String xml = getXML("testAttachedTemplate.docx", parseContext).xml;
+ assertContains("class=\"external-ref-attachedTemplate\"", xml);
+ assertContains("example.com/templates", xml);
+ }
+
+ /**
+ * Verifies that subdocuments (master document linking external docs) are flagged and output.
+ */
+ @Test
+ public void testSubdocument() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testSubdocument.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ assertEquals("true", m.get(Office.HAS_SUBDOCUMENTS));
+
+ String xml = getXML("testSubdocument.docx", parseContext).xml;
+ assertContains("class=\"external-ref-subDocument\"", xml);
+ assertContains("example.org/chapters", xml);
+ }
+
+ /**
+ * Verifies that framesets (HTML frames loading external URLs) set the HAS_FRAMESETS flag.
+ */
+ @Test
+ public void testFrameset() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testFrameset.docx", parseContext);
+ Metadata m = metadataList.get(0);
+ assertEquals("true", m.get(Office.HAS_FRAMESETS));
+ }
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx
new file mode 100644
index 0000000000..768258ad11
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAttachedTemplate.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx
new file mode 100644
index 0000000000..af76b99347
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx
new file mode 100644
index 0000000000..be4912b4b5
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDdeLink.xlsx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx
new file mode 100644
index 0000000000..8b8d3c1adc
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx
new file mode 100644
index 0000000000..d19070fe07
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFrameset.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx
new file mode 100644
index 0000000000..2b43e1e047
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
new file mode 100644
index 0000000000..3b2fc9257b
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx
new file mode 100644
index 0000000000..e0c8f00b03
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testMailMerge.docx
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx
new file mode 100644
index 0000000000..7bf396e35b
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testSubdocument.docx
differ