This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4646-instrText-hyperlinks
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 55544385482aa19fa26437e05296e02e71804d5e
Author: tallison <[email protected]>
AuthorDate: Mon Feb 2 14:20:57 2026 -0500

    TIKA-4646 -- extract hyperlinks from instrText fields in ooxml docx
---
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  57 ++++++++
 .../ooxml/XWPFWordExtractorDecorator.java          | 147 +++++++++++++++++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  13 ++
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java |  12 ++
 .../resources/test-documents/testInstrLink.docx    | Bin 0 -> 14464 bytes
 5 files changed, 229 insertions(+)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index a2e940b587..7c7068d36b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import java.util.Date;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.Attributes;
@@ -108,6 +110,11 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private final static String MOVE_TO = "moveTo";
     private final static String ENDNOTE_REFERENCE = "endnoteReference";
     private static final String TEXTBOX = "textbox";
+    private final static String FLD_CHAR = "fldChar";
+    private final static String INSTR_TEXT = "instrText";
+    private final static String FLD_CHAR_TYPE = "fldCharType";
+    private static final Pattern HYPERLINK_PATTERN =
+            Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
     private final XWPFBodyContentsHandler bodyContentsHandler;
     private final Map<String, String> linkedRelationships;
     private final RunProperties currRunProperties = new RunProperties();
@@ -145,6 +152,11 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
     private boolean inHlinkClick = false;
     private boolean inTextBox = false;
     private boolean inV = false; //in c:v in chart file
+    // Field code tracking for instrText-based hyperlinks
+    private boolean inField = false;
+    private boolean inInstrText = false;
+    private boolean inFieldHyperlink = false;
+    private final StringBuilder instrTextBuffer = new StringBuilder();
     private OOXMLWordAndPowerPointTextHandler.EditType editType =
             OOXMLWordAndPowerPointTextHandler.EditType.NONE;
     private DateUtils dateUtils = new DateUtils();
@@ -341,6 +353,28 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             if ("0".equals(val) || "false".equals(val)) {
                 hiddenSlide = true;
             }
+        } else if (FLD_CHAR.equals(localName)) {
+            String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE);
+            if ("begin".equals(fldCharType)) {
+                inField = true;
+                instrTextBuffer.setLength(0);
+            } else if ("separate".equals(fldCharType)) {
+                // Parse instrText for HYPERLINK
+                String url = 
parseHyperlinkFromInstrText(instrTextBuffer.toString());
+                if (url != null) {
+                    bodyContentsHandler.hyperlinkStart(url);
+                    inFieldHyperlink = true;
+                }
+            } else if ("end".equals(fldCharType)) {
+                if (inFieldHyperlink) {
+                    bodyContentsHandler.hyperlinkEnd();
+                    inFieldHyperlink = false;
+                }
+                inField = false;
+                instrTextBuffer.setLength(0);
+            }
+        } else if (INSTR_TEXT.equals(localName)) {
+            inInstrText = true;
         }
 
     }
@@ -376,6 +410,24 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         return -1;
     }
 
+    /**
+     * Pulls the target of a HYPERLINK field out of accumulated instrText content.
+     * A typical field code looks like: HYPERLINK "https://example.com"
+     * Only the quoted value that directly follows HYPERLINK and whitespace is returned.
+     *
+     * @param instrText the accumulated instrText content
+     * @return the URL if found, or null
+     */
+    private String parseHyperlinkFromInstrText(String instrText) {
+        if (instrText != null && !instrText.isEmpty()) {
+            Matcher hyperlink = HYPERLINK_PATTERN.matcher(instrText.trim());
+            if (hyperlink.find()) {
+                return hyperlink.group(1);
+            }
+        }
+        return null;
+    }
+
     @Override
     public void endElement(String uri, String localName, String qName) throws 
SAXException {
 
@@ -441,6 +493,8 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
             inRt = false;
         } else if (RUBY.equals(localName)) {
             handleEndOfRuby();
+        } else if (INSTR_TEXT.equals(localName)) {
+            inInstrText = false;
         }
     }
 
@@ -498,6 +552,9 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         } else if (inV) {
             appendToBuffer(ch, start, length);
             appendToBuffer(TAB_CHAR, 0, 1);
+        } else if (inInstrText && inField) {
+            // Accumulate instrText content for field code parsing (e.g., 
HYPERLINK)
+            instrTextBuffer.append(ch, start, length);
         }
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index b3b0841588..76229b5fbf 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -23,6 +23,8 @@ import java.util.Deque;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import javax.xml.namespace.QName;
 
 import com.microsoft.schemas.vml.impl.CTShapeImpl;
@@ -60,9 +62,13 @@ import org.apache.xmlbeans.XmlCursor;
 import org.apache.xmlbeans.XmlException;
 import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
@@ -84,6 +90,9 @@ public class XWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
     // Part 3, Step 3
     private static final String LIST_DELIMITER = " ";
 
+    // Pattern to extract HYPERLINK URL from instrText field codes
+    private static final Pattern HYPERLINK_PATTERN =
+            Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", 
Pattern.CASE_INSENSITIVE);
 
     //include all parts that might have embedded objects
     private final static String[] MAIN_PART_RELATIONS =
@@ -240,8 +249,40 @@ public class XWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
         //hyperlinks may or may not have hyperlink ids
         String lastHyperlinkId = null;
         boolean inHyperlink = false;
+        // Track field-based hyperlinks (using instrText/fldChar)
+        FieldHyperlinkTracker fieldTracker = new FieldHyperlinkTracker();
+        boolean inFieldHyperlink = false;
+
         // Do the iruns
         for (IRunElement run : paragraph.getIRuns()) {
+            // Check for field-based hyperlinks first (instrText HYPERLINK)
+            if (run instanceof XWPFRun) {
+                XWPFRun xwpfRun = (XWPFRun) run;
+                boolean wasInFieldHyperlink = 
fieldTracker.isInFieldHyperlink();
+                String fieldUrl = extractFieldLinks(xwpfRun, fieldTracker);
+
+                // If we just entered a field hyperlink, open the anchor tag
+                if (fieldUrl != null && !inFieldHyperlink) {
+                    // Close any existing relationship-based hyperlink first
+                    if (inHyperlink) {
+                        FormattingUtils.closeStyleTags(xhtml, formattingState);
+                        xhtml.endElement("a");
+                        inHyperlink = false;
+                        lastHyperlinkId = null;
+                    }
+                    FormattingUtils.closeStyleTags(xhtml, formattingState);
+                    xhtml.startElement("a", "href", fieldUrl);
+                    inFieldHyperlink = true;
+                }
+
+                // If we just exited a field hyperlink, close the anchor tag
+                if (wasInFieldHyperlink && !fieldTracker.isInFieldHyperlink() 
&& inFieldHyperlink) {
+                    FormattingUtils.closeStyleTags(xhtml, formattingState);
+                    xhtml.endElement("a");
+                    inFieldHyperlink = false;
+                }
+            }
+
             if (run instanceof XWPFHyperlinkRun) {
                 XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
                 if (hyperlinkRun.getHyperlinkId() == null ||
@@ -285,6 +326,9 @@ public class XWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
         if (inHyperlink) {
             xhtml.endElement("a");
         }
+        if (inFieldHyperlink) {
+            xhtml.endElement("a");
+        }
 
 
         // Now do any comments for the paragraph
@@ -469,6 +513,109 @@ public class XWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
         xhtml.characters(run.getContent().getText());
     }
 
+    /**
+     * Extracts field-based hyperlinks from a run by examining fldChar and instrText elements.
+     * This handles HYPERLINK field codes that are not relationship-based.
+     * Every child of the run is visited, so a fldChar "end" that follows a "separate"
+     * in the same run still reaches the tracker.
+     *
+     * @param run the run to examine
+     * @param tracker the field hyperlink tracker maintaining state across runs
+     * @return the hyperlink URL if this run starts a hyperlink, null otherwise
+     */
+    private String extractFieldLinks(XWPFRun run, FieldHyperlinkTracker tracker) {
+        String startedUrl = null;
+        try (XmlCursor cursor = run.getCTR().newCursor()) {
+            for (boolean more = cursor.toFirstChild(); more; more = cursor.toNextSibling()) {
+                String localName = cursor.getName().getLocalPart();
+                if ("fldChar".equals(localName)) {
+                    XmlObject obj = cursor.getObject();
+                    if (obj instanceof CTFldChar) {
+                        STFldCharType.Enum fldType = ((CTFldChar) obj).getFldCharType();
+                        if (fldType == STFldCharType.BEGIN) {
+                            tracker.startField();
+                        } else if (fldType == STFldCharType.SEPARATE) {
+                            // Do not return here: later siblings must still reach the tracker
+                            String url = tracker.separate();
+                            startedUrl = startedUrl == null ? url : startedUrl;
+                        } else if (fldType == STFldCharType.END) {
+                            tracker.endField();
+                        }
+                    }
+                } else if ("instrText".equals(localName)) {
+                    XmlObject obj = cursor.getObject();
+                    if (obj instanceof CTText) {
+                        tracker.addInstrText(((CTText) obj).getStringValue());
+                    }
+                }
+            }
+        }
+        return startedUrl;
+    }
+
+    /**
+     * Pulls the target of a HYPERLINK field out of accumulated instrText content,
+     * i.e. the quoted value that directly follows HYPERLINK and whitespace.
+     *
+     * @param instrText the accumulated instrText content
+     * @return the URL if found, or null
+     */
+    private static String parseHyperlinkFromInstrText(String instrText) {
+        if (instrText != null && !instrText.isEmpty()) {
+            Matcher hyperlink = HYPERLINK_PATTERN.matcher(instrText.trim());
+            if (hyperlink.find()) {
+                return hyperlink.group(1);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Holds field hyperlink state while the runs of a paragraph are walked one by one.
+     * A field code is spread over runs: begin -> instrText -> separate -> text runs -> end
+     */
+    private static class FieldHyperlinkTracker {
+        private final StringBuilder instrTextBuffer = new StringBuilder();
+        private boolean inField = false;
+        private boolean inFieldHyperlink = false;
+
+        void startField() {
+            instrTextBuffer.setLength(0);
+            inField = true;
+        }
+
+        void addInstrText(String text) {
+            if (text != null && inField) {
+                instrTextBuffer.append(text);
+            }
+        }
+
+        /**
+         * Handles a fldChar separate by parsing the instrText collected so far.
+         * @return the hyperlink URL if this is a HYPERLINK field, null otherwise
+         */
+        String separate() {
+            if (!inField) {
+                return null;
+            }
+            String url = parseHyperlinkFromInstrText(instrTextBuffer.toString());
+            if (url != null) {
+                inFieldHyperlink = true;
+            }
+            return url;
+        }
+
+        void endField() {
+            instrTextBuffer.setLength(0);
+            inFieldHyperlink = false;
+            inField = false;
+        }
+
+        boolean isInFieldHyperlink() {
+            return inFieldHyperlink;
+        }
+    }
+
     private void extractTable(XWPFTable table, XWPFListManager listManager,
                               XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 2538f3b7b2..c65fb64f03 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1814,4 +1814,17 @@ public class OOXMLParserTest extends 
MultiThreadedTikaTest {
         String content = getText("testRecordSizeExceeded.xlsx");
         assertContains("Repetitive content pattern 3 for compression test row 
1", content);
     }
+
+    /**
+     * Test extraction of field-based hyperlinks using instrText/fldChar.
+     * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks.
+     * Uses the DOM-based XWPFWordExtractorDecorator.
+     */
+    @Test
+    public void testInstrTextHyperlink() throws Exception {
+        String xml = getXML("testInstrLink.docx").xml;
+        // The document contains a HYPERLINK field code in instrText
+        assertContains("<a href=\"https://exmaple.com/file\">", xml);
+        assertContains("Access Document(s)", xml);
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 7653840e60..54df3f4761 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -848,4 +848,16 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContainsCount("inside-text", xml, 1);
     }
 
+    /**
+     * Test extraction of field-based hyperlinks using instrText/fldChar.
+     * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks.
+     */
+    @Test
+    public void testInstrTextHyperlink() throws Exception {
+        String xml = getXML("testInstrLink.docx", parseContext).xml;
+        // The document contains a HYPERLINK field code in instrText
+        assertContains("<a href=\"https://exmaple.com/file\">", xml);
+        assertContains("Access Document(s)", xml);
+    }
+
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
new file mode 100644
index 0000000000..3b2fc9257b
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx
 differ

Reply via email to