This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4646-instrText-hyperlinks in repository https://gitbox.apache.org/repos/asf/tika.git
commit 55544385482aa19fa26437e05296e02e71804d5e Author: tallison <[email protected]> AuthorDate: Mon Feb 2 14:20:57 2026 -0500 TIKA-4646 -- extract hyperlinks from instrText fields in ooxml docx --- .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 57 ++++++++ .../ooxml/XWPFWordExtractorDecorator.java | 147 +++++++++++++++++++++ .../parser/microsoft/ooxml/OOXMLParserTest.java | 13 ++ .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 12 ++ .../resources/test-documents/testInstrLink.docx | Bin 0 -> 14464 bytes 5 files changed, 229 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java index a2e940b587..7c7068d36b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java @@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft.ooxml; import java.util.Date; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.Attributes; @@ -108,6 +110,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private final static String MOVE_TO = "moveTo"; private final static String ENDNOTE_REFERENCE = "endnoteReference"; private static final String TEXTBOX = "textbox"; + private final static String FLD_CHAR = "fldChar"; + private final static String INSTR_TEXT = "instrText"; + private final static String FLD_CHAR_TYPE = "fldCharType"; + private static final Pattern HYPERLINK_PATTERN = + Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); private final XWPFBodyContentsHandler bodyContentsHandler; private final Map<String, String> linkedRelationships; private final RunProperties currRunProperties = new RunProperties(); @@ -145,6 +152,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { private boolean inHlinkClick = false; private boolean inTextBox = false; private boolean inV = false; //in c:v in chart file + // Field code tracking for instrText-based hyperlinks + private boolean inField = false; + private boolean inInstrText = false; + private boolean inFieldHyperlink = false; + private final StringBuilder instrTextBuffer = new StringBuilder(); private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE; private DateUtils dateUtils = new DateUtils(); @@ -341,6 +353,28 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { if ("0".equals(val) || "false".equals(val)) { hiddenSlide = true; } + } else if (FLD_CHAR.equals(localName)) { + String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE); + if ("begin".equals(fldCharType)) { + inField = true; + instrTextBuffer.setLength(0); + } else if ("separate".equals(fldCharType)) { + // Parse instrText for HYPERLINK + String url = parseHyperlinkFromInstrText(instrTextBuffer.toString()); + if (url != null) { + bodyContentsHandler.hyperlinkStart(url); + inFieldHyperlink = true; + } + } else if ("end".equals(fldCharType)) { + if (inFieldHyperlink) { + bodyContentsHandler.hyperlinkEnd(); + inFieldHyperlink = false; + } + inField = false; + instrTextBuffer.setLength(0); + } + } else if (INSTR_TEXT.equals(localName)) { + inInstrText = true; } } @@ -376,6 +410,24 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { return -1; } + /** + * Parses a HYPERLINK URL from instrText field code content. + * Field codes like: HYPERLINK "https://example.com" + * + * @param instrText the accumulated instrText content + * @return the URL if found, or null + */ + private String parseHyperlinkFromInstrText(String instrText) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); + if (m.find()) { + return m.group(1); + } + return null; + } + @Override public void endElement(String uri, String localName, String qName) throws SAXException { @@ -441,6 +493,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { inRt = false; } else if (RUBY.equals(localName)) { handleEndOfRuby(); + } else if (INSTR_TEXT.equals(localName)) { + inInstrText = false; } } @@ -498,6 +552,9 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler { } else if (inV) { appendToBuffer(ch, start, length); appendToBuffer(TAB_CHAR, 0, 1); + } else if (inInstrText && inField) { + // Accumulate instrText content for field code parsing (e.g., HYPERLINK) + instrTextBuffer.append(ch, start, length); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index b3b0841588..76229b5fbf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -23,6 +23,8 @@ import java.util.Deque; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.xml.namespace.QName; import com.microsoft.schemas.vml.impl.CTShapeImpl; @@ -60,9 +62,13 @@ import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -84,6 +90,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // Part 3, Step 3 private static final String LIST_DELIMITER = " "; + // Pattern to extract HYPERLINK URL from instrText field codes + private static final Pattern HYPERLINK_PATTERN = + Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE); //include all parts that might have embedded objects private final static String[] MAIN_PART_RELATIONS = @@ -240,8 +249,40 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { //hyperlinks may or may not have hyperlink ids String lastHyperlinkId = null; boolean inHyperlink = false; + // Track field-based hyperlinks (using instrText/fldChar) + FieldHyperlinkTracker fieldTracker = new FieldHyperlinkTracker(); + boolean inFieldHyperlink = false; + // Do the iruns for (IRunElement run : paragraph.getIRuns()) { + // Check for field-based hyperlinks first (instrText HYPERLINK) + if (run instanceof XWPFRun) { + XWPFRun xwpfRun = (XWPFRun) run; + boolean wasInFieldHyperlink = fieldTracker.isInFieldHyperlink(); + String fieldUrl = extractFieldLinks(xwpfRun, fieldTracker); + + // If we just entered a field hyperlink, open the anchor tag + if (fieldUrl != null && !inFieldHyperlink) { + // Close any existing relationship-based hyperlink first + if (inHyperlink) { + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.endElement("a"); + inHyperlink = false; + lastHyperlinkId = null; + } + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.startElement("a", "href", fieldUrl); + inFieldHyperlink = true; + } + + // If we just exited a field hyperlink, close the anchor tag + if (wasInFieldHyperlink && !fieldTracker.isInFieldHyperlink() && inFieldHyperlink) { + FormattingUtils.closeStyleTags(xhtml, formattingState); + xhtml.endElement("a"); + inFieldHyperlink = false; + } + } + if (run instanceof XWPFHyperlinkRun) { XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run; if (hyperlinkRun.getHyperlinkId() == null || @@ -285,6 +326,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { if (inHyperlink) { xhtml.endElement("a"); } + if (inFieldHyperlink) { + xhtml.endElement("a"); + } // Now do any comments for the paragraph @@ -469,6 +513,109 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { xhtml.characters(run.getContent().getText()); } + /** + * Extracts field-based hyperlinks from a run by examining fldChar and instrText elements. + * This handles HYPERLINK field codes that are not relationship-based. + * + * @param run the run to examine + * @param tracker the field hyperlink tracker maintaining state across runs + * @return the hyperlink URL if this run starts a hyperlink, null otherwise + */ + private String extractFieldLinks(XWPFRun run, FieldHyperlinkTracker tracker) { + CTR ctr = run.getCTR(); + try (XmlCursor cursor = ctr.newCursor()) { + if (cursor.toFirstChild()) { + do { + String localName = cursor.getName().getLocalPart(); + if ("fldChar".equals(localName)) { + XmlObject obj = cursor.getObject(); + if (obj instanceof CTFldChar) { + CTFldChar fldChar = (CTFldChar) obj; + STFldCharType.Enum fldType = fldChar.getFldCharType(); + if (fldType == STFldCharType.BEGIN) { + tracker.startField(); + } else if (fldType == STFldCharType.SEPARATE) { + return tracker.separate(); + } else if (fldType == STFldCharType.END) { + tracker.endField(); + } + } + } else if ("instrText".equals(localName)) { + XmlObject obj = cursor.getObject(); + if (obj instanceof CTText) { + CTText text = (CTText) obj; + tracker.addInstrText(text.getStringValue()); + } + } + } while (cursor.toNextSibling()); + } + } + return null; + } + + /** + * Parses a HYPERLINK URL from instrText field code content. + * + * @param instrText the accumulated instrText content + * @return the URL if found, or null + */ + private static String parseHyperlinkFromInstrText(String instrText) { + if (instrText == null || instrText.isEmpty()) { + return null; + } + Matcher m = HYPERLINK_PATTERN.matcher(instrText.trim()); + if (m.find()) { + return m.group(1); + } + return null; + } + + /** + * Tracks field hyperlink state across multiple runs within a paragraph. + * Field codes span multiple runs: begin -> instrText -> separate -> text runs -> end + */ + private static class FieldHyperlinkTracker { + private boolean inField = false; + private boolean inFieldHyperlink = false; + private final StringBuilder instrTextBuffer = new StringBuilder(); + + void startField() { + inField = true; + instrTextBuffer.setLength(0); + } + + void addInstrText(String text) { + if (inField && text != null) { + instrTextBuffer.append(text); + } + } + + /** + * Called when fldChar separate is encountered. + * @return the hyperlink URL if this is a HYPERLINK field, null otherwise + */ + String separate() { + if (inField) { + String url = parseHyperlinkFromInstrText(instrTextBuffer.toString()); + if (url != null) { + inFieldHyperlink = true; + return url; + } + } + return null; + } + + void endField() { + inField = false; + inFieldHyperlink = false; + instrTextBuffer.setLength(0); + } + + boolean isInFieldHyperlink() { + return inFieldHyperlink; + } + } + private void extractTable(XWPFTable table, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 2538f3b7b2..c65fb64f03 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1814,4 +1814,17 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { String content = getText("testRecordSizeExceeded.xlsx"); assertContains("Repetitive content pattern 3 for compression test row 1", content); } + + /** + * Test extraction of field-based hyperlinks using instrText/fldChar. + * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks. + * Uses the DOM-based XWPFWordExtractorDecorator. + */ + @Test + public void testInstrTextHyperlink() throws Exception { + String xml = getXML("testInstrLink.docx").xml; + // The document contains a HYPERLINK field code in instrText + assertContains("<a href=\"https://exmaple.com/file\">", xml); + assertContains("Access Document(s)", xml); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java index 7653840e60..54df3f4761 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java @@ -848,4 +848,16 @@ public class SXWPFExtractorTest extends TikaTest { assertContainsCount("inside-text", xml, 1); } + /** + * Test extraction of field-based hyperlinks using instrText/fldChar. + * These are hyperlinks embedded as field codes rather than relationship-based hyperlinks. + */ + @Test + public void testInstrTextHyperlink() throws Exception { + String xml = getXML("testInstrLink.docx", parseContext).xml; + // The document contains a HYPERLINK field code in instrText + assertContains("<a href=\"https://exmaple.com/file\">", xml); + assertContains("Access Document(s)", xml); + } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx new file mode 100644 index 0000000000..3b2fc9257b Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testInstrLink.docx differ
