This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4744
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 92220e060f68fc8e60bb1fa2071d418df94b2244
Author: tallison <[email protected]>
AuthorDate: Thu May 28 10:05:53 2026 -0400

    TIKA-4744 - fix rtf tags
---
 .../tika/parser/microsoft/rtf/TextExtractor.java   | 37 +++++++++++++++++++---
 .../tika/parser/microsoft/rtf/RTFParserTest.java   | 17 ++++++++++
 .../testRTF_nestedHyperlinkPageRef.rtf             |  9 ++++++
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 72913ae437..b4ad6ac544 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -311,6 +311,12 @@ final class TextExtractor {
     // Non-null if we've seen the url for a HYPERLINK but not yet
     // its text:
     private String pendingURL;
+    // Group depth at which the current <a href=...> was opened (in
+    // groupState.depth units). Used to defer </a> emission until we leave
+    // the fldrslt group that opened it, so a nested \field (e.g., PAGEREF
+    // inside a HYPERLINK's fldrslt) doesn't prematurely close the outer <a>
+    // via the fieldState==3 branch in processGroupEnd. -1 means no <a> open.
+    private int hyperlinkAnchorDepth = -1;
     // Used to process the sub-groups inside the upr
     // group:
     private int uprState = -1;
@@ -1365,8 +1371,18 @@ final class TextExtractor {
                 addOutputChar('\u201D');
             }
         } else if (equals("fldinst")) {
-            fieldState = 1;
-            groupState.ignore = false;
+            if (fieldState == 0) {
+                fieldState = 1;
+                groupState.ignore = false;
+            } else {
+                // Nested \fldinst inside an outer field (e.g., PAGEREF inside
+                // a HYPERLINK's fldrslt). Suppress the nested instruction text
+                // and leave the outer fieldState/pendingURL untouched: the
+                // outer field's closing group still needs to see fieldState==3
+                // to emit </a>. The accompanying \fldrslt of the nested field
+                // emits its display text into the outer hyperlink's <a>.
+                groupState.ignore = true;
+            }
         } else if (equals("fldrslt") && fieldState == 2) {
             assert pendingURL != null;
             lazyStartParagraph();
@@ -1375,6 +1391,10 @@ final class TextExtractor {
             out.startElement("", "a", "a", attrs);
             pendingURL = null;
             fieldState = 3;
+            // Remember which group depth owns this <a>. processGroupEnd only
+            // emits </a> once we leave this depth, so nested groups inside
+            // the fldrslt (e.g., a PAGEREF \field) don't trip the close early.
+            hyperlinkAnchorDepth = groupState.depth;
             groupState.ignore = false;
         }
     }
@@ -1547,8 +1567,17 @@ final class TextExtractor {
             // inlined, but fail to record them in metadata
             // as a field value.
         } else if (fieldState == 3) {
-            end("a");
-            fieldState = 0;
+            // Only close </a> once we've left the fldrslt group that opened
+            // it. groupState.depth here is the OUTER group we just restored
+            // to; if it's now below the recorded anchor depth, the fldrslt
+            // group has closed. This guards against nested-field group ends
+            // (e.g., a PAGEREF \fldinst inside the HYPERLINK fldrslt) closing
+            // the outer </a> too early.
+            if (hyperlinkAnchorDepth >= 0 && groupState.depth < hyperlinkAnchorDepth) {
+                end("a");
+                fieldState = 0;
+                hyperlinkAnchorDepth = -1;
+            }
         }
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 33243c963a..2c0a91bf9c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -60,6 +60,23 @@ public class RTFParserTest extends TikaTest {
         assertContains("indexation Word", content);
     }
 
+    @Test //TIKA-4744
+    public void testNestedHyperlinkPageRef() throws Exception {
+        // A HYPERLINK field with a PAGEREF \field nested inside its \fldrslt
+        // used to leak: the nested \fldinst would overwrite fieldState=3 with
+        // 1, so the outer fldrslt's group close skipped the </a> emission and
+        // <a> stayed open. The cascade surfaced at endDocument as the strict
+        // validator complaining </body> didn't match topmost <p>.
+        // getXML wraps the handler in StrictXHTMLValidator, so any imbalance
+        // would throw before the assertions.
+        XMLResult r = getXML("testRTF_nestedHyperlinkPageRef.rtf");
+        // PAGEREF result "42" should render INSIDE the outer hyperlink's <a>,
+        // not after a prematurely-closed </a>:
+        assertContains("<a href=\"#target\">42</a>", r.xml);
+        assertContains("Before", r.xml);
+        assertContains("after.", r.xml);
+    }
+
     @Test
     public void testUmlautSpacesExtraction2() throws Exception {
         String content = getText("testRTFUmlautSpaces2.rtf");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf
new file mode 100644
index 0000000000..250b56c558
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf
@@ -0,0 +1,9 @@
+{\rtf1\ansi
+{\fonttbl{\f0 Arial;}}
+\pard {\f0 Before }
+{\field {\*\fldinst HYPERLINK "#target" }{\fldrslt 
+{\field {\*\fldinst { PAGEREF "target" } }{\fldrslt 42}}
+}}
+{\f0  after.}
+\par
+}

Reply via email to