This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 371b997628 TIKA-4657 -- improve extraction from footnote/endnotes in xwpf (#2605)
371b997628 is described below

commit 371b997628978c6f3e98a919c7219e800791da4d
Author: Tim Allison <[email protected]>
AuthorDate: Thu Feb 12 20:58:57 2026 -0500

    TIKA-4657 -- improve extraction from footnote/endnotes in xwpf (#2605)
    
    (cherry picked from commit 78285be19fc72beb7bc85bf3d85c6b11a8c54a41)
---
 .../ooxml/XWPFWordExtractorDecorator.java          |  51 +++++++++++++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  12 +++++
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java |   5 ++
 .../test-documents/testWORD_endnote_table.docx     | Bin 0 -> 18504 bytes
 4 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index a44f4525d7..1140af7e3c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -42,6 +42,8 @@ import org.apache.poi.xwpf.usermodel.ICell;
 import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.ISDTContent;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFEndnote;
+import org.apache.poi.xwpf.usermodel.XWPFFootnote;
 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
@@ -340,11 +342,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             xhtml.characters(commentText);
         }
 
-        String footnameText = paragraph.getFootnoteText();
-        if (footnameText != null && footnameText.length() > 0) {
-            xhtml.characters(footnameText + "\n");
-        }
-
         // Also extract any paragraphs embedded in text boxes
         //Note "w:txbxContent//"...must look for all descendant paragraphs
         //not just the immediate children of txbxContent -- TIKA-2807
@@ -359,11 +356,55 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         // Finish this paragraph
         xhtml.endElement(tag);
 
+        // Extract footnote/endnote content after the paragraph close tag
+        // to avoid invalid nested block elements (TIKA-4657)
+        extractFootnoteEndnoteContent(paragraph, listManager, xhtml);
+
         if (headerFooterPolicy != null && config.isIncludeHeadersAndFooters()) {
             extractFooters(xhtml, headerFooterPolicy, listManager);
         }
     }
 
+    private void extractFootnoteEndnoteContent(XWPFParagraph paragraph,
+                                                  XWPFListManager listManager,
+                                                  XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        String nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+        QName footnoteRefQName = new QName(nsW, "footnoteReference");
+        QName endnoteRefQName = new QName(nsW, "endnoteReference");
+        QName idQName = new QName(nsW, "id");
+        for (XmlObject obj : paragraph.getCTP().selectPath(
+                "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'"
+                        + " .//w:footnoteReference | .//w:endnoteReference")) {
+            XmlObject idAttr = obj.selectAttribute(idQName);
+            if (idAttr == null) {
+                continue;
+            }
+            int id;
+            try {
+                id = Integer.parseInt(idAttr.getDomNode().getNodeValue());
+            } catch (NumberFormatException e) {
+                continue;
+            }
+            boolean isFootnote = obj.getDomNode().getLocalName().equals("footnoteReference");
+            if (isFootnote) {
+                XWPFFootnote footnote = document.getFootnoteByID(id);
+                if (footnote != null) {
+                    xhtml.startElement("div", "class", "footnote");
+                    extractIBodyText(footnote, listManager, xhtml);
+                    xhtml.endElement("div");
+                }
+            } else {
+                XWPFEndnote endnote = document.getEndnoteByID(id);
+                if (endnote != null) {
+                    xhtml.startElement("div", "class", "endnote");
+                    extractIBodyText(endnote, listManager, xhtml);
+                    xhtml.endElement("div");
+                }
+            }
+        }
+    }
+
     private void processEmbeddedObjects(List<XWPFRun> runs, XHTMLContentHandler xhtml)
             throws SAXException {
         // TODO: replace w/ XPath/XQuery:
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index d75c7a20a8..8d8be04e57 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -360,6 +360,18 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
         assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                 xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertTrue(xmlResult.xml.contains("snoska"));
+        //TIKA-4657 -- footnote content should be in a div with class "footnote"
+        // and should not be nested inside the paragraph
+        assertContains("<div class=\"footnote\">", xmlResult.xml);
+        assertNotContained("<p><div class=\"footnote\">", xmlResult.xml);
+    }
+
+    @Test
+    public void testEndnoteWithTable() throws Exception {
+        XMLResult xmlResult = getXML("testWORD_endnote_table.docx");
+        assertContains("Cat Property Act", xmlResult.xml);
+        //TIKA-4657 -- endnote content should be in a div with class "endnote"
+        assertContains("<div class=\"endnote\">", xmlResult.xml);
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index c0482bd304..3168d9d4cf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -211,7 +211,12 @@ public class SXWPFExtractorTest extends TikaTest {
         assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                 xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertTrue(xmlResult.xml.contains("snoska"));
+    }
 
+    @Test
+    public void testEndnoteWithTable() throws Exception {
+        XMLResult xmlResult = getXML("testWORD_endnote_table.docx", parseContext);
+        assertContains("Cat Property Act", xmlResult.xml);
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx
new file mode 100644
index 0000000000..ccd894b9f5
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx differ

Reply via email to