This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4657 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 11943309ffdd2ee95d38b8c3231f6e6883ae85fc Author: tallison <[email protected]> AuthorDate: Thu Feb 12 18:07:07 2026 -0500 TIKA-4657 -- improve extraction from footnote/endnotes in xwpf --- .../ooxml/XWPFWordExtractorDecorator.java | 51 +++++++++++++++++++--- .../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +++++ .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 5 +++ 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index 2488804dbb..f40ee517e2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -41,6 +41,8 @@ import org.apache.poi.xwpf.usermodel.ICell; import org.apache.poi.xwpf.usermodel.IRunElement; import org.apache.poi.xwpf.usermodel.ISDTContent; import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFEndnote; +import org.apache.poi.xwpf.usermodel.XWPFFootnote; import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter; import org.apache.poi.xwpf.usermodel.XWPFHyperlink; import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; @@ -347,11 +349,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { xhtml.characters(commentText); } - String footnameText = paragraph.getFootnoteText(); - if (footnameText != null && footnameText.length() > 0) { - xhtml.characters(footnameText + "\n"); - } - // Also extract any paragraphs embedded in text boxes //Note "w:txbxContent//"...must look for all descendant paragraphs //not just the immediate children of txbxContent -- TIKA-2807 @@ -366,11 +363,55 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // Finish this paragraph xhtml.endElement(tag); + // Extract footnote/endnote content after the paragraph close tag + // to avoid invalid nested block elements (TIKA-4657) + extractFootnoteEndnoteContent(paragraph, listManager, xhtml); + if (headerFooterPolicy != null && config.isIncludeHeadersAndFooters()) { extractFooters(xhtml, headerFooterPolicy, listManager); } } + private void extractFootnoteEndnoteContent(XWPFParagraph paragraph, + XWPFListManager listManager, + XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + String nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + QName footnoteRefQName = new QName(nsW, "footnoteReference"); + QName endnoteRefQName = new QName(nsW, "endnoteReference"); + QName idQName = new QName(nsW, "id"); + for (XmlObject obj : paragraph.getCTP().selectPath( + "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'" + + " .//w:footnoteReference | .//w:endnoteReference")) { + XmlObject idAttr = obj.selectAttribute(idQName); + if (idAttr == null) { + continue; + } + int id; + try { + id = Integer.parseInt(idAttr.getDomNode().getNodeValue()); + } catch (NumberFormatException e) { + continue; + } + boolean isFootnote = obj.getDomNode().getLocalName().equals("footnoteReference"); + if (isFootnote) { + XWPFFootnote footnote = document.getFootnoteByID(id); + if (footnote != null) { + xhtml.startElement("div", "class", "footnote"); + extractIBodyText(footnote, listManager, xhtml); + xhtml.endElement("div"); + } + } else { + XWPFEndnote endnote = document.getEndnoteByID(id); + if (endnote != null) { + xhtml.startElement("div", "class", "endnote"); + extractIBodyText(endnote, listManager, xhtml); + xhtml.endElement("div"); + } + } + } + } + private void processEmbeddedObjects(List<XWPFRun> runs, XHTMLContentHandler xhtml) throws SAXException { // TODO: replace w/ XPath/XQuery: diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 45e2caabcd..ecbe1956ec 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -358,6 +358,18 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertTrue(xmlResult.xml.contains("snoska")); + //TIKA-4657 -- footnote content should be in a div with class "footnote" + // and should not be nested inside the paragraph + assertContains("<div class=\"footnote\">", xmlResult.xml); + assertNotContained("<p><div class=\"footnote\">", xmlResult.xml); + } + + @Test + public void testEndnoteWithTable() throws Exception { + XMLResult xmlResult = getXML("testWORD_endnote_table.docx"); + assertContains("Cat Property Act", xmlResult.xml); + //TIKA-4657 -- endnote content should be in a div with class "endnote" + assertContains("<div class=\"endnote\">", xmlResult.xml); } /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java index 2ae7d2c7f7..1ec8be3242 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java @@ -210,7 +210,12 @@ public class SXWPFExtractorTest extends TikaTest { assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertTrue(xmlResult.xml.contains("snoska")); + } + @Test + public void testEndnoteWithTable() throws Exception { + XMLResult xmlResult = getXML("testWORD_endnote_table.docx", parseContext); + assertContains("Cat Property Act", xmlResult.xml); } /**
