This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 371b997628 TIKA-4657 -- improve extraction from footnote/endnotes in xwpf (#2605)
371b997628 is described below
commit 371b997628978c6f3e98a919c7219e800791da4d
Author: Tim Allison <[email protected]>
AuthorDate: Thu Feb 12 20:58:57 2026 -0500
TIKA-4657 -- improve extraction from footnote/endnotes in xwpf (#2605)
(cherry picked from commit 78285be19fc72beb7bc85bf3d85c6b11a8c54a41)
---
.../ooxml/XWPFWordExtractorDecorator.java | 51 +++++++++++++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 12 +++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 5 ++
.../test-documents/testWORD_endnote_table.docx | Bin 0 -> 18504 bytes
4 files changed, 63 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index a44f4525d7..1140af7e3c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -42,6 +42,8 @@ import org.apache.poi.xwpf.usermodel.ICell;
import org.apache.poi.xwpf.usermodel.IRunElement;
import org.apache.poi.xwpf.usermodel.ISDTContent;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFEndnote;
+import org.apache.poi.xwpf.usermodel.XWPFFootnote;
import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
@@ -340,11 +342,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.characters(commentText);
}
- String footnameText = paragraph.getFootnoteText();
- if (footnameText != null && footnameText.length() > 0) {
- xhtml.characters(footnameText + "\n");
- }
-
// Also extract any paragraphs embedded in text boxes
//Note "w:txbxContent//"...must look for all descendant paragraphs
//not just the immediate children of txbxContent -- TIKA-2807
@@ -359,11 +356,55 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
// Finish this paragraph
xhtml.endElement(tag);
+ // Extract footnote/endnote content after the paragraph close tag
+ // to avoid invalid nested block elements (TIKA-4657)
+ extractFootnoteEndnoteContent(paragraph, listManager, xhtml);
+
if (headerFooterPolicy != null && config.isIncludeHeadersAndFooters())
{
extractFooters(xhtml, headerFooterPolicy, listManager);
}
}
+ private void extractFootnoteEndnoteContent(XWPFParagraph paragraph,
+ XWPFListManager listManager,
+ XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException {
+ String nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+ QName footnoteRefQName = new QName(nsW, "footnoteReference");
+ QName endnoteRefQName = new QName(nsW, "endnoteReference");
+ QName idQName = new QName(nsW, "id");
+ for (XmlObject obj : paragraph.getCTP().selectPath(
+ "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'"
+ + " .//w:footnoteReference | .//w:endnoteReference")) {
+ XmlObject idAttr = obj.selectAttribute(idQName);
+ if (idAttr == null) {
+ continue;
+ }
+ int id;
+ try {
+ id = Integer.parseInt(idAttr.getDomNode().getNodeValue());
+ } catch (NumberFormatException e) {
+ continue;
+ }
+ boolean isFootnote = obj.getDomNode().getLocalName().equals("footnoteReference");
+ if (isFootnote) {
+ XWPFFootnote footnote = document.getFootnoteByID(id);
+ if (footnote != null) {
+ xhtml.startElement("div", "class", "footnote");
+ extractIBodyText(footnote, listManager, xhtml);
+ xhtml.endElement("div");
+ }
+ } else {
+ XWPFEndnote endnote = document.getEndnoteByID(id);
+ if (endnote != null) {
+ xhtml.startElement("div", "class", "endnote");
+ extractIBodyText(endnote, listManager, xhtml);
+ xhtml.endElement("div");
+ }
+ }
+ }
+ }
+
private void processEmbeddedObjects(List<XWPFRun> runs,
XHTMLContentHandler xhtml)
throws SAXException {
// TODO: replace w/ XPath/XQuery:
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index d75c7a20a8..8d8be04e57 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -360,6 +360,18 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertTrue(xmlResult.xml.contains("snoska"));
+ //TIKA-4657 -- footnote content should be in a div with class "footnote"
+ // and should not be nested inside the paragraph
+ assertContains("<div class=\"footnote\">", xmlResult.xml);
+ assertNotContained("<p><div class=\"footnote\">", xmlResult.xml);
+ }
+
+ @Test
+ public void testEndnoteWithTable() throws Exception {
+ XMLResult xmlResult = getXML("testWORD_endnote_table.docx");
+ assertContains("Cat Property Act", xmlResult.xml);
+ //TIKA-4657 -- endnote content should be in a div with class "endnote"
+ assertContains("<div class=\"endnote\">", xmlResult.xml);
}
/**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index c0482bd304..3168d9d4cf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -211,7 +211,12 @@ public class SXWPFExtractorTest extends TikaTest {
assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertTrue(xmlResult.xml.contains("snoska"));
+ }
+ @Test
+ public void testEndnoteWithTable() throws Exception {
+ XMLResult xmlResult = getXML("testWORD_endnote_table.docx", parseContext);
+ assertContains("Cat Property Act", xmlResult.xml);
}
/**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx
new file mode 100644
index 0000000000..ccd894b9f5
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_endnote_table.docx differ