This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4744 in repository https://gitbox.apache.org/repos/asf/tika.git
commit edafcbb16989f0c69e2ddd0698b1ceaf33b13a8e Author: tallison <[email protected]> AuthorDate: Thu May 28 09:31:40 2026 -0400 TIKA-4744 - fix odt tags --- .../tika/parser/odf/OpenDocumentBodyHandler.java | 48 ++++++++++++++------- .../org/apache/tika/parser/odf/ODFParserTest.java | 14 ++++++ .../testODT_svgTitleInStyledSpan.odt | Bin 0 -> 6820 bytes 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java index 6bbc7eaa86..60c1457d68 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java @@ -337,19 +337,31 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { } /** - * Returns true for ODF elements that map to block-level XHTML and so - * shouldn't sit inside open inline-style tags. When such an element opens - * while {@code <b>/<i>/<u>} are on the SAX stack, the inline tags would - * trap the new block element underneath them; subsequent style flips - * inside the block would emit close events that don't match the topmost - * open element. The startElement handler closes pending style tags - * before opening any of these. + * Returns true for ODF elements that shouldn't sit inside open inline-style + * tags. When such an element opens while {@code <b>/<i>/<u>} are on the SAX + * stack, the inline tags would trap the new element underneath them; + * subsequent style flips inside would emit close events that don't match + * the topmost open element. The startElement handler closes pending style + * tags before opening any of these. * <p> + * Two cases qualify: + * <ul> + * <li>Block-level XHTML targets (draw:text-box, table/row/cell, list-item) + * — opening a block under inline styles produces malformed XHTML even + * if the SAX stream happened to balance.</li> + * <li>svg:title / svg:desc — empty or near-empty inline elements that map + * to {@code <span>} via MAPPINGS. When their parent {@code <text:span>} + * had a bold/italic/underline style, the outer {@code <b>/<i>/<u>} + * is still on top of the SAX stack when the svg's {@code <span>} + * opens; the existing endElement closeStyleTags then tries to close + * {@code </b>} while the svg span is topmost, which the strict + * validator (correctly) rejects.</li> + * </ul> * text:p / text:h / text:list / annotation / note / notes / a are handled * by their own branches in startElement and never reach the default * branch where this check is used. */ - private static boolean isBlockLevelOpen(String uri, String localName) { + private static boolean closeStylesBeforeOpen(String uri, String localName) { if (DRAW_NS.equals(uri) && "text-box".equals(localName)) { return true; } @@ -358,6 +370,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { || "table-cell".equals(localName))) { return true; } + if (SVG_NS.equals(uri) && ("title".equals(localName) || "desc".equals(localName))) { + return true; + } return TEXT_NS.equals(uri) && "list-item".equals(localName); } @@ -486,14 +501,15 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { // inside it. See updateStyleTags / closeStyleTags. anchorDepth++; super.startElement(namespaceURI, localName, qName, attrs); - } else if (isBlockLevelOpen(namespaceURI, localName)) { - // Block-level structural elements (draw:text-box -> <div>, - // table:table -> <table>, etc.) opened while <b>/<i>/<u> are - // on top would trap those inline tags. Subsequent style flips - // inside would emit </b> while the block is on top, producing - // cross-nested XHTML. Close pending styles before opening the - // block; if there's still text to emit at the same style after - // the block closes, updateStyleTags() will reopen them. + } else if (closeStylesBeforeOpen(namespaceURI, localName)) { + // Elements that mustn't open under <b>/<i>/<u>: block-level + // structural elements (draw:text-box -> <div>, table:table -> + // <table>, etc.) and svg:title / svg:desc inline shells that + // map to <span>. Closing pending style tags first ensures the + // new element opens at body/paragraph/span level, not nested + // under stale inline styling. If there's still text to emit at + // the same style after the element closes, updateStyleTags() + // will reopen them. closeStyleTags(); super.startElement(namespaceURI, localName, qName, attrs); } else { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java index cded9ea148..bedcc038c2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java @@ -365,6 +365,20 @@ public class ODFParserTest extends TikaTest { xml); } + @Test //TIKA-4744 + public void testSvgTitleInStyledSpan() throws Exception { + // Empty <svg:title/>/<svg:desc/> inside a <draw:connector> or + // <draw:custom-shape> that is itself wrapped in a styled <text:span> + // used to leave the SAX stack with the svg's <span> sitting above the + // outer <b>. The endElement closeStyleTags (TIKA-4728) then emitted + // </b> while <span> was topmost, which StrictXHTMLValidator rejects. + // getXML wraps the handler in StrictXHTMLValidator, so a desync would + // throw before any assertions ran. + String xml = getXML("testODT_svgTitleInStyledSpan.odt").xml; + assertContains("國立雲林科技大學國際", xml); + assertContains("學生簽章", xml); + } + @Test public void testEmbedded() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt new file mode 100755 index 0000000000..d2f09b5d69 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt differ
