This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4728-js-in-pdf in repository https://gitbox.apache.org/repos/asf/tika.git
commit daef21e3a33389fd831f3a2803391290c02c8d2d Author: tallison <[email protected]> AuthorDate: Wed May 13 14:59:00 2026 -0400 TIKA-4728 - fix xhtml in widgets --- .../src/test/java/org/apache/tika/TikaTest.java | 18 +++++++++++++++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 17 ++++++++++++-- .../org/apache/tika/parser/pdf/PDFParserTest.java | 13 +++++++++++ .../test-documents/testPDF_jsActionOnPage.pdf | 26 ++++++++++++++++++++++ 4 files changed, 72 insertions(+), 2 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a8e37a85b2..f746d9ab86 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -26,6 +26,7 @@ import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.File; import java.io.IOException; +import java.io.StringReader; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; @@ -41,7 +42,9 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedResourceHandler; @@ -59,6 +62,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.ToXMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; /** * Parent class of Tika tests @@ -89,6 +93,20 @@ public abstract class TikaTest { assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack); } + /** + * Re-parses the given XHTML string with a SAX parser and fails the test if it is + * not well-formed. 
Use this on the output of {@link #getXML} to catch parsers that + * emit malformed XHTML (e.g., duplicate attributes, unclosed tags, bad escaping). + */ + public static void assertValidXHTML(String xml) { + try { + XMLReaderUtils.getSAXParser().parse( + new InputSource(new StringReader(xml)), new DefaultHandler()); + } catch (Exception e) { + fail("XHTML is not well-formed: " + e.getMessage() + "\nXHTML:\n" + xml, e); + } + } + public static <T> void assertNotContained(T needle, Collection<? extends T> haystack) { assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index ec68f8c2bd..0918bc9e95 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -223,6 +223,19 @@ class AbstractPDF2XHTML extends PDFTextStripper { attributes.addAttribute("", name, name, "CDATA", value); } + private static void setOrReplaceAttribute(String name, String value, + AttributesImpl attributes) { + if (name == null || value == null) { + return; + } + int idx = attributes.getIndex("", name); + if (idx >= 0) { + attributes.setValue(idx, value); + } else { + attributes.addAttribute("", name, name, "CDATA", value); + } + } + private static PDActionURI getActionURI(PDAnnotation annot) { //copied and pasted from PDFBox's PrintURLs @@ -1093,8 +1106,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, context, true); } }; - 
addNonNullAttribute("class", "javascript", attrs); - addNonNullAttribute("type", jsAction.getType(), attrs); + setOrReplaceAttribute("class", "javascript", attrs); + setOrReplaceAttribute("type", jsAction.getType(), attrs); addNonNullAttribute("subtype", jsAction.getSubType(), attrs); xhtml.startElement("div", attrs); xhtml.endElement("div"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 6ce771c6f9..e2d39fc6cc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1537,6 +1537,19 @@ public class PDFParserTest extends TikaTest { config.setMaxPages(1); } + // TIKA-4728: handleDestinationOrAction pre-populated class/type on the action div, + // then processJavaScriptAction appended a second class/type for PDActionJavaScript + // actions, producing a div with duplicate attributes that SAX parsers reject. 
+ @Test + public void testExtractActionsXHTMLWellFormed() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + config.setExtractActions(true); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + XMLResult r = getXML("testPDF_jsActionOnPage.pdf", context); + assertValidXHTML(r.xml); + } + /** @Test public void testWriteLimit() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf new file mode 100644 index 0000000000..5b283307f8 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf @@ -0,0 +1,26 @@ +%PDF-1.4 +%���� +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /AA << /O 4 0 R >> >> +endobj +4 0 obj +<< /Type /Action /S /JavaScript /JS (app.alert\('TIKA test'\);) >> +endobj +xref +0 5 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000121 00000 n +0000000211 00000 n +trailer +<< /Size 5 /Root 1 0 R >> +startxref +293 +%%EOF
