This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 4799b2939 TIKA-4471 -- add unit tests to confirm defense against xxe
in sax, dom and stax. (#2318)
4799b2939 is described below
commit 4799b2939de9ab724f8cb024314b1dfd449f994c
Author: Tim Allison <[email protected]>
AuthorDate: Mon Sep 8 10:32:42 2025 -0400
TIKA-4471 -- add unit tests to confirm defense against xxe in sax, dom and
stax. (#2318)
(cherry picked from commit 60abc9556f74c43c0fe98e18e34bfb76567b2885)
---
.../org/apache/tika/utils/XMLReaderUtilsTest.java | 189 ++++++++++++++++++---
1 file changed, 163 insertions(+), 26 deletions(-)
diff --git
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 1d5371019..3642fe1f4 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -16,54 +16,191 @@
*/
package org.apache.tika.utils;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.ByteArrayInputStream;
import java.net.ConnectException;
import java.nio.charset.StandardCharsets;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
public class XMLReaderUtilsTest {
+
+ private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM
\"tutorials.dtd\"><foo/>";
+ private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM
\"http://127.234.172.38:7845/bar\"><foo/>";
+ private static final String EXTERNAL_ENTITY = "<!DOCTYPE foo [" + "
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+ " ]><foo>&bar;</foo>";
+ private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+ "<!ENTITY % local_dtd SYSTEM
\"file:///usr/local/app/schema.dtd\">" +
+ "%local_dtd;]><foo/>";
+
+ private static final String BILLION_LAUGHS_CLASSICAL = "<?xml
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + "
<!ELEMENT lolz (#PCDATA)>\n" +
+ " <!ENTITY lol1
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+ " <!ENTITY lol3
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+ " <!ENTITY lol4
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+ " <!ENTITY lol5
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+ " <!ENTITY lol6
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+ " <!ENTITY lol7
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+ " <!ENTITY lol8
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+ " <!ENTITY lol9
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" +
"<lolz>&lol9;</lolz>";
+
+ private static String BILLION_LAUGHS_VARIANT;
+
+ static {
+ StringBuilder entity = new StringBuilder();
+ for (int i = 0; i < 1000000; i++) {
+ entity.append("a");
+ }
+ StringBuilder xml = new StringBuilder();
+ xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "
<!ENTITY a \"");
+ xml.append(entity.toString());
+ xml.append("\">]>" + "<kaboom>");
+ for (int i = 0; i < 100000; i++) {
+ xml.append("&a;");
+ }
+ xml.append("</kaboom>");
+ BILLION_LAUGHS_VARIANT = xml.toString();
+ }
+
+ private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+ EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+ private static final String[] BILLION_LAUGHS = new String[]{
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
//make sure that parseSAX actually defends against external entities
@Test
- public void testExternalDTD() throws Exception {
- String xml = "<!DOCTYPE foo SYSTEM
\"http://127.234.172.38:7845/bar\"><foo/>";
- try {
- XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
- } catch (ConnectException e) {
- fail("Parser tried to access the external DTD:" + e);
+ public void testSAX() throws Exception {
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(), new ParseContext());
+ } catch (ConnectException e) {
+ fail("Parser tried to access resource: " + xml, e);
+ }
+ }
+ }
+
+ @Test
+ public void testDOM() throws Exception {
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ XMLReaderUtils.buildDOM(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+ } catch (ConnectException e) {
+ fail("Parser tried to access resource: " + xml, e);
+ }
}
}
@Test
- public void testExternalEntity() throws Exception {
- String xml =
- "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM
\"http://127.234.172.38:7845/bar\">" +
- " ]><foo>&bar;</foo>";
- try {
- XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
- } catch (ConnectException e) {
- fail("Parser tried to access the external DTD:" + e);
+ public void testStax() throws Exception {
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLEventReader reader =
xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+ StringBuilder sb = new StringBuilder();
+ while (reader.hasNext()) {
+ sb.append(reader.next());
+ }
+ if (sb.toString().contains("Exception scanning External")) {
+ fail("tried to read external dtd");
+ }
+ } catch (XMLStreamException e) {
+ fail("StreamException: " + xml, e);
+ } catch (NoSuchElementException e) {
+ if (e.getMessage() != null) {
+ if (e.getMessage().contains("Connection refused")) {
+ fail("Vulnerable to ssrf via url: " + xml, e);
+ } else if (e.getMessage().contains("No such file")) {
+ fail("Vulnerable to local file read via external
entity/dtd: " + xml, e);
+ }
+ }
+ }
}
}
@Test
- public void testExternalEntityLocal() throws Exception {
- String xml =
- "<!DOCTYPE foo [" +
- "<!ENTITY % local_dtd SYSTEM
\"file:///usr/local/app/schema.dtd\">" +
- "%local_dtd;]><foo/>";
- try {
- XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
- } catch (ConnectException e) {
- fail("Parser tried to access the external DTD:" + e);
+ public void testSAXBillionLaughs() throws Exception {
+ for (String xml : BILLION_LAUGHS) {
+ try {
+ XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(), new ParseContext());
+ } catch (SAXException e) {
+ if (e.getMessage() != null && e
+ .getMessage()
+ .contains("entity expansions")) {
+ //do nothing
+ } else {
+ throw e;
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testDOMBillionLaughs() throws Exception {
+ //confirm that ExpandEntityReferences has been set to false.
+ for (String xml : BILLION_LAUGHS) {
+ Document doc = XMLReaderUtils.buildDOM(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+ NodeList nodeList = doc.getChildNodes();
+ StringBuilder sb = new StringBuilder();
+ dumpChildren(nodeList, sb);
+ assertEquals(0, sb
+ .toString()
+ .trim()
+ .length(), sb.toString());
+ }
+ }
+
+ private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Node n = nodeList.item(i);
+ String txt = n.getTextContent();
+ if (txt != null) {
+ sb.append(txt);
+ }
+ }
+ }
+
+ @Test
+ public void testStaxBillionLaughs() throws Exception {
+ /*
+ XMLReaderUtils turns off DTD support on the XMLInputFactory with this line:
+ tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
+ That turns off entity expansions and causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message.
+ If that line doesn't exist, then we get a
+ NoSuchElementException with: "The parser has encountered more than
"20" entity expansions in this document; this is the limit imposed by the JDK."
+ */
+
+ for (String xml : BILLION_LAUGHS) {
+ javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLEventReader reader = xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+ try {
+ while (reader.hasNext()) {
+ reader.next();
+ }
+ } catch (NoSuchElementException e) {
+ //full message on temurin-17: The entity "lol9" was
referenced, but not declared.
+ if (e.getMessage() != null && e
+ .getMessage()
+ .contains("referenced") && e
+ .getMessage()
+ .contains("not declared")) {
+ //swallow -- this is expected
+ } else {
+ throw e;
+ }
+ }
}
}
}