This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 4799b2939 TIKA-4471 -- add unit tests to confirm defense against xxe in sax, dom and stax. (#2318)
4799b2939 is described below

commit 4799b2939de9ab724f8cb024314b1dfd449f994c
Author: Tim Allison <[email protected]>
AuthorDate: Mon Sep 8 10:32:42 2025 -0400

    TIKA-4471 -- add unit tests to confirm defense against xxe in sax, dom and stax. (#2318)
    
    (cherry picked from commit 60abc9556f74c43c0fe98e18e34bfb76567b2885)
---
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 189 ++++++++++++++++++---
 1 file changed, 163 insertions(+), 26 deletions(-)

diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 1d5371019..3642fe1f4 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -16,54 +16,191 @@
  */
 package org.apache.tika.utils;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.ByteArrayInputStream;
 import java.net.ConnectException;
 import java.nio.charset.StandardCharsets;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLStreamException;
 
 import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ToTextContentHandler;
 
 public class XMLReaderUtilsTest {
+
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " 
<!ELEMENT lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + 
"<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
     //make sure that parseSAX actually defends against external entities
     @Test
-    public void testExternalDTD() throws Exception {
-        String xml = "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
         }
     }
 
     @Test
-    public void testExternalEntity() throws Exception {
-        String xml =
-                "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
-                        " ]><foo>&bar;</foo>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+                XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            }
         }
     }
 
     @Test
-    public void testExternalEntityLocal() throws Exception {
-        String xml =
-                "<!DOCTYPE foo [" +
-                "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
-                "%local_dtd;]><foo/>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                if (e.getMessage() != null && e
+                        .getMessage()
+                        .contains("entity expansions")) {
+                    //do nothing
+                } else {
+                    throw e;
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                //full message on temurin-17: The entity "lol9" was referenced, but not declared.
+                if (e.getMessage() != null && e
+                        .getMessage()
+                        .contains("referenced") && e
+                        .getMessage()
+                        .contains("not declared")) {
+                    //swallow -- this is expected
+                } else {
+                    throw e;
+                }
+            }
         }
     }
 }

Reply via email to