This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 307faa22d892266615aa40bdfebeaebcc1aa465e
Author: Tim Allison <[email protected]>
AuthorDate: Wed Sep 10 10:42:47 2025 -0400

    TIKA-4482 -- don't fail if woodstox is on the classpath (#2320)
    
    (cherry picked from commit 16de8cc0efcdb21785a448d1c2dbd8a1c925dfc2)
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  22 +-
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 237 ++++++++++++++++--
 tika-integration-tests/pom.xml                     |   1 +
 .../{ => tika-woodstox-tests}/pom.xml              |  39 +--
 .../tika/woodstox/WoodstoxXMLReaderUtilsTest.java  | 265 +++++++++++++++++++++
 5 files changed, 506 insertions(+), 58 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 6a6a9dfc3..fe57f04ee 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -37,12 +37,14 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLResolver;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.TransformerFactoryConfigurationError;
 import javax.xml.transform.sax.SAXTransformerFactory;
 
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
@@ -123,6 +125,11 @@ public class XMLReaderUtils implements Serializable {
     private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER =
             (publicId, systemId) -> new InputSource(new StringReader(""));
+
+    //BE CAREFUL with the return type. Some parsers will silently ignore an 
unexpected return type: CVE-2025-54988
+    private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
+            (publicID, systemID, baseURI, namespace) ->
+                    UnsynchronizedByteArrayInputStream.nullInputStream();
     /**
      * Parser pool size
      */
@@ -302,12 +309,17 @@ public class XMLReaderUtils implements Serializable {
         if (LOG.isDebugEnabled()) {
             LOG.debug("XMLInputFactory class {}", factory.getClass());
         }
-        factory.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, "");
+
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, 
true);
+
+        //try to configure secure processing
+        tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
         tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
         tryToSetStaxProperty(factory, 
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
 
+        //defense in depth
+        factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
         trySetStaxSecurityManager(factory);
         return factory;
     }
@@ -361,6 +373,14 @@ public class XMLReaderUtils implements Serializable {
         }
     }
 
+    private static void tryToSetStaxProperty(XMLInputFactory factory, String 
key, String value) {
+        try {
+            factory.setProperty(key, value);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("StAX Feature unsupported: {}", key, e);
+        }
+    }
+
     /**
      * Returns a new transformer
      * <p>
diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 1d5371019..310a8b158 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -16,54 +16,239 @@
  */
 package org.apache.tika.utils;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.ByteArrayInputStream;
 import java.net.ConnectException;
 import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
 
+import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ToTextContentHandler;
 
+/**
+ * Class to test that XMLReaderUtils defends against xxe and billion laughs.
+ * <p>
+ * Different versions and different implementations vary. This is not a fully 
comprehensive set of tests.
+ * <p>
+ * Please add more.
+ * <p>
+ * See also the tests with woodstox in tika-woodstox-tests.
+ */
 public class XMLReaderUtilsTest {
+
+    private static final Locale defaultLocale = Locale.getDefault();
+    static {
+        //tests on content of Exception msgs require specifying locale.
+        //even this, though is not sufficient for the billion laughs tests ?!
+        Locale.setDefault(Locale.US);
+    }
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " 
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " 
<!ELEMENT lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + 
"<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
+    @AfterAll
+    public static void tearDown() {
+        Locale.setDefault(defaultLocale);
+    }
+
     //make sure that parseSAX actually defends against external entities
     @Test
-    public void testExternalDTD() throws Exception {
-        String xml = "<!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\"><foo/>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+                XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            }
         }
     }
 
     @Test
-    public void testExternalEntity() throws Exception {
-        String xml =
-                "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM 
\"http://127.234.172.38:7845/bar\">" +
-                        " ]><foo>&bar;</foo>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+            }
         }
     }
 
     @Test
-    public void testExternalEntityLocal() throws Exception {
-        String xml =
-                "<!DOCTYPE foo [" +
-                "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
-                "%local_dtd;]><foo/>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+
+        //some implementations ignore the expandEntityReferences=false, and we 
are still
+        //protected by the "The parser has encountered more than "20" entity 
expansions" SAXException.
+        //We need to check for either: empty content and no exception, or this 
SAXException
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = null;
+            try {
+                doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+                continue;
+            }
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with: "The parser has encountered more than 
"20" entity expansions in this document; this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
+                String msg = e.getLocalizedMessage();
+
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) {
+                        continue;
+                    } else if (msg.contains("JAXP00010001")) {
+                        continue;
+                    }
+                }
+                throw e;
+
+            }
+        }
+    }
+
+    private void limitCheck(SAXException e) throws SAXException {
+        String msg = e.getLocalizedMessage();
+        if (msg == null) {
+            throw e;
+        }
+
+        //depending on the flavor/version of the jdk, entity expansions may be 
triggered
+        // OR entitySizeLimit may be triggered
+        //See TIKA-4471
+        if (msg.contains("JAXP00010001") || //entity expansions
+                msg.contains("JAXP00010003") || //max entity size limit
+                msg.contains("JAXP00010004") || //TotalEntitySizeLimit
+                msg.contains("entity expansions") ||
+                e.getMessage().contains("maxGeneralEntitySizeLimit")) {
+            return;
         }
+        throw e;
     }
 }
diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml
index 897e8fb02..da7a2c384 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/pom.xml
@@ -37,6 +37,7 @@
     <module>tika-pipes-s3-integration-tests</module>
     <module>tika-resource-loading-tests</module>
     <module>tika-pipes-kafka-integration-tests</module>
+    <module>tika-woodstox-tests</module>
   </modules>
 
   <dependencies>
diff --git a/tika-integration-tests/pom.xml 
b/tika-integration-tests/tika-woodstox-tests/pom.xml
similarity index 54%
copy from tika-integration-tests/pom.xml
copy to tika-integration-tests/tika-woodstox-tests/pom.xml
index 897e8fb02..be0f1e9ee 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-woodstox-tests/pom.xml
@@ -17,50 +17,27 @@
   specific language governing permissions and limitations
   under the License.
 -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
https://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
+    <artifactId>tika-integration-tests</artifactId>
     <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parent</artifactId>
-    <version>3.2.2</version>
-    <relativePath>../tika-parent/pom.xml</relativePath>
+    <version>4.0.0-SNAPSHOT</version>
   </parent>
-  <modelVersion>4.0.0</modelVersion>
-
-  <artifactId>tika-integration-tests</artifactId>
-  <name>Apache Tika integration tests</name>
 
-  <packaging>pom</packaging>
+  <modelVersion>4.0.0</modelVersion>
 
-  <modules>
-    <module>tika-pipes-solr-integration-tests</module>
-    <module>tika-pipes-opensearch-integration-tests</module>
-    <module>tika-pipes-s3-integration-tests</module>
-    <module>tika-resource-loading-tests</module>
-    <module>tika-pipes-kafka-integration-tests</module>
-  </modules>
+  <artifactId>tika-woodstox-tests</artifactId>
 
   <dependencies>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-core</artifactId>
       <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-serialization</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
     </dependency>
-    <!-- after we migrate everything to junit5, we can get rid of this -->
     <dependency>
-      <groupId>org.junit.vintage</groupId>
-      <artifactId>junit-vintage-engine</artifactId>
-      <scope>test</scope>
+      <groupId>com.fasterxml.woodstox</groupId>
+      <artifactId>woodstox-core</artifactId>
     </dependency>
   </dependencies>
 
-  <scm>
-    <tag>3.2.2-rc1</tag>
-  </scm>
-</project>
+</project>
\ No newline at end of file
diff --git 
a/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
 
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
new file mode 100644
index 000000000..c0d56cd7c
--- /dev/null
+++ 
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.woodstox;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import java.io.ByteArrayInputStream;
+import java.net.ConnectException;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * This confirms that XML parsing still works with woodstox on the classpath
+ */
+public class WoodstoxXMLReaderUtilsTest {
+
+    private static final Locale defaultLocale = Locale.getDefault();
+    static {
+        //tests on content of Exception msgs require specifying locale.
+        //even this, though is not sufficient for the billion laughs tests ?!
+        Locale.setDefault(Locale.US);
+    }
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " 
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " 
<!ELEMENT lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + 
"<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
+    @AfterAll
+    public static void tearDown() {
+        Locale.setDefault(defaultLocale);
+    }
+
+    //make sure that parseSAX actually defends against external entities
+    @Test
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+                XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                
assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullStack = ExceptionUtils.getStackTrace(e);
+                if (fullStack.contains("Undeclared general entity")) {
+                    continue;
+                }
+                throw e;
+            }
+        }
+    }
+
+    @Test
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+
+        //some implementations ignore the expandEntityReferences=false, and we 
are still
+        //protected by the "The parser has encountered more than "20" entity 
expansions" SAXException.
+        //We need to check for either: empty content and no exception, or this 
SAXException
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = null;
+            try {
+                doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+                continue;
+            }
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with: "The parser has encountered more than 
"20" entity expansions in this document; this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                String msg = e.getLocalizedMessage();
+                //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) { //standard Java
+                        continue;
+                    }
+                }
+                throw e;
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullTrace = ExceptionUtils.getStackTrace(e);
+                if (fullTrace.contains("Undeclared general entity")) {
+                    continue;
+                } else if (fullTrace.contains("Maximum entity expansion 
count")) {
+                    continue;
+                }
+                throw e;
+            }
+        }
+    }
+
+    private void limitCheck(SAXException e) throws SAXException {
+        String msg = e.getLocalizedMessage();
+        if (msg == null) {
+            throw e;
+        }
+
+        //depending on the flavor/version of the jdk, entity expansions may be 
triggered
+        // OR entitySizeLimit may be triggered
+        //See TIKA-4471
+        if (msg.contains("JAXP00010001") || //entity expansions
+                msg.contains("JAXP00010003") || //max entity size limit
+                msg.contains("JAXP00010004") || //TotalEntitySizeLimit
+                msg.contains("entity expansions") ||
+                e.getMessage().contains("maxGeneralEntitySizeLimit")) {
+            return;
+        }
+        throw e;
+    }
+}

Reply via email to