This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new fd557d4e01 Addressing CVE-2025-66516 in branch_1x (#2437)
fd557d4e01 is described below

commit fd557d4e0187fd870b7b8809dc80271733159d98
Author: Julian Reschke <[email protected]>
AuthorDate: Thu Dec 11 14:12:41 2025 +0100

    Addressing CVE-2025-66516 in branch_1x (#2437)
    
    * Addressing CVE-2025-66516
    
    Disabling the processing of external entities when parsing XML.
    
    * Addressing CVE-2025-54988
    
    Backport of missing patch.
    
    ---------
    
    Co-authored-by: Manfred Baedke <[email protected]>
---
 tika-core/pom.xml                                  |  12 +
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  26 +-
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 254 ++++++++++++++++++++
 tika-server/pom.xml                                |  12 +
 .../org/apache/tika/server/ExceptionUtils.java     |  91 +++++++
 .../tika/server/WoodstoxXMLReaderUtilsTest.java    | 263 +++++++++++++++++++++
 .../org/apache/tika/server}/XMLReaderUtils.java    |  29 ++-
 7 files changed, 670 insertions(+), 17 deletions(-)

diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index 74cf429797..aab5d76350 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -97,6 +97,18 @@
             <artifactId>junit</artifactId>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <version>5.9.3</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.junit.jupiter</groupId>
+          <artifactId>junit-jupiter-engine</artifactId>
+          <version>5.9.3</version>
+          <scope>test</scope>
+        </dependency>
     </dependencies>
 
     <build>
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 8066227e8d..4a89ceab2f 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -17,6 +17,7 @@
 
 package org.apache.tika.utils;
 
+import org.apache.commons.io.input.NullInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.OfflineContentHandler;
@@ -40,7 +41,6 @@ import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLResolver;
-import javax.xml.stream.XMLStreamException;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactory;
@@ -161,14 +161,10 @@ public class XMLReaderUtils implements Serializable {
         }
     };
 
+    //BE CAREFUL with the return type. Some parsers will silently ignore an 
unexpected return type: CVE-2025-54988
     private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
-            new XMLResolver() {
-                @Override
-                public Object resolveEntity(String publicID, String systemID, 
String baseURI, String namespace) throws
-                        XMLStreamException {
-                    return "";
-                }
-            };
+            (publicID, systemID, baseURI, namespace) -> new NullInputStream();
+
 
     /**
      * Set the maximum number of entity expansions allowable in SAX/DOM/StAX 
parsing.
@@ -326,8 +322,14 @@ public class XMLReaderUtils implements Serializable {
         XMLInputFactory factory = XMLInputFactory.newFactory();
 
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, 
true);
+
+        //try to configure secure processing
+        tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
+        tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
+        tryToSetStaxProperty(factory, 
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
 
+        //defense in depth
         factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
         trySetStaxSecurityManager(factory);
         return factory;
@@ -375,6 +377,14 @@ public class XMLReaderUtils implements Serializable {
         }
     }
 
+    private static void tryToSetStaxProperty(XMLInputFactory factory, String 
key, String value) {
+        try {
+            factory.setProperty(key, value);
+        } catch (IllegalArgumentException e) {
+            LOG.log(Level.WARNING,"StAX Feature unsupported: " + key, e);
+        }
+    }
+
     /**
      * Returns a new transformer
      * <p>
diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
new file mode 100755
index 0000000000..8472b75d65
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import java.io.ByteArrayInputStream;
+import java.net.ConnectException;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+
+/**
+ * Class to test that XMLReaderUtils defends against xxe and billion laughs.
+ * <p>
+ * Different versions and different implementations vary. This is not a fully 
comprehensive set of tests.
+ * <p>
+ * Please add more.
+ * <p>
+ * See also the woodstox variant of these tests, WoodstoxXMLReaderUtilsTest in tika-server.
+ */
+public class XMLReaderUtilsTest {
+
+    private static final Locale defaultLocale = Locale.getDefault();
+    static {
+        //tests on content of Exception msgs require specifying locale.
+        //even this, though, is not sufficient for the billion laughs tests?!
+        Locale.setDefault(Locale.US);
+    }
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " 
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " 
<!ELEMENT lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + 
"<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
+    @AfterAll
+    public static void tearDown() {
+        Locale.setDefault(defaultLocale);
+    }
+
+    //make sure that parseSAX actually defends against external entities
+    @Test
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+                XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+
+        //some implementations ignore the expandEntityReferences=false, and we 
are still
+        //protected by the "The parser has encountered more than "20" entity 
expansions" SAXException.
+        //We need to check for either: empty content and no exception, or this 
SAXException
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = null;
+            try {
+                doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+                continue;
+            }
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with: "The parser has encountered more than 
"20" entity expansions in this document; this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
+                String msg = e.getLocalizedMessage();
+
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) {
+                        continue;
+                    } else if (msg.contains("JAXP00010001")) {
+                        continue;
+                    }
+                }
+                throw e;
+
+            }
+        }
+    }
+
+    private void limitCheck(SAXException e) throws SAXException {
+        String msg = e.getLocalizedMessage();
+        if (msg == null) {
+            throw e;
+        }
+
+        //depending on the flavor/version of the jdk, entity expansions may be 
triggered
+        // OR entitySizeLimit may be triggered
+        //See TIKA-4471
+        if (msg.contains("JAXP00010001") || //entity expansions
+                msg.contains("JAXP00010003") || //max entity size limit
+                msg.contains("JAXP00010004") || //TotalEntitySizeLimit
+                msg.contains("entity expansions") ||
+                e.getMessage().contains("maxGeneralEntitySizeLimit")) {
+            return;
+        }
+        throw e;
+    }
+}
\ No newline at end of file
diff --git a/tika-server/pom.xml b/tika-server/pom.xml
index 570d23e00c..bf4fec1a55 100644
--- a/tika-server/pom.xml
+++ b/tika-server/pom.xml
@@ -198,6 +198,18 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-api</artifactId>
+      <version>5.9.3</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-engine</artifactId>
+      <version>5.9.3</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git 
a/tika-server/src/test/java/org/apache/tika/server/ExceptionUtils.java 
b/tika-server/src/test/java/org/apache/tika/server/ExceptionUtils.java
new file mode 100755
index 0000000000..3364ab81a0
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/ExceptionUtils.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+
+//Temporarily copied from tika-core until a better solution is found
+public class ExceptionUtils {
+
+    private final static Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+");
+
+    /**
+     * Simple util to get stack trace.
+     * <p>
+     * This will unwrap a TikaException and return the cause if not null
+     * <p>
+     * NOTE: If your stacktraces are truncated, make sure to start your jvm
+     * with: -XX:-OmitStackTraceInFastThrow
+     *
+     * @param t throwable
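+     * @return stack trace of the cause when {@code t} is exactly a TikaException
+     *         with a non-null cause, otherwise the stack trace of {@code t} itself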
+     */
+    public static String getFilteredStackTrace(Throwable t) {
+        Throwable cause = t;
+        if ((t.getClass().equals(TikaException.class)) &&
+                t.getCause() != null) {
+            cause = t.getCause();
+        }
+        return getStackTrace(cause);
+    }
+
+    /**
+     * Get the full stacktrace as a string
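+     * @param t throwable whose stack trace is printed
+     * @return everything {@code t.printStackTrace} wrote, as a single string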
+     */
+    public static String getStackTrace(Throwable t) {
+        Writer result = new StringWriter();
+        PrintWriter writer = new PrintWriter(result);
+        t.printStackTrace(writer);
+        try {
+            writer.flush();
+            result.flush();
+            writer.close();
+            result.close();
+        } catch (IOException e) {
+            //swallow
+        }
+        return result.toString();
+    }
+
+    /**
+     * Utility method to trim the message from a stack trace
+     * string.
+     * <p>
+     * E.g. <code>java.lang.IllegalStateException: Potential loop detected 
</code>
+     * will be trimmed to <code>java.lang.IllegalStateException</code>
+     * @param trace string view of stack trace
+     * @return trimmed stack trace
+     */
+    public static String trimMessage(String trace) {
+        Matcher msgMatcher = MSG_PATTERN.matcher(trace);
+        if (msgMatcher.find()) {
+            return msgMatcher.replaceFirst("");
+        }
+        return trace;
+    }
+}
diff --git 
a/tika-server/src/test/java/org/apache/tika/server/WoodstoxXMLReaderUtilsTest.java
 
b/tika-server/src/test/java/org/apache/tika/server/WoodstoxXMLReaderUtilsTest.java
new file mode 100755
index 0000000000..d4357353b6
--- /dev/null
+++ 
b/tika-server/src/test/java/org/apache/tika/server/WoodstoxXMLReaderUtilsTest.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import java.io.ByteArrayInputStream;
+import java.net.ConnectException;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ToTextContentHandler;
+
+/**
+ * This confirms that XML parsing still works with woodstox on the classpath
+ */
+public class WoodstoxXMLReaderUtilsTest {
+
+    private static final Locale defaultLocale = Locale.getDefault();
+    static {
+        //tests on content of Exception msgs require specifying locale.
+        //even this, though, is not sufficient for the billion laughs tests?!
+        Locale.setDefault(Locale.US);
+    }
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml 
version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " 
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " 
<!ELEMENT lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + 
"<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
+
+    @AfterAll
+    public static void tearDown() {
+        Locale.setDefault(defaultLocale);
+    }
+
+    //make sure that parseSAX actually defends against external entities
+    @Test
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+                XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                
assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullStack = ExceptionUtils.getStackTrace(e);
+                if (fullStack.contains("Undeclared general entity")) {
+                    continue;
+                }
+                throw e;
+            }
+        }
+    }
+
+    @Test
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+
+        //some implementations ignore the expandEntityReferences=false, and we 
are still
+        //protected by the "The parser has encountered more than "20" entity 
expansions" SAXException.
+        //We need to check for either: empty content and no exception, or this 
SAXException
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = null;
+            try {
+                doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext());
+            } catch (SAXException e) {
+                limitCheck(e);
+                continue;
+            }
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with: "The parser has encountered more than 
"20" entity expansions in this document; this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                String msg = e.getLocalizedMessage();
+                //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) { //standard Java
+                        continue;
+                    }
+                }
+                throw e;
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullTrace = ExceptionUtils.getStackTrace(e);
+                if (fullTrace.contains("Undeclared general entity")) {
+                    continue;
+                } else if (fullTrace.contains("Maximum entity expansion 
count")) {
+                    continue;
+                }
+                throw e;
+            }
+        }
+    }
+
+    private void limitCheck(SAXException e) throws SAXException {
+        String msg = e.getLocalizedMessage();
+        if (msg == null) {
+            throw e;
+        }
+
+        //depending on the flavor/version of the jdk, entity expansions may be 
triggered
+        // OR entitySizeLimit may be triggered
+        //See TIKA-4471
+        if (msg.contains("JAXP00010001") || //entity expansions
+                msg.contains("JAXP00010003") || //max entity size limit
+                msg.contains("JAXP00010004") || //TotalEntitySizeLimit
+                msg.contains("entity expansions") ||
+                e.getMessage().contains("maxGeneralEntitySizeLimit")) {
+            return;
+        }
+        throw e;
+    }
+}
\ No newline at end of file
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-server/src/test/java/org/apache/tika/server/XMLReaderUtils.java
old mode 100644
new mode 100755
similarity index 97%
copy from tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
copy to tika-server/src/test/java/org/apache/tika/server/XMLReaderUtils.java
index 8066227e8d..08440f4313
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-server/src/test/java/org/apache/tika/server/XMLReaderUtils.java
@@ -15,8 +15,9 @@
  * limitations under the License.
  */
 
-package org.apache.tika.utils;
+package org.apache.tika.server;
 
+import org.apache.commons.io.input.NullInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.OfflineContentHandler;
@@ -40,7 +41,6 @@ import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLResolver;
-import javax.xml.stream.XMLStreamException;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactory;
@@ -64,6 +64,7 @@ import java.util.logging.Logger;
  * to use the {@link OfflineContentHandler} to guard against
  * XML External Entity attacks.
  */
+//Temporarily copied from tika-core until a better solution is found
 public class XMLReaderUtils implements Serializable {
 
     /**
@@ -161,14 +162,10 @@ public class XMLReaderUtils implements Serializable {
         }
     };
 
+    //BE CAREFUL with the return type. Some parsers will silently ignore an unexpected return type: CVE-2025-54988
     private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
-            new XMLResolver() {
-                @Override
-                public Object resolveEntity(String publicID, String systemID, 
String baseURI, String namespace) throws
-                        XMLStreamException {
-                    return "";
-                }
-            };
+            (publicID, systemID, baseURI, namespace) -> new NullInputStream();
+
 
     /**
      * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
@@ -326,8 +323,14 @@ public class XMLReaderUtils implements Serializable {
         XMLInputFactory factory = XMLInputFactory.newFactory();
 
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, 
true);
+
+        //try to configure secure processing
+        tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
+        tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
+        tryToSetStaxProperty(factory, 
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
 
+        //defense in depth
         factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
         trySetStaxSecurityManager(factory);
         return factory;
@@ -375,6 +378,14 @@ public class XMLReaderUtils implements Serializable {
         }
     }
 
+    /**
+     * Best-effort attempt to set a string-valued property on a StAX input factory.
+     * If the factory rejects the property with an {@link IllegalArgumentException},
+     * the rejection is logged at WARNING level and not propagated.
+     *
+     * @param factory the factory to configure
+     * @param key     the property name
+     * @param value   the property value
+     */
+    private static void tryToSetStaxProperty(XMLInputFactory factory, String key, String value) {
+        try {
+            factory.setProperty(key, value);
+        } catch (IllegalArgumentException unsupported) {
+            final String warning = "StAX Feature unsupported: " + key;
+            LOG.log(Level.WARNING, warning, unsupported);
+        }
+    }
+
     /**
      * Returns a new transformer
      * <p>


Reply via email to