This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3_2x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 307faa22d892266615aa40bdfebeaebcc1aa465e Author: Tim Allison <[email protected]> AuthorDate: Wed Sep 10 10:42:47 2025 -0400 TIKA-4482 -- don't fail if woodstox is on the classpath (#2320) (cherry picked from commit 16de8cc0efcdb21785a448d1c2dbd8a1c925dfc2) --- .../java/org/apache/tika/utils/XMLReaderUtils.java | 22 +- .../org/apache/tika/utils/XMLReaderUtilsTest.java | 237 ++++++++++++++++-- tika-integration-tests/pom.xml | 1 + .../{ => tika-woodstox-tests}/pom.xml | 39 +-- .../tika/woodstox/WoodstoxXMLReaderUtilsTest.java | 265 +++++++++++++++++++++ 5 files changed, 506 insertions(+), 58 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index 6a6a9dfc3..fe57f04ee 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -37,12 +37,14 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLResolver; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.transform.sax.SAXTransformerFactory; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -123,6 +125,11 @@ public class XMLReaderUtils implements Serializable { private static final AtomicInteger POOL_GENERATION = new AtomicInteger(); private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = (publicId, systemId) -> new InputSource(new StringReader("")); + + //BE CAREFUL with the return type. Some parsers will silently ignore an unexpected return type: CVE-2025-54988 + private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER = + (publicID, systemID, baseURI, namespace) -> + UnsynchronizedByteArrayInputStream.nullInputStream(); /** * Parser pool size */ @@ -302,12 +309,17 @@ public class XMLReaderUtils implements Serializable { if (LOG.isDebugEnabled()) { LOG.debug("XMLInputFactory class {}", factory.getClass()); } - factory.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, ""); + tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true); + + //try to configure secure processing + tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false); tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); tryToSetStaxProperty(factory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + //defense in depth + factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER); trySetStaxSecurityManager(factory); return factory; } @@ -361,6 +373,14 @@ public class XMLReaderUtils implements Serializable { } } + private static void tryToSetStaxProperty(XMLInputFactory factory, String key, String value) { + try { + factory.setProperty(key, value); + } catch (IllegalArgumentException e) { + LOG.warn("StAX Feature unsupported: {}", key, e); + } + } + /** * Returns a new transformer * <p> diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java index 1d5371019..310a8b158 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java @@ -16,54 +16,239 @@ */ package org.apache.tika.utils; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.fail; import java.io.ByteArrayInputStream; import java.net.ConnectException; import java.nio.charset.StandardCharsets; +import java.util.Locale; +import java.util.NoSuchElementException; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ToTextContentHandler; +/** + * Class to test that XMLReaderUtils defends against xxe and billion laughs. + * <p> + * Different versions and different implementations vary. This is not a fully comprehensive set of tests. + * <p> + * Please add more. + * <p> + * See also the tests with woodstox in tika-woodstox-tests. + */ public class XMLReaderUtilsTest { + + private static final Locale defaultLocale = Locale.getDefault(); + static { + //tests on content of Exception msgs require specifying locale. + //even this, though is not sufficient for the billion laughs tests ?! + Locale.setDefault(Locale.US); + } + private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"tutorials.dtd\"><foo/>"; + private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>"; + private static final String EXTERNAL_ENTITY = "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" + + " ]><foo>&bar;</foo>"; + private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" + + "<!ENTITY % local_dtd SYSTEM \"file:///usr/local/app/schema.dtd\">" + + "%local_dtd;]><foo/>"; + + private static final String BILLION_LAUGHS_CLASSICAL = "<?xml version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " <!ELEMENT lolz (#PCDATA)>\n" + + " <!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" + + " <!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" + + " <!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" + + " <!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" + + " <!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" + + " <!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" + + " <!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" + + " <!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + "<lolz>&lol9;</lolz>"; + + private static String BILLION_LAUGHS_VARIANT; + + static { + StringBuilder entity = new StringBuilder(); + for (int i = 0; i < 1000000; i++) { + entity.append("a"); + } + StringBuilder xml = new StringBuilder(); + xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + " <!ENTITY a \""); + xml.append(entity.toString()); + xml.append("\">]>" + "<kaboom>"); + for (int i = 0; i < 100000; i++) { + xml.append("&a;"); + } + xml.append("</kaboom>"); + BILLION_LAUGHS_VARIANT = xml.toString(); + } + + private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL, + EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD }; + + private static final String[] BILLION_LAUGHS = new String[]{ BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT }; + + @AfterAll + public static void tearDown() { + Locale.setDefault(defaultLocale); + } + //make sure that parseSAX actually defends against external entities @Test - public void testExternalDTD() throws Exception { - String xml = "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>"; - try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); - } catch (ConnectException e) { - fail("Parser tried to access the external DTD:" + e); + public void testSAX() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); + } catch (ConnectException e) { + fail("Parser tried to access resource: " + xml, e); + } + } + } + + @Test + public void testDOM() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + } catch (ConnectException e) { + fail("Parser tried to access resource: " + xml, e); + } + } + } + + @Test + public void testStax() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + StringBuilder sb = new StringBuilder(); + while (reader.hasNext()) { + sb.append(reader.next()); + } + if (sb.toString().contains("Exception scanning External")) { + fail("tried to read external dtd"); + } + } catch (XMLStreamException e) { + fail("StreamException: " + xml, e); + } catch (NoSuchElementException e) { + if (e.getMessage() != null) { + if (e.getMessage().contains("Connection refused")) { + fail("Vulnerable to ssrf via url: " + xml, e); + } else if (e.getMessage().contains("No such file")) { + fail("Vulnerable to local file read via external entity/dtd: " + xml, e); + } + } + } } } @Test - public void testExternalEntity() throws Exception { - String xml = - "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" + - " ]><foo>&bar;</foo>"; - try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); - } catch (ConnectException e) { - fail("Parser tried to access the external DTD:" + e); + public void testSAXBillionLaughs() throws Exception { + for (String xml : BILLION_LAUGHS) { + try { + XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); + } catch (SAXException e) { + limitCheck(e); + } } } @Test - public void testExternalEntityLocal() throws Exception { - String xml = - "<!DOCTYPE foo [" + - "<!ENTITY % local_dtd SYSTEM \"file:///usr/local/app/schema.dtd\">" + - "%local_dtd;]><foo/>"; - try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); - } catch (ConnectException e) { - fail("Parser tried to access the external DTD:" + e); + public void testDOMBillionLaughs() throws Exception { + //confirm that ExpandEntityReferences has been set to false. + + //some implementations ignore the expandEntityReferences=false, and we are still + //protected by the "The parser has encountered more than "20" entity expansions" SAXException. + //We need to check for either: empty content and no exception, or this SAXException + for (String xml : BILLION_LAUGHS) { + Document doc = null; + try { + doc = XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + } catch (SAXException e) { + limitCheck(e); + continue; + } + NodeList nodeList = doc.getChildNodes(); + StringBuilder sb = new StringBuilder(); + dumpChildren(nodeList, sb); + assertEquals(0, sb + .toString() + .trim() + .length(), sb.toString()); + } + } + + private void dumpChildren(NodeList nodeList, StringBuilder sb) { + for (int i = 0; i < nodeList.getLength(); i++) { + Node n = nodeList.item(i); + String txt = n.getTextContent(); + if (txt != null) { + sb.append(txt); + } + } + } + + @Test + public void testStaxBillionLaughs() throws Exception { + /* + Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and + causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line: + tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); + If that line doesn't exist, then we get a + NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK." + */ + + for (String xml : BILLION_LAUGHS) { + XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + try { + while (reader.hasNext()) { + reader.next(); + } + } catch (NoSuchElementException e) { + //full message on temurin-17: The entity "lol9" was referenced, but not declared. + String msg = e.getLocalizedMessage(); + + if (msg != null) { + if (msg.contains("referenced") && msg.contains("not declared")) { + continue; + } else if (msg.contains("JAXP00010001")) { + continue; + } + } + throw e; + + } + } + } + + private void limitCheck(SAXException e) throws SAXException { + String msg = e.getLocalizedMessage(); + if (msg == null) { + throw e; + } + + //depending on the flavor/version of the jdk, entity expansions may be triggered + // OR entitySizeLimit may be triggered + //See TIKA-4471 + if (msg.contains("JAXP00010001") || //entity expansions + msg.contains("JAXP00010003") || //max entity size limit + msg.contains("JAXP00010004") || //TotalEntitySizeLimit + msg.contains("entity expansions") || + e.getMessage().contains("maxGeneralEntitySizeLimit")) { + return; } + throw e; } } diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml index 897e8fb02..da7a2c384 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-integration-tests/pom.xml @@ -37,6 +37,7 @@ <module>tika-pipes-s3-integration-tests</module> <module>tika-resource-loading-tests</module> <module>tika-pipes-kafka-integration-tests</module> + <module>tika-woodstox-tests</module> </modules> <dependencies> diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/tika-woodstox-tests/pom.xml similarity index 54% copy from tika-integration-tests/pom.xml copy to tika-integration-tests/tika-woodstox-tests/pom.xml index 897e8fb02..be0f1e9ee 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-integration-tests/tika-woodstox-tests/pom.xml @@ -17,50 +17,27 @@ specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <parent> + <artifactId>tika-integration-tests</artifactId> <groupId>org.apache.tika</groupId> - <artifactId>tika-parent</artifactId> - <version>3.2.2</version> - <relativePath>../tika-parent/pom.xml</relativePath> + <version>4.0.0-SNAPSHOT</version> </parent> - <modelVersion>4.0.0</modelVersion> - - <artifactId>tika-integration-tests</artifactId> - <name>Apache Tika integration tests</name> - <packaging>pom</packaging> + <modelVersion>4.0.0</modelVersion> - <modules> - <module>tika-pipes-solr-integration-tests</module> - <module>tika-pipes-opensearch-integration-tests</module> - <module>tika-pipes-s3-integration-tests</module> - <module>tika-resource-loading-tests</module> - <module>tika-pipes-kafka-integration-tests</module> - </modules> + <artifactId>tika-woodstox-tests</artifactId> <dependencies> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-serialization</artifactId> - <version>${project.version}</version> - <scope>test</scope> </dependency> - <!-- after we migrate everything to junit5, we can get rid of this --> <dependency> - <groupId>org.junit.vintage</groupId> - <artifactId>junit-vintage-engine</artifactId> - <scope>test</scope> + <groupId>com.fasterxml.woodstox</groupId> + <artifactId>woodstox-core</artifactId> </dependency> </dependencies> - <scm> - <tag>3.2.2-rc1</tag> - </scm> -</project> +</project> \ No newline at end of file diff --git a/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java new file mode 100644 index 000000000..c0d56cd7c --- /dev/null +++ b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.woodstox; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.io.ByteArrayInputStream; +import java.net.ConnectException; +import java.nio.charset.StandardCharsets; +import java.util.Locale; +import java.util.NoSuchElementException; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Test; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ToTextContentHandler; +import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * This confirms that XML parsing still works with woodstox on the classpath + */ +public class WoodstoxXMLReaderUtilsTest { + + private static final Locale defaultLocale = Locale.getDefault(); + static { + //tests on content of Exception msgs require specifying locale. + //even this, though is not sufficient for the billion laughs tests ?! + Locale.setDefault(Locale.US); + } + private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"tutorials.dtd\"><foo/>"; + private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>"; + private static final String EXTERNAL_ENTITY = "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" + + " ]><foo>&bar;</foo>"; + private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" + + "<!ENTITY % local_dtd SYSTEM \"file:///usr/local/app/schema.dtd\">" + + "%local_dtd;]><foo/>"; + + private static final String BILLION_LAUGHS_CLASSICAL = "<?xml version=\"1.0\"?>\n" + "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " <!ELEMENT lolz (#PCDATA)>\n" + + " <!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" + " <!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" + + " <!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" + + " <!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" + + " <!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" + + " <!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" + + " <!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" + + " <!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" + + " <!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" + "<lolz>&lol9;</lolz>"; + + private static String BILLION_LAUGHS_VARIANT; + + static { + StringBuilder entity = new StringBuilder(); + for (int i = 0; i < 1000000; i++) { + entity.append("a"); + } + StringBuilder xml = new StringBuilder(); + xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + " <!ENTITY a \""); + xml.append(entity.toString()); + xml.append("\">]>" + "<kaboom>"); + for (int i = 0; i < 100000; i++) { + xml.append("&a;"); + } + xml.append("</kaboom>"); + BILLION_LAUGHS_VARIANT = xml.toString(); + } + + private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL, + EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD }; + + private static final String[] BILLION_LAUGHS = new String[]{ BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT }; + + @AfterAll + public static void tearDown() { + Locale.setDefault(defaultLocale); + } + + //make sure that parseSAX actually defends against external entities + @Test + public void testSAX() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); + } catch (ConnectException e) { + fail("Parser tried to access resource: " + xml, e); + } + } + } + + @Test + public void testDOM() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + } catch (ConnectException e) { + fail("Parser tried to access resource: " + xml, e); + } + } + } + + @Test + public void testStax() throws Exception { + for (String xml : EXTERNAL_ENTITY_XMLS) { + try { + XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + assertTrue(reader.getClass().getName().contains("com.ctc.wstx")); + StringBuilder sb = new StringBuilder(); + while (reader.hasNext()) { + sb.append(reader.next()); + } + if (sb.toString().contains("Exception scanning External")) { + fail("tried to read external dtd"); + } + } catch (XMLStreamException e) { + fail("StreamException: " + xml, e); + } catch (NoSuchElementException e) { + if (e.getMessage() != null) { + if (e.getMessage().contains("Connection refused")) { + fail("Vulnerable to ssrf via url: " + xml, e); + } else if (e.getMessage().contains("No such file")) { + fail("Vulnerable to local file read via external entity/dtd: " + xml, e); + } + } + } catch (RuntimeException e) { + //woodstox + String fullStack = ExceptionUtils.getStackTrace(e); + if (fullStack.contains("Undeclared general entity")) { + continue; + } + throw e; + } + } + } + + @Test + public void testSAXBillionLaughs() throws Exception { + for (String xml : BILLION_LAUGHS) { + try { + XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); + } catch (SAXException e) { + limitCheck(e); + } + } + } + + @Test + public void testDOMBillionLaughs() throws Exception { + //confirm that ExpandEntityReferences has been set to false. + + //some implementations ignore the expandEntityReferences=false, and we are still + //protected by the "The parser has encountered more than "20" entity expansions" SAXException. + //We need to check for either: empty content and no exception, or this SAXException + for (String xml : BILLION_LAUGHS) { + Document doc = null; + try { + doc = XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + } catch (SAXException e) { + limitCheck(e); + continue; + } + NodeList nodeList = doc.getChildNodes(); + StringBuilder sb = new StringBuilder(); + dumpChildren(nodeList, sb); + assertEquals(0, sb + .toString() + .trim() + .length(), sb.toString()); + } + } + + private void dumpChildren(NodeList nodeList, StringBuilder sb) { + for (int i = 0; i < nodeList.getLength(); i++) { + Node n = nodeList.item(i); + String txt = n.getTextContent(); + if (txt != null) { + sb.append(txt); + } + } + } + + @Test + public void testStaxBillionLaughs() throws Exception { + /* + Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and + causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line: + tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); + If that line doesn't exist, then we get a + NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK." + */ + + for (String xml : BILLION_LAUGHS) { + XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + assertTrue(reader.getClass().getName().contains("com.ctc.wstx")); + try { + while (reader.hasNext()) { + reader.next(); + } + } catch (NoSuchElementException e) { + String msg = e.getLocalizedMessage(); + //full message on temurin-17: The entity "lol9" was referenced, but not declared. + if (msg != null) { + if (msg.contains("referenced") && msg.contains("not declared")) { //standard Java + continue; + } + } + throw e; + } catch (RuntimeException e) { + //woodstox + String fullTrace = ExceptionUtils.getStackTrace(e); + if (fullTrace.contains("Undeclared general entity")) { + continue; + } else if (fullTrace.contains("Maximum entity expansion count")) { + continue; + } + throw e; + } + } + } + + private void limitCheck(SAXException e) throws SAXException { + String msg = e.getLocalizedMessage(); + if (msg == null) { + throw e; + } + + //depending on the flavor/version of the jdk, entity expansions may be triggered + // OR entitySizeLimit may be triggered + //See TIKA-4471 + if (msg.contains("JAXP00010001") || //entity expansions + msg.contains("JAXP00010003") || //max entity size limit + msg.contains("JAXP00010004") || //TotalEntitySizeLimit + msg.contains("entity expansions") || + e.getMessage().contains("maxGeneralEntitySizeLimit")) { + return; + } + throw e; + } +}
