This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 16de8cc0e TIKA-4482 -- don't fail if woodstox is on the classpath (#2320)
16de8cc0e is described below
commit 16de8cc0efcdb21785a448d1c2dbd8a1c925dfc2
Author: Tim Allison <[email protected]>
AuthorDate: Wed Sep 10 10:42:47 2025 -0400
TIKA-4482 -- don't fail if woodstox is on the classpath (#2320)
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 22 +++++++++-
.../org/apache/tika/utils/XMLReaderUtilsTest.java | 32 ++++++++++-----
tika-integration-tests/pom.xml | 1 +
.../{ => tika-woodstox-tests}/pom.xml | 37 ++++-------------
.../tika/woodstox/WoodstoxXMLReaderUtilsTest.java | 47 ++++++++++++++++------
5 files changed, 86 insertions(+), 53 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 6a6a9dfc3..fe57f04ee 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -37,12 +37,14 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLResolver;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXTransformerFactory;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -123,6 +125,11 @@ public class XMLReaderUtils implements Serializable {
private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER =
(publicId, systemId) -> new InputSource(new StringReader(""));
+
+ //BE CAREFUL with the return type. Some parsers will silently ignore an unexpected return type: CVE-2025-54988
+ private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
+ (publicID, systemID, baseURI, namespace) ->
+ UnsynchronizedByteArrayInputStream.nullInputStream();
/**
* Parser pool size
*/
@@ -302,12 +309,17 @@ public class XMLReaderUtils implements Serializable {
if (LOG.isDebugEnabled()) {
LOG.debug("XMLInputFactory class {}", factory.getClass());
}
- factory.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, "");
+
tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE,
true);
+
+ //try to configure secure processing
+ tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
tryToSetStaxProperty(factory,
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+ //defense in depth
+ factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
trySetStaxSecurityManager(factory);
return factory;
}
@@ -361,6 +373,14 @@ public class XMLReaderUtils implements Serializable {
}
}
+ private static void tryToSetStaxProperty(XMLInputFactory factory, String
key, String value) {
+ try {
+ factory.setProperty(key, value);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("StAX Feature unsupported: {}", key, e);
+ }
+ }
+
/**
* Returns a new transformer
* <p>
diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 9f14f6636..310a8b158 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -25,6 +25,7 @@ import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.AfterAll;
@@ -37,6 +38,15 @@ import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
+/**
+ * Class to test that XMLReaderUtils defends against xxe and billion laughs.
+ * <p>
+ * Different versions and different implementations vary. This is not a fully comprehensive set of tests.
+ * <p>
+ * Please add more.
+ * <p>
+ * See also the tests with woodstox in tika-woodstox-tests.
+ */
public class XMLReaderUtilsTest {
private static final Locale defaultLocale = Locale.getDefault();
@@ -119,7 +129,7 @@ public class XMLReaderUtilsTest {
public void testStax() throws Exception {
for (String xml : EXTERNAL_ENTITY_XMLS) {
try {
- javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader =
xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
StringBuilder sb = new StringBuilder();
while (reader.hasNext()) {
@@ -200,7 +210,7 @@ public class XMLReaderUtilsTest {
*/
for (String xml : BILLION_LAUGHS) {
- javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader = xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
try {
while (reader.hasNext()) {
@@ -208,15 +218,17 @@ public class XMLReaderUtilsTest {
}
} catch (NoSuchElementException e) {
//full message on temurin-17: The entity "lol9" was referenced, but not declared.
- if (e.getMessage() != null && e
- .getMessage()
- .contains("referenced") && e
- .getMessage()
- .contains("not declared")) {
- //swallow -- this is expected
- } else {
- throw e;
+ String msg = e.getLocalizedMessage();
+
+ if (msg != null) {
+ if (msg.contains("referenced") && msg.contains("not
declared")) {
+ continue;
+ } else if (msg.contains("JAXP00010001")) {
+ continue;
+ }
}
+ throw e;
+
}
}
}
diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml
index 8dbb87c8a..8e472929f 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/pom.xml
@@ -37,6 +37,7 @@
<module>tika-pipes-s3-integration-tests</module>
<module>tika-resource-loading-tests</module>
<module>tika-pipes-kafka-integration-tests</module>
+ <module>tika-woodstox-tests</module>
</modules>
<dependencies>
diff --git a/tika-integration-tests/pom.xml
b/tika-integration-tests/tika-woodstox-tests/pom.xml
similarity index 55%
copy from tika-integration-tests/pom.xml
copy to tika-integration-tests/tika-woodstox-tests/pom.xml
index 8dbb87c8a..be0f1e9ee 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-woodstox-tests/pom.xml
@@ -17,50 +17,27 @@
specific language governing permissions and limitations
under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
+ <artifactId>tika-integration-tests</artifactId>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
<version>4.0.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
</parent>
- <modelVersion>4.0.0</modelVersion>
-
- <artifactId>tika-integration-tests</artifactId>
- <name>Apache Tika integration tests</name>
- <packaging>pom</packaging>
+ <modelVersion>4.0.0</modelVersion>
- <modules>
- <module>tika-pipes-solr-integration-tests</module>
- <module>tika-pipes-opensearch-integration-tests</module>
- <module>tika-pipes-s3-integration-tests</module>
- <module>tika-resource-loading-tests</module>
- <module>tika-pipes-kafka-integration-tests</module>
- </modules>
+ <artifactId>tika-woodstox-tests</artifactId>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-serialization</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
</dependency>
- <!-- after we migrate everything to junit5, we can get rid of this -->
<dependency>
- <groupId>org.junit.vintage</groupId>
- <artifactId>junit-vintage-engine</artifactId>
- <scope>test</scope>
+ <groupId>com.fasterxml.woodstox</groupId>
+ <artifactId>woodstox-core</artifactId>
</dependency>
</dependencies>
- <scm>
- <tag>3.0.0-rc1</tag>
- </scm>
-</project>
+</project>
\ No newline at end of file
diff --git
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
similarity index 85%
copy from tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
copy to
tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
index 9f14f6636..c0d56cd7c 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
@@ -14,9 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.utils;
+package org.apache.tika.woodstox;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.ByteArrayInputStream;
@@ -25,6 +26,7 @@ import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.AfterAll;
@@ -36,8 +38,13 @@ import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.XMLReaderUtils;
-public class XMLReaderUtilsTest {
+/**
+ * This confirms that XML parsing still works with woodstox on the classpath
+ */
+public class WoodstoxXMLReaderUtilsTest {
private static final Locale defaultLocale = Locale.getDefault();
static {
@@ -119,8 +126,9 @@ public class XMLReaderUtilsTest {
public void testStax() throws Exception {
for (String xml : EXTERNAL_ENTITY_XMLS) {
try {
- javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader =
xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+
assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
StringBuilder sb = new StringBuilder();
while (reader.hasNext()) {
sb.append(reader.next());
@@ -138,6 +146,13 @@ public class XMLReaderUtilsTest {
fail("Vulnerable to local file read via external
entity/dtd: " + xml, e);
}
}
+ } catch (RuntimeException e) {
+ //woodstox
+ String fullStack = ExceptionUtils.getStackTrace(e);
+ if (fullStack.contains("Undeclared general entity")) {
+ continue;
+ }
+ throw e;
}
}
}
@@ -200,23 +215,31 @@ public class XMLReaderUtilsTest {
*/
for (String xml : BILLION_LAUGHS) {
- javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+ XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory(new ParseContext());
XMLEventReader reader = xmlInputFactory.createXMLEventReader(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+ assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
try {
while (reader.hasNext()) {
reader.next();
}
} catch (NoSuchElementException e) {
+ String msg = e.getLocalizedMessage();
//full message on temurin-17: The entity "lol9" was
referenced, but not declared.
- if (e.getMessage() != null && e
- .getMessage()
- .contains("referenced") && e
- .getMessage()
- .contains("not declared")) {
- //swallow -- this is expected
- } else {
- throw e;
+ if (msg != null) {
+ if (msg.contains("referenced") && msg.contains("not
declared")) { //standard Java
+ continue;
+ }
+ }
+ throw e;
+ } catch (RuntimeException e) {
+ //woodstox
+ String fullTrace = ExceptionUtils.getStackTrace(e);
+ if (fullTrace.contains("Undeclared general entity")) {
+ continue;
+ } else if (fullTrace.contains("Maximum entity expansion
count")) {
+ continue;
}
+ throw e;
}
}
}