This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 16de8cc0e TIKA-4482 -- don't fail if woodstox is on the classpath 
(#2320)
16de8cc0e is described below

commit 16de8cc0efcdb21785a448d1c2dbd8a1c925dfc2
Author: Tim Allison <[email protected]>
AuthorDate: Wed Sep 10 10:42:47 2025 -0400

    TIKA-4482 -- don't fail if woodstox is on the classpath (#2320)
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 22 +++++++++-
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 32 ++++++++++-----
 tika-integration-tests/pom.xml                     |  1 +
 .../{ => tika-woodstox-tests}/pom.xml              | 37 ++++-------------
 .../tika/woodstox/WoodstoxXMLReaderUtilsTest.java  | 47 ++++++++++++++++------
 5 files changed, 86 insertions(+), 53 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 6a6a9dfc3..fe57f04ee 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -37,12 +37,14 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLResolver;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.TransformerFactoryConfigurationError;
 import javax.xml.transform.sax.SAXTransformerFactory;
 
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
@@ -123,6 +125,11 @@ public class XMLReaderUtils implements Serializable {
     private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER =
             (publicId, systemId) -> new InputSource(new StringReader(""));
+
+    //BE CAREFUL with the return type. Some parsers will silently ignore an 
unexpected return type: CVE-2025-54988
+    private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
+            (publicID, systemID, baseURI, namespace) ->
+                    UnsynchronizedByteArrayInputStream.nullInputStream();
     /**
      * Parser pool size
      */
@@ -302,12 +309,17 @@ public class XMLReaderUtils implements Serializable {
         if (LOG.isDebugEnabled()) {
             LOG.debug("XMLInputFactory class {}", factory.getClass());
         }
-        factory.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, "");
+
         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, 
true);
+
+        //try to configure secure processing
+        tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
         tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false);
         tryToSetStaxProperty(factory, 
XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
 
+        //defense in depth
+        factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
         trySetStaxSecurityManager(factory);
         return factory;
     }
@@ -361,6 +373,14 @@ public class XMLReaderUtils implements Serializable {
         }
     }
 
+    private static void tryToSetStaxProperty(XMLInputFactory factory, String 
key, String value) {
+        try {
+            factory.setProperty(key, value);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("StAX Feature unsupported: {}", key, e);
+        }
+    }
+
     /**
      * Returns a new transformer
      * <p>
diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 9f14f6636..310a8b158 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -25,6 +25,7 @@ import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.NoSuchElementException;
 import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLStreamException;
 
 import org.junit.jupiter.api.AfterAll;
@@ -37,6 +38,15 @@ import org.xml.sax.SAXException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ToTextContentHandler;
 
+/**
+ * Class to test that XMLReaderUtils defends against xxe and billion laughs.
+ * <p>
+ * Different versions and different implementations vary. This is not a fully 
comprehensive set of tests.
+ * <p>
+ * Please add more.
+ * <p>
+ * See also the tests with woodstox in tika-woodstox-tests.
+ */
 public class XMLReaderUtilsTest {
 
     private static final Locale defaultLocale = Locale.getDefault();
@@ -119,7 +129,7 @@ public class XMLReaderUtilsTest {
     public void testStax() throws Exception {
         for (String xml : EXTERNAL_ENTITY_XMLS) {
             try {
-                javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
                 XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
                 StringBuilder sb = new StringBuilder();
                 while (reader.hasNext()) {
@@ -200,7 +210,7 @@ public class XMLReaderUtilsTest {
          */
 
         for (String xml : BILLION_LAUGHS) {
-            javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
             XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
             try {
                 while (reader.hasNext()) {
@@ -208,15 +218,17 @@ public class XMLReaderUtilsTest {
                 }
             } catch (NoSuchElementException e) {
                 //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
-                if (e.getMessage() != null && e
-                        .getMessage()
-                        .contains("referenced") && e
-                        .getMessage()
-                        .contains("not declared")) {
-                    //swallow -- this is expected
-                } else {
-                    throw e;
+                String msg = e.getLocalizedMessage();
+
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) {
+                        continue;
+                    } else if (msg.contains("JAXP00010001")) {
+                        continue;
+                    }
                 }
+                throw e;
+
             }
         }
     }
diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml
index 8dbb87c8a..8e472929f 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/pom.xml
@@ -37,6 +37,7 @@
     <module>tika-pipes-s3-integration-tests</module>
     <module>tika-resource-loading-tests</module>
     <module>tika-pipes-kafka-integration-tests</module>
+    <module>tika-woodstox-tests</module>
   </modules>
 
   <dependencies>
diff --git a/tika-integration-tests/pom.xml 
b/tika-integration-tests/tika-woodstox-tests/pom.xml
similarity index 55%
copy from tika-integration-tests/pom.xml
copy to tika-integration-tests/tika-woodstox-tests/pom.xml
index 8dbb87c8a..be0f1e9ee 100644
--- a/tika-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-woodstox-tests/pom.xml
@@ -17,50 +17,27 @@
   specific language governing permissions and limitations
   under the License.
 -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
https://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
+    <artifactId>tika-integration-tests</artifactId>
     <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parent</artifactId>
     <version>4.0.0-SNAPSHOT</version>
-    <relativePath>../tika-parent/pom.xml</relativePath>
   </parent>
-  <modelVersion>4.0.0</modelVersion>
-
-  <artifactId>tika-integration-tests</artifactId>
-  <name>Apache Tika integration tests</name>
 
-  <packaging>pom</packaging>
+  <modelVersion>4.0.0</modelVersion>
 
-  <modules>
-    <module>tika-pipes-solr-integration-tests</module>
-    <module>tika-pipes-opensearch-integration-tests</module>
-    <module>tika-pipes-s3-integration-tests</module>
-    <module>tika-resource-loading-tests</module>
-    <module>tika-pipes-kafka-integration-tests</module>
-  </modules>
+  <artifactId>tika-woodstox-tests</artifactId>
 
   <dependencies>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-core</artifactId>
       <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-serialization</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
     </dependency>
-    <!-- after we migrate everything to junit5, we can get rid of this -->
     <dependency>
-      <groupId>org.junit.vintage</groupId>
-      <artifactId>junit-vintage-engine</artifactId>
-      <scope>test</scope>
+      <groupId>com.fasterxml.woodstox</groupId>
+      <artifactId>woodstox-core</artifactId>
     </dependency>
   </dependencies>
 
-  <scm>
-    <tag>3.0.0-rc1</tag>
-  </scm>
-</project>
+</project>
\ No newline at end of file
diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
similarity index 85%
copy from tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
copy to 
tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
index 9f14f6636..c0d56cd7c 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ 
b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java
@@ -14,9 +14,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.utils;
+package org.apache.tika.woodstox;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.ByteArrayInputStream;
@@ -25,6 +26,7 @@ import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.NoSuchElementException;
 import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLStreamException;
 
 import org.junit.jupiter.api.AfterAll;
@@ -36,8 +38,13 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.XMLReaderUtils;
 
-public class XMLReaderUtilsTest {
+/**
+ * This confirms that XML parsing still works with woodstox on the classpath
+ */
+public class WoodstoxXMLReaderUtilsTest {
 
     private static final Locale defaultLocale = Locale.getDefault();
     static {
@@ -119,8 +126,9 @@ public class XMLReaderUtilsTest {
     public void testStax() throws Exception {
         for (String xml : EXTERNAL_ENTITY_XMLS) {
             try {
-                javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+                XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
                 XMLEventReader reader = 
xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                
assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
                 StringBuilder sb = new StringBuilder();
                 while (reader.hasNext()) {
                     sb.append(reader.next());
@@ -138,6 +146,13 @@ public class XMLReaderUtilsTest {
                         fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
                     }
                 }
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullStack = ExceptionUtils.getStackTrace(e);
+                if (fullStack.contains("Undeclared general entity")) {
+                    continue;
+                }
+                throw e;
             }
         }
     }
@@ -200,23 +215,31 @@ public class XMLReaderUtilsTest {
          */
 
         for (String xml : BILLION_LAUGHS) {
-            javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
+            XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory(new ParseContext());
             XMLEventReader reader = xmlInputFactory.createXMLEventReader(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            assertTrue(reader.getClass().getName().contains("com.ctc.wstx"));
             try {
                 while (reader.hasNext()) {
                     reader.next();
                 }
             } catch (NoSuchElementException e) {
+                String msg = e.getLocalizedMessage();
                 //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
-                if (e.getMessage() != null && e
-                        .getMessage()
-                        .contains("referenced") && e
-                        .getMessage()
-                        .contains("not declared")) {
-                    //swallow -- this is expected
-                } else {
-                    throw e;
+                if (msg != null) {
+                    if (msg.contains("referenced") && msg.contains("not 
declared")) { //standard Java
+                        continue;
+                    }
+                }
+                throw e;
+            } catch (RuntimeException e) {
+                //woodstox
+                String fullTrace = ExceptionUtils.getStackTrace(e);
+                if (fullTrace.contains("Undeclared general entity")) {
+                    continue;
+                } else if (fullTrace.contains("Maximum entity expansion 
count")) {
+                    continue;
                 }
+                throw e;
             }
         }
     }

Reply via email to