This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch ooxml-bigdecimal-dos
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5d478c4982744abef2988cd68a42d39d2564373f
Author: tallison <[email protected]>
AuthorDate: Tue May 26 15:18:58 2026 -0400

    fix potential sax dos
---
 .../microsoft/ooxml/SAXBasedMetadataExtractor.java |  47 ++++++-
 .../ooxml/SAXBasedMetadataExtractorTest.java       | 156 +++++++++++++++++++++
 .../core/testutil/AbstractConfigExamplesTest.java  | 147 +++++++++++++++++++
 3 files changed, 346 insertions(+), 4 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
index 632eb0c6c8..175db7660d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
@@ -56,6 +56,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
     private static final String CUSTOM_PROPERTIES_REL =
             
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";;
 
+    /**
+     * Hard cap on the accumulated text-content of a single property element.
+     * Real OOXML property values are at most a few hundred bytes; anything 
beyond
+     * this is either corruption or an attacker trying to drive memory or CPU
+     * pressure (cf. the {@code <vt:decimal>} BigDecimal DoS where a 1M-digit
+     * literal compresses ~1000:1 in deflate). 64 KB leaves headroom for any
+     * legitimate value while bounding the slow-path inputs decisively.
+     */
+    static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+    /**
+     * Hard cap on the {@code <vt:decimal>} text length passed to
+     * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+     * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+     * decimal values fit in well under 50 digits; 256 is generous.
+     */
+    static final int MAX_DECIMAL_LENGTH = 256;
+
     private final OPCPackage opcPackage;
     private final ParseContext parseContext;
 
@@ -186,10 +204,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor 
{
         SummaryExtractor.addMulti(metadata, property, value.get());
     }
 
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream parsing
+     * (which will either accept the prefix or reject it as a 
NumberFormatException).
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
     /**
      * SAX handler for docProps/app.xml (extended properties).
      */
-    private static class ExtendedPropertiesHandler extends DefaultHandler {
+    static class ExtendedPropertiesHandler extends DefaultHandler {
 
         private String application;
         private String appVersion;
@@ -219,7 +251,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
 
         @Override
         public void characters(char[] ch, int start, int length) {
-            textBuffer.append(ch, start, length);
+            appendCapped(textBuffer, ch, start, length);
         }
 
         @Override
@@ -364,7 +396,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
     /**
      * SAX handler for docProps/custom.xml (custom properties).
      */
-    private static class CustomPropertiesHandler extends DefaultHandler {
+    static class CustomPropertiesHandler extends DefaultHandler {
 
         private static final String VT_NS =
                 
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";;
@@ -387,7 +419,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
 
         @Override
         public void characters(char[] ch, int start, int length) {
-            textBuffer.append(ch, start, length);
+            appendCapped(textBuffer, ch, start, length);
         }
 
         @Override
@@ -429,6 +461,13 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
                         customMetadata.set(propName, val);
                         break;
                     case "decimal":
+                        // BigDecimal(String) is O(n²) on JDK 17; cap the input
+                        // length to keep an attacker-controlled <vt:decimal>
+                        // from burning CPU. Real values are < 50 chars; 256 is
+                        // generous. See ooxml-bigdecimal-dos.
+                        if (val.length() > MAX_DECIMAL_LENGTH) {
+                            break;
+                        }
                         try {
                             BigDecimal d = new BigDecimal(val);
                             customMetadata.set(propName, d.toPlainString());
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
new file mode 100644
index 0000000000..90c2921cce
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties 
xmlns=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        // Pre-fill to one short of the cap; next 3 chars should partially 
land.
+        char[] padding = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 
0, 3);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+        // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+        // attacker's 1M-digit DoS payload. With the cap in place this should
+        // complete in milliseconds and the property should NOT be set.
+        String hugeDigits = "9".repeat(1000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
hugeDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"),
+                "oversized decimal must be rejected, not parsed");
+        assertTrue(elapsedMs < 2_000,
+                "parse must complete quickly; took " + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadCompletesQuickly() throws 
Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+        // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+        // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+        // decimal-length check then rejects it.
+        String attackDigits = "9".repeat(1_000_000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"));
+        assertTrue(elapsedMs < 2_000,
+                "1M-digit attack payload must not trigger O(n²) BigDecimal; 
took "
+                        + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedStringIsTruncatedNotRejected() throws Exception {
+        // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+        // accumulation would still be a memory pressure vector. The buffer
+        // cap stops accumulation at 64 KB; the truncated value still flows.
+        String giantString = "a".repeat(200_000);
+        Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", 
giantString));
+        String got = m.get("custom:bigstr");
+        assertNotNull(got, "string-typed property survives truncation");
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
got.length(),
+                "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+    }
+
+    // ===== helpers =====
+
+    private static String customProperty(String name, String type, String 
value) {
+        return CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"" + name + "\">"
+                + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+                + "</property>"
+                + CUSTOM_FOOTER;
+    }
+
+    private static Metadata parseCustomProperties(String xml) throws Exception 
{
+        SAXBasedMetadataExtractor.CustomPropertiesHandler handler =
+                new SAXBasedMetadataExtractor.CustomPropertiesHandler();
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+        factory.setNamespaceAware(true);
+        SAXParser parser = factory.newSAXParser();
+        parser.parse(new InputSource(new ByteArrayInputStream(
+                xml.getBytes(StandardCharsets.UTF_8))), handler);
+        Metadata metadata = new Metadata();
+        handler.applyTo(metadata);
+        return metadata;
+    }
+}
diff --git 
a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java
 
b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java
new file mode 100644
index 0000000000..ea11561113
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.core.testutil;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+
+/**
+ * Base class for plugin {@code ConfigExamplesTest} classes.
+ * <p>
+ * Reads JSON examples from {@code src/test/resources/config-examples/} and
+ * verifies that each one loads via {@link TikaLoader} <strong>and</strong> 
that
+ * the {@code fetchers}/{@code emitters}/{@code pipes-iterator} sections have
+ * the shape the pipes loader actually consumes
+ * ({@code {"id": {"type-name": {...config...}}}}).
+ * <p>
+ * Without the shape check, an example using an unsupported array form like
+ * <pre>
+ *     "fetchers": [ {"file-system-fetcher": {"id": "fsf", ...}} ]
+ * </pre>
+ * loads successfully but registers <em>zero</em> components, which surfaces 
only
+ * at runtime as {@code "Can't find fetcher for id=fsf. Available: []"}.
+ */
+public abstract class AbstractConfigExamplesTest {
+
+    protected static final String EXAMPLES_DIR = "/config-examples/";
+    protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    protected Path tempDir;
+
+    protected String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + 
resourceName)) {
+            assertNotNull(is, "Resource not found: " + EXAMPLES_DIR + 
resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    /**
+     * Writes the example to a temp file, loads it through {@link TikaLoader},
+     * and validates the shape of every pipes-component section. Returns the
+     * parsed JSON for any further plugin-specific assertions.
+     */
+    protected JsonNode loadAndValidate(String resourceName) throws Exception {
+        String json = readExample(resourceName);
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        TikaLoader loader = TikaLoader.load(configFile);
+        assertNotNull(loader, "TikaLoader should not be null for: " + 
resourceName);
+
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        assertIdKeyedSection(root, "fetchers", resourceName);
+        assertIdKeyedSection(root, "emitters", resourceName);
+        assertSingletonSection(root, "pipes-iterator", resourceName);
+        assertSingletonSection(root, "pipes-reporters", resourceName);
+        return root;
+    }
+
+    /**
+     * Sections like {@code fetchers} and {@code emitters} are
+     * {@code {"id": {"type-name": {...}}}}. Catches array-form drift.
+     */
+    private void assertIdKeyedSection(JsonNode root, String section, String 
fixture) {
+        JsonNode node = root.get(section);
+        if (node == null || node.isNull()) {
+            return;
+        }
+        assertTrue(node.isObject(),
+                "[" + fixture + "] '" + section + "' must be a JSON object 
keyed by "
+                        + "instance ID, e.g. {\"my-id\": {\"type-name\": 
{...}}}; got "
+                        + node.getNodeType());
+        Iterator<Map.Entry<String, JsonNode>> ids = node.fields();
+        while (ids.hasNext()) {
+            Map.Entry<String, JsonNode> idEntry = ids.next();
+            JsonNode typed = idEntry.getValue();
+            String path = section + "." + idEntry.getKey();
+            assertTrue(typed.isObject(),
+                    "[" + fixture + "] '" + path + "' must be a JSON object 
wrapping "
+                            + "exactly one type entry");
+            assertEquals(1, typed.size(),
+                    "[" + fixture + "] '" + path + "' must contain exactly one 
type entry; "
+                            + "got " + typed.size() + " fields");
+        }
+    }
+
+    /**
+     * {@code pipes-iterator} is a single component without an ID layer: just
+     * {@code {"type-name": {...}}}.
+     */
+    private void assertSingletonSection(JsonNode root, String section, String 
fixture) {
+        JsonNode node = root.get(section);
+        if (node == null || node.isNull()) {
+            return;
+        }
+        assertTrue(node.isObject(),
+                "[" + fixture + "] '" + section + "' must be a JSON object 
containing "
+                        + "exactly one type entry; got " + node.getNodeType());
+        assertEquals(1, node.size(),
+                "[" + fixture + "] '" + section + "' must contain exactly one 
type entry; "
+                        + "got " + node.size() + " fields");
+    }
+
+    /**
+     * Extract the inner per-type config object: {@code 
root[section][id][typeName]}.
+     * If {@code id} is null, the section is treated as the singleton form 
(e.g.,
+     * {@code pipes-iterator}).
+     */
+    protected JsonNode innerComponent(String json, String section, String id, 
String typeName)
+            throws Exception {
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        JsonNode sectionNode = root.get(section);
+        assertNotNull(sectionNode, "Missing section: " + section);
+        JsonNode idNode = id == null ? sectionNode : sectionNode.get(id);
+        assertNotNull(idNode, "Missing id: " + id);
+        JsonNode typed = idNode.get(typeName);
+        assertNotNull(typed, "Missing type: " + typeName);
+        return typed;
+    }
+}

Reply via email to