This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch ooxml-bigdecimal-dos in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5d478c4982744abef2988cd68a42d39d2564373f Author: tallison <[email protected]> AuthorDate: Tue May 26 15:18:58 2026 -0400 fix potential sax dos --- .../microsoft/ooxml/SAXBasedMetadataExtractor.java | 47 ++++++- .../ooxml/SAXBasedMetadataExtractorTest.java | 156 +++++++++++++++++++++ .../core/testutil/AbstractConfigExamplesTest.java | 147 +++++++++++++++++++ 3 files changed, 346 insertions(+), 4 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java index 632eb0c6c8..175db7660d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java @@ -56,6 +56,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { private static final String CUSTOM_PROPERTIES_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties"; + /** + * Hard cap on the accumulated text-content of a single property element. + * Real OOXML property values are at most a few hundred bytes; anything beyond + * this is either corruption or an attacker trying to drive memory or CPU + * pressure (cf. the {@code <vt:decimal>} BigDecimal DoS where a 1M-digit + * literal compresses ~1000:1 in deflate). 64 KB leaves headroom for any + * legitimate value while bounding the slow-path inputs decisively. + */ + static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024; + + /** + * Hard cap on the {@code <vt:decimal>} text length passed to + * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the + * digit count, so even a 64 KB string costs noticeable CPU. Real-world + * decimal values fit in well under 50 digits; 256 is generous. + */ + static final int MAX_DECIMAL_LENGTH = 256; + private final OPCPackage opcPackage; private final ParseContext parseContext; @@ -186,10 +204,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { SummaryExtractor.addMulti(metadata, property, value.get()); } + /** + * Append SAX {@code characters()} content to {@code buf}, but stop accepting + * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are + * silently dropped; truncated values still flow through downstream parsing + * (which will either accept the prefix or reject it as a NumberFormatException). + */ + static void appendCapped(StringBuilder buf, char[] ch, int start, int length) { + if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) { + return; + } + int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length(); + buf.append(ch, start, Math.min(length, remaining)); + } + /** * SAX handler for docProps/app.xml (extended properties). */ - private static class ExtendedPropertiesHandler extends DefaultHandler { + static class ExtendedPropertiesHandler extends DefaultHandler { private String application; private String appVersion; @@ -219,7 +251,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { @Override public void characters(char[] ch, int start, int length) { - textBuffer.append(ch, start, length); + appendCapped(textBuffer, ch, start, length); } @Override @@ -364,7 +396,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { /** * SAX handler for docProps/custom.xml (custom properties). */ - private static class CustomPropertiesHandler extends DefaultHandler { + static class CustomPropertiesHandler extends DefaultHandler { private static final String VT_NS = "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"; @@ -387,7 +419,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { @Override public void characters(char[] ch, int start, int length) { - textBuffer.append(ch, start, length); + appendCapped(textBuffer, ch, start, length); } @Override @@ -429,6 +461,13 @@ class SAXBasedMetadataExtractor extends MetadataExtractor { customMetadata.set(propName, val); break; case "decimal": + // BigDecimal(String) is O(n²) on JDK 17; cap the input + // length to keep an attacker-controlled <vt:decimal> + // from burning CPU. Real values are < 50 chars; 256 is + // generous. See ooxml-bigdecimal-dos. + if (val.length() > MAX_DECIMAL_LENGTH) { + break; + } try { BigDecimal d = new BigDecimal(val); customMetadata.set(propName, d.toPlainString()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java new file mode 100644 index 0000000000..90c2921cce --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.junit.jupiter.api.Test; +import org.xml.sax.InputSource; + +import org.apache.tika.metadata.Metadata; + +/** + * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}. + * <p> + * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML + * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric + * literal would otherwise burn ~25 s of CPU per file in JDK 17's + * {@code BigDecimal(String)} (O(n²)). + */ +public class SAXBasedMetadataExtractorTest { + + private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>" + + "<Properties xmlns=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/custom-properties\"" + + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/docPropsVTypes\">"; + private static final String CUSTOM_FOOTER = "</Properties>"; + + @Test + public void appendCappedTruncatesAtLimit() { + StringBuilder buf = new StringBuilder(); + char[] giant = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000]; + java.util.Arrays.fill(giant, '9'); + + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "buffer must be capped at MAX_TEXT_BUFFER_LENGTH"); + + // Further appends after the cap are silent no-ops. + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "appends past the cap must be silently dropped"); + } + + @Test + public void appendCappedRespectsRemainingRoom() { + StringBuilder buf = new StringBuilder(); + // Pre-fill to one short of the cap; next 3 chars should partially land. + char[] padding = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1]; + java.util.Arrays.fill(padding, 'x'); + buf.append(padding); + + SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 0, 3); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "remaining room (1 char) must be filled; overflow dropped"); + assertEquals('a', buf.charAt(buf.length() - 1)); + } + + @Test + public void normalDecimalIsExtracted() throws Exception { + Metadata m = parseCustomProperties(customProperty("price", "decimal", "1234.56")); + assertEquals("1234.56", m.get("custom:price")); + } + + @Test + public void oversizedDecimalIsSkippedNotParsed() throws Exception { + // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the + // attacker's 1M-digit DoS payload. With the cap in place this should + // complete in milliseconds and the property should NOT be set. + String hugeDigits = "9".repeat(1000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", hugeDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil"), + "oversized decimal must be rejected, not parsed"); + assertTrue(elapsedMs < 2_000, + "parse must complete quickly; took " + elapsedMs + "ms"); + } + + @Test + public void oversizedDecimalAttackPayloadCompletesQuickly() throws Exception { + // Reporter's actual attack shape: 1,000,000 digits. Without the cap + // this takes ~25 s on JDK 17. With the cap the SAX read still has to + // accumulate the buffer (bounded to 64 KB by appendCapped) and the + // decimal-length check then rejects it. + String attackDigits = "9".repeat(1_000_000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", attackDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil")); + assertTrue(elapsedMs < 2_000, + "1M-digit attack payload must not trigger O(n²) BigDecimal; took " + + elapsedMs + "ms"); + } + + @Test + public void oversizedStringIsTruncatedNotRejected() throws Exception { + // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text + // accumulation would still be a memory pressure vector. The buffer + // cap stops accumulation at 64 KB; the truncated value still flows. + String giantString = "a".repeat(200_000); + Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", giantString)); + String got = m.get("custom:bigstr"); + assertNotNull(got, "string-typed property survives truncation"); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, got.length(), + "string value must be capped at MAX_TEXT_BUFFER_LENGTH"); + } + + // ===== helpers ===== + + private static String customProperty(String name, String type, String value) { + return CUSTOM_HEADER + + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" pid=\"2\"" + + " name=\"" + name + "\">" + + "<vt:" + type + ">" + value + "</vt:" + type + ">" + + "</property>" + + CUSTOM_FOOTER; + } + + private static Metadata parseCustomProperties(String xml) throws Exception { + SAXBasedMetadataExtractor.CustomPropertiesHandler handler = + new SAXBasedMetadataExtractor.CustomPropertiesHandler(); + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + SAXParser parser = factory.newSAXParser(); + parser.parse(new InputSource(new ByteArrayInputStream( + xml.getBytes(StandardCharsets.UTF_8))), handler); + Metadata metadata = new Metadata(); + handler.applyTo(metadata); + return metadata; + } +} diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java new file mode 100644 index 0000000000..ea11561113 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.core.testutil; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Map; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; + +/** + * Base class for plugin {@code ConfigExamplesTest} classes. + * <p> + * Reads JSON examples from {@code src/test/resources/config-examples/} and + * verifies that each one loads via {@link TikaLoader} <strong>and</strong> that + * the {@code fetchers}/{@code emitters}/{@code pipes-iterator} sections have + * the shape the pipes loader actually consumes + * ({@code {"id": {"type-name": {...config...}}}}). + * <p> + * Without the shape check, an example using an unsupported array form like + * <pre> + * "fetchers": [ {"file-system-fetcher": {"id": "fsf", ...}} ] + * </pre> + * loads successfully but registers <em>zero</em> components, which surfaces only + * at runtime as {@code "Can't find fetcher for id=fsf. Available: []"}. + */ +public abstract class AbstractConfigExamplesTest { + + protected static final String EXAMPLES_DIR = "/config-examples/"; + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + protected Path tempDir; + + protected String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + EXAMPLES_DIR + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + /** + * Writes the example to a temp file, loads it through {@link TikaLoader}, + * and validates the shape of every pipes-component section. Returns the + * parsed JSON for any further plugin-specific assertions. + */ + protected JsonNode loadAndValidate(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + + JsonNode root = OBJECT_MAPPER.readTree(json); + assertIdKeyedSection(root, "fetchers", resourceName); + assertIdKeyedSection(root, "emitters", resourceName); + assertSingletonSection(root, "pipes-iterator", resourceName); + assertSingletonSection(root, "pipes-reporters", resourceName); + return root; + } + + /** + * Sections like {@code fetchers} and {@code emitters} are + * {@code {"id": {"type-name": {...}}}}. Catches array-form drift. + */ + private void assertIdKeyedSection(JsonNode root, String section, String fixture) { + JsonNode node = root.get(section); + if (node == null || node.isNull()) { + return; + } + assertTrue(node.isObject(), + "[" + fixture + "] '" + section + "' must be a JSON object keyed by " + + "instance ID, e.g. {\"my-id\": {\"type-name\": {...}}}; got " + + node.getNodeType()); + Iterator<Map.Entry<String, JsonNode>> ids = node.fields(); + while (ids.hasNext()) { + Map.Entry<String, JsonNode> idEntry = ids.next(); + JsonNode typed = idEntry.getValue(); + String path = section + "." + idEntry.getKey(); + assertTrue(typed.isObject(), + "[" + fixture + "] '" + path + "' must be a JSON object wrapping " + + "exactly one type entry"); + assertEquals(1, typed.size(), + "[" + fixture + "] '" + path + "' must contain exactly one type entry; " + + "got " + typed.size() + " fields"); + } + } + + /** + * {@code pipes-iterator} is a single component without an ID layer: just + * {@code {"type-name": {...}}}. + */ + private void assertSingletonSection(JsonNode root, String section, String fixture) { + JsonNode node = root.get(section); + if (node == null || node.isNull()) { + return; + } + assertTrue(node.isObject(), + "[" + fixture + "] '" + section + "' must be a JSON object containing " + + "exactly one type entry; got " + node.getNodeType()); + assertEquals(1, node.size(), + "[" + fixture + "] '" + section + "' must contain exactly one type entry; " + + "got " + node.size() + " fields"); + } + + /** + * Extract the inner per-type config object: {@code root[section][id][typeName]}. + * If {@code id} is null, the section is treated as the singleton form (e.g., + * {@code pipes-iterator}). + */ + protected JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } +}
