This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new cabd1f2d44 fix potential sax dos (#2838)
cabd1f2d44 is described below
commit cabd1f2d440a80e29f29c0b467ab96caf69aaa30
Author: Tim Allison <[email protected]>
AuthorDate: Wed May 27 10:26:21 2026 -0400
fix potential sax dos (#2838)
BigDecimal dos found and reported by @tonghuaroot.
---
.../microsoft/ooxml/SAXBasedMetadataExtractor.java | 89 +++++++--
.../ooxml/SAXBasedMetadataExtractorTest.java | 216 +++++++++++++++++++++
2 files changed, 292 insertions(+), 13 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
index 632eb0c6c8..df3c6aeb38 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
@@ -56,6 +56,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
private static final String CUSTOM_PROPERTIES_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
+ /**
+ * Hard cap on the accumulated text-content of a single property element.
+ * Real OOXML property values are at most a few hundred bytes; anything
beyond
+ * this is either corruption or an attacker trying to drive memory or CPU
+ * pressure (cf. the {@code <vt:decimal>} BigDecimal DoS where a 1M-digit
+ * literal compresses ~1000:1 in deflate). 64 KB leaves headroom for any
+ * legitimate value while bounding the slow-path inputs decisively.
+ */
+ static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+ /**
+ * Hard cap on the {@code <vt:decimal>} text length passed to
+ * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+ * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+ * decimal values fit in well under 50 digits; 256 is generous.
+ */
+ static final int MAX_DECIMAL_LENGTH = 256;
+
private final OPCPackage opcPackage;
private final ParseContext parseContext;
@@ -186,10 +204,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor
{
SummaryExtractor.addMulti(metadata, property, value.get());
}
+ /**
+ * Append SAX {@code characters()} content to {@code buf}, but stop
accepting
+ * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+ * silently dropped; truncated values still flow through downstream parsing
+ * (which will either accept the prefix or reject it as a
NumberFormatException).
+ */
+ static void appendCapped(StringBuilder buf, char[] ch, int start, int
length) {
+ if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+ return;
+ }
+ int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+ buf.append(ch, start, Math.min(length, remaining));
+ }
+
/**
* SAX handler for docProps/app.xml (extended properties).
*/
- private static class ExtendedPropertiesHandler extends DefaultHandler {
+ static class ExtendedPropertiesHandler extends DefaultHandler {
private String application;
private String appVersion;
@@ -219,7 +251,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
@Override
public void characters(char[] ch, int start, int length) {
- textBuffer.append(ch, start, length);
+ appendCapped(textBuffer, ch, start, length);
}
@Override
@@ -364,7 +396,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
/**
* SAX handler for docProps/custom.xml (custom properties).
*/
- private static class CustomPropertiesHandler extends DefaultHandler {
+ static class CustomPropertiesHandler extends DefaultHandler {
private static final String VT_NS =
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";
@@ -379,7 +411,14 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
if ("property".equals(localName)) {
currentPropertyName = atts.getValue("name");
currentValueType = null;
- } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+ } else if (VT_NS.equals(uri) && currentPropertyName != null
+ && currentValueType == null) {
+ // First vt: child under <property> wins. The == null guard
keeps
+ // <vt:vector>/<vt:array> containers latched as the type so
their
+ // inner children (vt:lpstr, vt:i4, ...) don't get re-emitted
as
+ // a scalar custom property. The container itself falls through
+ // the endElement switch's default branch (no emit), matching
the
+ // prior POI path that explicitly skipped vector/array.
currentValueType = localName;
textBuffer.setLength(0);
}
@@ -387,28 +426,41 @@ class SAXBasedMetadataExtractor extends MetadataExtractor
{
@Override
public void characters(char[] ch, int start, int length) {
- textBuffer.append(ch, start, length);
+ appendCapped(textBuffer, ch, start, length);
}
@Override
public void endElement(String uri, String localName, String qName) {
if (VT_NS.equals(uri) && currentValueType != null &&
localName.equals(currentValueType) && currentPropertyName
!= null) {
- String val = textBuffer.toString().trim();
+ // Legacy POI's typed accessors (getLpwstr/getLpstr/getBstr)
returned
+ // the raw element text, while numeric/date/bool accessors
yielded
+ // already-normalized forms. Mirror that here: keep whitespace
in
+ // strings, work with the trimmed form everywhere else.
+ String raw = textBuffer.toString();
+ String trimmed = raw.trim();
String propName = "custom:" + currentPropertyName;
switch (currentValueType) {
case "lpwstr":
case "lpstr":
case "bstr":
- customMetadata.set(propName, val);
+ customMetadata.set(propName, raw);
break;
case "filetime":
case "date":
Property tikaProp = Property.externalDate(propName);
- customMetadata.set(tikaProp, val);
+ customMetadata.set(tikaProp, trimmed);
break;
case "bool":
- customMetadata.set(propName, val);
+ // xs:boolean lexical space is {true,false,1,0}.
Legacy POI
+ // routed through Boolean.toString(getBool()) so
consumers
+ // doing "true".equals(...) never saw the 1/0 form.
Anything
+ // outside the lexical space is dropped, not stored
verbatim.
+ if ("1".equals(trimmed) || "true".equals(trimmed)) {
+ customMetadata.set(propName, "true");
+ } else if ("0".equals(trimmed) ||
"false".equals(trimmed)) {
+ customMetadata.set(propName, "false");
+ }
break;
case "i1":
case "i2":
@@ -416,21 +468,28 @@ class SAXBasedMetadataExtractor extends MetadataExtractor
{
case "int":
case "ui1":
case "ui2":
- customMetadata.set(propName, val);
+ customMetadata.set(propName, trimmed);
break;
case "i8":
case "ui4":
case "ui8":
case "uint":
- customMetadata.set(propName, val);
+ customMetadata.set(propName, trimmed);
break;
case "r4":
case "r8":
- customMetadata.set(propName, val);
+ customMetadata.set(propName, trimmed);
break;
case "decimal":
+ // BigDecimal(String) is O(n²) on JDK 17; cap the input
+ // length to keep an attacker-controlled <vt:decimal>
+ // from burning CPU. Real values are < 50 chars; 256 is
+ // generous. See ooxml-bigdecimal-dos.
+ if (trimmed.length() > MAX_DECIMAL_LENGTH) {
+ break;
+ }
try {
- BigDecimal d = new BigDecimal(val);
+ BigDecimal d = new BigDecimal(trimmed);
customMetadata.set(propName, d.toPlainString());
} catch (NumberFormatException e) {
//swallow
@@ -442,6 +501,10 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
currentValueType = null;
} else if ("property".equals(localName)) {
currentPropertyName = null;
+ // Defensive: if a malformed custom.xml left a vt: container
open
+ // (e.g. <vt:vector> without a matching close before
</property>),
+ // make sure the next property doesn't inherit it.
+ currentValueType = null;
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
new file mode 100644
index 0000000000..99a89f3ef2
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+ private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+ + "<Properties
xmlns=\"http://schemas.openxmlformats.org/officeDocument"
+ + "/2006/custom-properties\""
+ + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument"
+ + "/2006/docPropsVTypes\">";
+ private static final String CUSTOM_FOOTER = "</Properties>";
+
+ @Test
+ public void appendCappedTruncatesAtLimit() {
+ StringBuilder buf = new StringBuilder();
+ char[] giant = new
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+ java.util.Arrays.fill(giant, '9');
+
+ SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+ assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH,
buf.length(),
+ "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+ // Further appends after the cap are silent no-ops.
+ SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+ assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH,
buf.length(),
+ "appends past the cap must be silently dropped");
+ }
+
+ @Test
+ public void appendCappedRespectsRemainingRoom() {
+ StringBuilder buf = new StringBuilder();
+ // Pre-fill to one short of the cap; next 3 chars should partially
land.
+ char[] padding = new
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+ java.util.Arrays.fill(padding, 'x');
+ buf.append(padding);
+
+ SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'},
0, 3);
+ assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH,
buf.length(),
+ "remaining room (1 char) must be filled; overflow dropped");
+ assertEquals('a', buf.charAt(buf.length() - 1));
+ }
+
+ @Test
+ public void normalDecimalIsExtracted() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("price", "decimal",
"1234.56"));
+ assertEquals("1234.56", m.get("custom:price"));
+ }
+
+ @Test
+ public void decimalAtMaxLengthIsAccepted() throws Exception {
+ // Boundary: exactly MAX_DECIMAL_LENGTH digits must still parse. This
is
+ // the upper edge of the accept-and-parse path.
+ String digits =
"9".repeat(SAXBasedMetadataExtractor.MAX_DECIMAL_LENGTH);
+ Metadata m = parseCustomProperties(customProperty("ok", "decimal",
digits));
+ assertEquals(digits, m.get("custom:ok"),
+ "decimal of exactly MAX_DECIMAL_LENGTH digits must
round-trip");
+ }
+
+ @Test
+ public void decimalOneOverMaxLengthIsRejected() throws Exception {
+ // Boundary: one character past the cap must short-circuit before
+ // BigDecimal(String). Combined with appendCappedTruncatesAtLimit this
+ // mechanically proves the O(n²) parser is never invoked above the cap.
+ String digits =
"9".repeat(SAXBasedMetadataExtractor.MAX_DECIMAL_LENGTH + 1);
+ Metadata m = parseCustomProperties(customProperty("evil", "decimal",
digits));
+ assertNull(m.get("custom:evil"),
+ "decimal one char over MAX_DECIMAL_LENGTH must be rejected,
not parsed");
+ }
+
+ @Test
+ public void oversizedDecimalAttackPayloadIsRejected() throws Exception {
+ // Reporter's actual attack shape: 1,000,000 digits. The SAX read
+ // truncates accumulation at MAX_TEXT_BUFFER_LENGTH (64 KB) via
+ // appendCapped, then the decimal-length check rejects the truncated
+ // value before BigDecimal(String) runs. No wall-clock assertion —
+ // the boundary tests above are the mechanical proof that
+ // BigDecimal is never called on an oversized payload.
+ String attackDigits = "9".repeat(1_000_000);
+ Metadata m = parseCustomProperties(customProperty("evil", "decimal",
attackDigits));
+ assertNull(m.get("custom:evil"),
+ "1M-digit attack payload must be rejected without parsing");
+ }
+
+ @Test
+ public void oversizedStringIsTruncatedNotRejected() throws Exception {
+ // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+ // accumulation would still be a memory pressure vector. The buffer
+ // cap stops accumulation at 64 KB; the truncated value still flows.
+ String giantString = "a".repeat(200_000);
+ Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr",
giantString));
+ String got = m.get("custom:bigstr");
+ assertNotNull(got, "string-typed property survives truncation");
+ assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH,
got.length(),
+ "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+ }
+
+ @Test
+ public void stringValuesPreserveLeadingAndTrailingWhitespace() throws
Exception {
+ // Legacy POI's getLpwstr/getLpstr/getBstr returned the raw element
text;
+ // anything that depends on whitespace inside the value would regress
if
+ // the SAX path silently trimmed.
+ Metadata m = parseCustomProperties(
+ customProperty("padded", "lpwstr", " hello world "));
+ assertEquals(" hello world ", m.get("custom:padded"),
+ "lpwstr must preserve leading/trailing whitespace");
+
+ m = parseCustomProperties(customProperty("padded", "lpstr", " ascii
"));
+ assertEquals(" ascii ", m.get("custom:padded"));
+
+ m = parseCustomProperties(customProperty("padded", "bstr",
"\tindented\n"));
+ assertEquals("\tindented\n", m.get("custom:padded"));
+ }
+
+ @Test
+ public void boolLexicalOneIsNormalizedToTrue() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("flag", "bool",
"1"));
+ assertEquals("true", m.get("custom:flag"),
+ "<vt:bool>1</vt:bool> must normalize to \"true\" (matching
legacy POI)");
+ }
+
+ @Test
+ public void boolLexicalZeroIsNormalizedToFalse() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("flag", "bool",
"0"));
+ assertEquals("false", m.get("custom:flag"),
+ "<vt:bool>0</vt:bool> must normalize to \"false\" (matching
legacy POI)");
+ }
+
+ @Test
+ public void boolLexicalTrueAndFalsePassThrough() throws Exception {
+ Metadata m = parseCustomProperties(customProperty("flag", "bool",
"true"));
+ assertEquals("true", m.get("custom:flag"));
+
+ m = parseCustomProperties(customProperty("flag", "bool", "false"));
+ assertEquals("false", m.get("custom:flag"));
+ }
+
+ @Test
+ public void vectorContainingScalarIsNotEmittedAsScalar() throws Exception {
+ // <vt:vector> with inner <vt:lpstr> children. The container latches as
+ // the value type; inner children must NOT overwrite it, and the
+ // container itself must not be emitted as a scalar. Legacy POI skipped
+ // vector/array entirely.
+ String xml = CUSTOM_HEADER
+ + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\"
pid=\"2\""
+ + " name=\"tags\">"
+ + "<vt:vector size=\"2\" baseType=\"lpstr\">"
+ + "<vt:lpstr>alpha</vt:lpstr>"
+ + "<vt:lpstr>beta</vt:lpstr>"
+ + "</vt:vector>"
+ + "</property>"
+ + CUSTOM_FOOTER;
+ Metadata m = parseCustomProperties(xml);
+ assertNull(m.get("custom:tags"),
+ "vector container must not emit a scalar custom property");
+ }
+
+ // ===== helpers =====
+
+ private static String customProperty(String name, String type, String
value) {
+ return CUSTOM_HEADER
+ + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\"
pid=\"2\""
+ + " name=\"" + name + "\">"
+ + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+ + "</property>"
+ + CUSTOM_FOOTER;
+ }
+
+ private static Metadata parseCustomProperties(String xml) throws Exception
{
+ SAXBasedMetadataExtractor.CustomPropertiesHandler handler =
+ new SAXBasedMetadataExtractor.CustomPropertiesHandler();
+ try (InputStream is = new ByteArrayInputStream(
+ xml.getBytes(StandardCharsets.UTF_8))) {
+ XMLReaderUtils.parseSAX(is, handler, new ParseContext());
+ }
+ Metadata metadata = new Metadata();
+ handler.applyTo(metadata);
+ return metadata;
+ }
+}