This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 394c737a47 3x-ooxml-bigdecimal-dos (#2840)
394c737a47 is described below

commit 394c737a470638993dc036bdff759f9faa625384
Author: Tim Allison <[email protected]>
AuthorDate: Wed May 27 10:10:01 2026 -0400

    3x-ooxml-bigdecimal-dos (#2840)
    
    bigdecimal issue identified by @tonghuaroot
---
 .../parser/microsoft/ooxml/MetadataExtractor.java  | 276 +++++++++++++++------
 .../microsoft/ooxml/MetadataExtractorTest.java     | 205 +++++++++++++++
 2 files changed, 409 insertions(+), 72 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 97efe3e186..0a4426cc5f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,17 +16,23 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.InputStream;
 import java.math.BigDecimal;
 import java.util.Date;
 import java.util.Optional;
 
 import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
-import 
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import 
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.DublinCore;
@@ -37,10 +43,12 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
 import 
org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * OOXML metadata extractor.
@@ -51,6 +59,28 @@ import 
org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
  */
 public class MetadataExtractor {
 
+    private static final String CUSTOM_PROPERTIES_REL =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
+
+    /**
+     * Hard cap on the accumulated text-content of a single property element
+     * inside docProps/custom.xml. Real OOXML property values are at most a few
+     * hundred bytes; anything beyond this is either corruption or an attacker
+     * trying to drive memory or CPU pressure (cf. the {@code <vt:decimal>}
+     * BigDecimal DoS where a 1M-digit literal compresses ~1000:1 in deflate).
+     * 64 KB leaves headroom for any legitimate value while bounding the
+     * slow-path inputs decisively.
+     */
+    static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+    /**
+     * Hard cap on the {@code <vt:decimal>} text length passed to
+     * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+     * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+     * decimal values fit in well under 50 digits; 256 is generous.
+     */
+    static final int MAX_DECIMAL_LENGTH = 256;
+
     private final POIXMLTextExtractor extractor;
 
     public MetadataExtractor(POIXMLTextExtractor extractor) {
@@ -65,7 +95,13 @@ public class MetadataExtractor {
                         extractor instanceof XPSTextExtractor) && 
extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
-            extractMetadata(extractor.getCustomProperties(), metadata);
+            // Custom properties are read via SAX directly from the OPC part
+            // rather than through POI/XMLBeans. The XMLBeans path materializes
+            // an attacker-controlled <vt:decimal> through BigDecimal(String),
+            // which is O(n²) on JDK 17 -- a 3 KB crafted carrier with a
+            // 1,000,000-digit literal burns ~25 s of CPU before this method
+            // even returns. See ooxml-bigdecimal-dos.
+            extractCustomPropertiesViaSAX(extractor.getPackage(), metadata);
         }
     }
 
@@ -157,85 +193,181 @@ public class MetadataExtractor {
         }
     }
 
-    private void extractMetadata(POIXMLProperties.CustomProperties properties, 
Metadata metadata) {
-        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties 
props =
-                properties.getUnderlyingProperties();
-        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-            CTProperty property = props.getPropertyArray(i);
-            String val = null;
-            Date date = null;
-
-            if (property.isSetLpwstr()) {
-                val = property.getLpwstr();
-            } else if (property.isSetLpstr()) {
-                val = property.getLpstr();
-            } else if (property.isSetDate()) {
-                date = property.getDate().getTime();
-            } else if (property.isSetFiletime()) {
-                date = property.getFiletime().getTime();
-            } else if (property.isSetBool()) {
-                val = Boolean.toString(property.getBool());
+    /**
+     * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+     * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+     * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+     * during XML deserialization, which is O(n²) in the digit count on
+     * JDK 17. By reading the part ourselves we can cap both the buffered
+     * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+     * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+     * runs.
+     */
+    private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata 
metadata) {
+        if (opcPackage == null) {
+            return;
+        }
+        try {
+            PackagePart custPart = getRelatedPart(opcPackage, 
CUSTOM_PROPERTIES_REL);
+            if (custPart == null) {
+                return;
+            }
+            CustomPropertiesHandler handler = new CustomPropertiesHandler();
+            try (InputStream is = custPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, handler, new ParseContext());
             }
+            handler.applyTo(metadata);
+        } catch (Exception e) {
+            //swallow
+        }
+    }
 
-            // Integers
-            else if (property.isSetI1()) {
-                val = Integer.toString(property.getI1());
-            } else if (property.isSetI2()) {
-                val = Integer.toString(property.getI2());
-            } else if (property.isSetI4()) {
-                val = Integer.toString(property.getI4());
-            } else if (property.isSetI8()) {
-                val = Long.toString(property.getI8());
-            } else if (property.isSetInt()) {
-                val = Integer.toString(property.getInt());
+    private static PackagePart getRelatedPart(OPCPackage opcPackage, String 
relationshipType) {
+        try {
+            PackageRelationshipCollection rels =
+                    opcPackage.getRelationshipsByType(relationshipType);
+            if (rels == null || rels.size() == 0) {
+                return null;
             }
+            PackageRelationship rel = rels.getRelationship(0);
+            if (rel == null) {
+                return null;
+            }
+            return opcPackage.getPart(rel);
+        } catch (Exception e) {
+            return null;
+        }
+    }
 
-            // Unsigned Integers
-            else if (property.isSetUi1()) {
-                val = Integer.toString(property.getUi1());
-            } else if (property.isSetUi2()) {
-                val = Integer.toString(property.getUi2());
-            } else if (property.isSetUi4()) {
-                val = Long.toString(property.getUi4());
-            } else if (property.isSetUi8()) {
-                val = property.getUi8().toString();
-            } else if (property.isSetUint()) {
-                val = Long.toString(property.getUint());
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream 
parsing.
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
+    /**
+     * SAX handler for {@code docProps/custom.xml} (custom properties).
+     * Matches the schema defined by Microsoft's
+     * {@code 
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+     * namespace, with value types coming from the {@code vt:} namespace.
+     */
+    static class CustomPropertiesHandler extends DefaultHandler {
+
+        private static final String VT_NS =
+                "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";
+
+        private final Metadata customMetadata = new Metadata();
+        private String currentPropertyName;
+        private String currentValueType;
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) {
+            if ("property".equals(localName)) {
+                currentPropertyName = atts.getValue("name");
+                currentValueType = null;
+            } else if (VT_NS.equals(uri) && currentPropertyName != null
+                    && currentValueType == null) {
+                // Only the direct vt: child of <property> is captured.
+                // Containers like <vt:vector>/<vt:array> latch 
currentValueType
+                // here and their scalar children are then ignored, matching 
the
+                // prior POI/XMLBeans behavior which skipped vectors/arrays.
+                currentValueType = localName;
+                textBuffer.setLength(0);
             }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            appendCapped(textBuffer, ch, start, length);
+        }
 
-            // Reals
-            else if (property.isSetR4()) {
-                val = Float.toString(property.getR4());
-            } else if (property.isSetR8()) {
-                val = Double.toString(property.getR8());
-            } else if (property.isSetDecimal()) {
-                BigDecimal d = property.getDecimal();
-                if (d == null) {
-                    val = null;
-                } else {
-                    val = d.toPlainString();
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (VT_NS.equals(uri) && currentValueType != null &&
+                    localName.equals(currentValueType) && currentPropertyName 
!= null) {
+                String raw = textBuffer.toString();
+                String trimmed = raw.trim();
+                String propName = "custom:" + currentPropertyName;
+                switch (currentValueType) {
+                    case "lpwstr":
+                    case "lpstr":
+                    case "bstr":
+                        // String values are user-controlled metadata content;
+                        // preserve leading/trailing whitespace as the prior
+                        // POI getLpwstr()/getLpstr() path did.
+                        customMetadata.set(propName, raw);
+                        break;
+                    case "filetime":
+                    case "date":
+                        Property tikaProp = Property.externalDate(propName);
+                        customMetadata.set(tikaProp, trimmed);
+                        break;
+                    case "bool":
+                        // xs:boolean lexical space allows "1"/"0" alongside
+                        // "true"/"false"; the prior POI path emitted
+                        // Boolean.toString(...). Preserve that normalization.
+                        if ("1".equals(trimmed) || 
"true".equalsIgnoreCase(trimmed)) {
+                            customMetadata.set(propName, "true");
+                        } else if ("0".equals(trimmed) || 
"false".equalsIgnoreCase(trimmed)) {
+                            customMetadata.set(propName, "false");
+                        }
+                        break;
+                    case "i1":
+                    case "i2":
+                    case "i4":
+                    case "int":
+                    case "ui1":
+                    case "ui2":
+                        customMetadata.set(propName, trimmed);
+                        break;
+                    case "i8":
+                    case "ui4":
+                    case "ui8":
+                    case "uint":
+                        customMetadata.set(propName, trimmed);
+                        break;
+                    case "r4":
+                    case "r8":
+                        customMetadata.set(propName, trimmed);
+                        break;
+                    case "decimal":
+                        // BigDecimal(String) is O(n²) on JDK 17; cap the input
+                        // length to keep an attacker-controlled <vt:decimal>
+                        // from burning CPU. Real values are < 50 chars; 256 is
+                        // generous. See ooxml-bigdecimal-dos.
+                        if (trimmed.length() > MAX_DECIMAL_LENGTH) {
+                            break;
+                        }
+                        try {
+                            BigDecimal d = new BigDecimal(trimmed);
+                            customMetadata.set(propName, d.toPlainString());
+                        } catch (NumberFormatException e) {
+                            //swallow
+                        }
+                        break;
+                    default:
+                        break;
                 }
-            } else if (property.isSetArray()) {
-                // TODO Fetch the array values and output
-            } else if (property.isSetVector()) {
-                // TODO Fetch the vector values and output
-            } else if (property.isSetBlob() || property.isSetOblob()) {
-                // TODO Decode, if possible
-            } else if (property.isSetStream() || property.isSetOstream() ||
-                    property.isSetVstream()) {
-                // TODO Decode, if possible
-            } else if (property.isSetStorage() || property.isSetOstorage()) {
-                // TODO Decode, if possible
-            } else {
-                // This type isn't currently supported yet, skip the property
+                currentValueType = null;
+            } else if ("property".equals(localName)) {
+                currentPropertyName = null;
+                currentValueType = null;
             }
+        }
 
-            String propName = "custom:" + property.getName();
-            if (date != null) {
-                Property tikaProp = Property.externalDate(propName);
-                metadata.set(tikaProp, date);
-            } else if (val != null) {
-                metadata.set(propName, val);
+        void applyTo(Metadata metadata) {
+            for (String name : customMetadata.names()) {
+                for (String value : customMetadata.getValues(name)) {
+                    metadata.add(name, value);
+                }
             }
         }
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
new file mode 100644
index 0000000000..d941bcfc7b
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractorTest.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link MetadataExtractor}'s SAX-based
+ * custom-properties path (backport of the 4.x SAXBasedMetadataExtractor fix).
+ * <p>
+ * A 3 KB OOXML carrier whose {@code <vt:decimal>} contains a 1,000,000-digit
+ * numeric literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)) when reached through POI/XMLBeans.
+ * 3.x's MetadataExtractor now reads {@code docProps/custom.xml} via SAX
+ * directly, bypassing XMLBeans, and rejects decimal literals longer than
+ * {@link MetadataExtractor#MAX_DECIMAL_LENGTH} before constructing BigDecimal.
+ */
+public class MetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties xmlns=\"http://schemas.openxmlformats.org/officeDocument"
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument"
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new char[MetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 
10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        MetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        MetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        char[] padding = new char[MetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 
1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        MetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 0, 3);
+        assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+        // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+        // attacker's 1M-digit DoS payload. With the cap in place this should
+        // complete in milliseconds and the property should NOT be set.
+        String hugeDigits = "9".repeat(1000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
hugeDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"),
+                "oversized decimal must be rejected, not parsed");
+        assertTrue(elapsedMs < 2_000,
+                "parse must complete quickly; took " + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadCompletesQuickly() throws 
Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+        // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+        // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+        // decimal-length check then rejects it.
+        String attackDigits = "9".repeat(1_000_000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"));
+        assertTrue(elapsedMs < 2_000,
+                "1M-digit attack payload must not trigger O(n²) BigDecimal; 
took "
+                        + elapsedMs + "ms");
+    }
+
+    @Test
+    public void stringValuesPreserveLeadingAndTrailingWhitespace() throws 
Exception {
+        // The prior POI/XMLBeans path returned the raw element text for
+        // lpwstr/lpstr/bstr; the SAX path must not trim string content.
+        Metadata m = parseCustomProperties(customProperty("note", "lpwstr", "  
hello  "));
+        assertEquals("  hello  ", m.get("custom:note"));
+    }
+
+    @Test
+    public void boolLexicalOneIsNormalizedToTrue() throws Exception {
+        // xs:boolean allows "1"/"0"; previous code emitted 
Boolean.toString(...).
+        Metadata m = parseCustomProperties(customProperty("flag", "bool", 
"1"));
+        assertEquals("true", m.get("custom:flag"));
+    }
+
+    @Test
+    public void boolLexicalZeroIsNormalizedToFalse() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("flag", "bool", 
"0"));
+        assertEquals("false", m.get("custom:flag"));
+    }
+
+    @Test
+    public void boolLexicalTrueAndFalsePassThrough() throws Exception {
+        assertEquals("true",
+                parseCustomProperties(customProperty("a", "bool", 
"true")).get("custom:a"));
+        assertEquals("false",
+                parseCustomProperties(customProperty("b", "bool", 
"false")).get("custom:b"));
+    }
+
+    @Test
+    public void vectorContainingScalarIsNotEmittedAsScalar() throws Exception {
+        // Old POI/XMLBeans path explicitly skipped vector/array. The SAX path
+        // must not leak a nested <vt:lpstr> inside <vt:vector> as a scalar.
+        String xml = CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"items\">"
+                + "<vt:vector size=\"2\" baseType=\"lpstr\">"
+                + "<vt:lpstr>foo</vt:lpstr>"
+                + "<vt:lpstr>bar</vt:lpstr>"
+                + "</vt:vector>"
+                + "</property>"
+                + CUSTOM_FOOTER;
+        Metadata m = parseCustomProperties(xml);
+        assertNull(m.get("custom:items"),
+                "vector contents must not be emitted as a scalar custom 
property");
+    }
+
+    @Test
+    public void oversizedStringIsTruncatedNotRejected() throws Exception {
+        // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+        // accumulation would still be a memory pressure vector. The buffer
+        // cap stops accumulation at 64 KB; the truncated value still flows.
+        String giantString = "a".repeat(200_000);
+        Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", 
giantString));
+        String got = m.get("custom:bigstr");
+        assertNotNull(got, "string-typed property survives truncation");
+        assertEquals(MetadataExtractor.MAX_TEXT_BUFFER_LENGTH, got.length(),
+                "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+    }
+
+    // ===== helpers =====
+
+    private static String customProperty(String name, String type, String 
value) {
+        return CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"" + name + "\">"
+                + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+                + "</property>"
+                + CUSTOM_FOOTER;
+    }
+
+    private static Metadata parseCustomProperties(String xml) throws Exception 
{
+        MetadataExtractor.CustomPropertiesHandler handler =
+                new MetadataExtractor.CustomPropertiesHandler();
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+        factory.setNamespaceAware(true);
+        SAXParser parser = factory.newSAXParser();
+        parser.parse(new InputSource(new ByteArrayInputStream(
+                xml.getBytes(StandardCharsets.UTF_8))), handler);
+        Metadata metadata = new Metadata();
+        handler.applyTo(metadata);
+        return metadata;
+    }
+}

Reply via email to