Copilot commented on code in PR #2838:
URL: https://github.com/apache/tika/pull/2838#discussion_r3308541214


##########
tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.core.testutil;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+
+/**
+ * Base class for plugin {@code ConfigExamplesTest} classes.
+ * <p>
+ * Reads JSON examples from {@code src/test/resources/config-examples/} and
+ * verifies that each one loads via {@link TikaLoader} <strong>and</strong> 
that
+ * the {@code fetchers}/{@code emitters}/{@code pipes-iterator} sections have
+ * the shape the pipes loader actually consumes
+ * ({@code {"id": {"type-name": {...config...}}}}).
+ * <p>
+ * Without the shape check, an example using an unsupported array form like
+ * <pre>
+ *     "fetchers": [ {"file-system-fetcher": {"id": "fsf", ...}} ]
+ * </pre>
+ * loads successfully but registers <em>zero</em> components, which surfaces 
only
+ * at runtime as {@code "Can't find fetcher for id=fsf. Available: []"}.
+ */
+public abstract class AbstractConfigExamplesTest {
+

Review Comment:
   This new test utility class is not referenced anywhere in the repository (no 
subclasses found via search). It is also unrelated to this PR's stated purpose 
("fix potential sax dos"). Consider removing it from this PR and submitting it 
separately together with the plugin `ConfigExamplesTest` classes that will use 
it, so the SAX DoS fix can be reviewed and merged on its own.
   



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java:
##########
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties 
xmlns=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        // Pre-fill to one short of the cap; next 3 chars should partially 
land.
+        char[] padding = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 
0, 3);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+        // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+        // attacker's 1M-digit DoS payload. With the cap in place this should
+        // complete in milliseconds and the property should NOT be set.
+        String hugeDigits = "9".repeat(1000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
hugeDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"),
+                "oversized decimal must be rejected, not parsed");
+        assertTrue(elapsedMs < 2_000,
+                "parse must complete quickly; took " + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadCompletesQuickly() throws 
Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+        // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+        // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+        // decimal-length check then rejects it.
+        String attackDigits = "9".repeat(1_000_000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"));
+        assertTrue(elapsedMs < 2_000,
+                "1M-digit attack payload must not trigger O(n²) BigDecimal; 
took "
+                        + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedStringIsTruncatedNotRejected() throws Exception {
+        // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+        // accumulation would still be a memory pressure vector. The buffer
+        // cap stops accumulation at 64 KB; the truncated value still flows.
+        String giantString = "a".repeat(200_000);
+        Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", 
giantString));
+        String got = m.get("custom:bigstr");
+        assertNotNull(got, "string-typed property survives truncation");
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
got.length(),
+                "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+    }
+
+    // ===== helpers =====
+
+    private static String customProperty(String name, String type, String 
value) {
+        return CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"" + name + "\">"
+                + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+                + "</property>"
+                + CUSTOM_FOOTER;
+    }
+
+    private static Metadata parseCustomProperties(String xml) throws Exception 
{
+        SAXBasedMetadataExtractor.CustomPropertiesHandler handler =
+                new SAXBasedMetadataExtractor.CustomPropertiesHandler();
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+        factory.setNamespaceAware(true);
+        SAXParser parser = factory.newSAXParser();
+        parser.parse(new InputSource(new ByteArrayInputStream(
+                xml.getBytes(StandardCharsets.UTF_8))), handler);

Review Comment:
   The test helper uses a plain JAXP `SAXParserFactory` instead of the hardened 
`XMLReaderUtils.parseSAX(...)` that the production code actually uses. This 
means tests do not exercise the same parsing path (e.g., 
`OfflineContentHandler` wrapping, entity expansion limits, secure-processing 
features) and may diverge from runtime behavior. Consider parsing via 
`XMLReaderUtils.parseSAX(...)` so the cap defenses are validated under the 
production parser configuration.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java:
##########
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties 
xmlns=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        // Pre-fill to one short of the cap; next 3 chars should partially 
land.
+        char[] padding = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 
0, 3);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+        // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+        // attacker's 1M-digit DoS payload. With the cap in place this should
+        // complete in milliseconds and the property should NOT be set.
+        String hugeDigits = "9".repeat(1000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
hugeDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"),
+                "oversized decimal must be rejected, not parsed");
+        assertTrue(elapsedMs < 2_000,
+                "parse must complete quickly; took " + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadCompletesQuickly() throws 
Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+        // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+        // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+        // decimal-length check then rejects it.
+        String attackDigits = "9".repeat(1_000_000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"));
+        assertTrue(elapsedMs < 2_000,
+                "1M-digit attack payload must not trigger O(n²) BigDecimal; 
took "
+                        + elapsedMs + "ms");

Review Comment:
   The timing assertion `elapsedMs < 2_000` is a wall-clock assertion that can 
flake on slow/loaded CI hardware, especially because the test still accumulates 
the full 1,000,000-char input through SAX `characters()` calls and the 64 KB 
`appendCapped` buffer. The functional `assertNull(m.get("custom:evil"))` check 
is sufficient to validate the cap; the time bound risks intermittent failures 
without providing a strong additional guarantee. Consider dropping the timing 
assertion or raising the bound substantially.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to