Copilot commented on code in PR #2838: URL: https://github.com/apache/tika/pull/2838#discussion_r3308541214
########## tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java: ########## @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.core.testutil; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Map; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; + +/** + * Base class for plugin {@code ConfigExamplesTest} classes. + * <p> + * Reads JSON examples from {@code src/test/resources/config-examples/} and + * verifies that each one loads via {@link TikaLoader} <strong>and</strong> that + * the {@code fetchers}/{@code emitters}/{@code pipes-iterator} sections have + * the shape the pipes loader actually consumes + * ({@code {"id": {"type-name": {...config...}}}}). + * <p> + * Without the shape check, an example using an unsupported array form like + * <pre> + * "fetchers": [ {"file-system-fetcher": {"id": "fsf", ...}} ] + * </pre> + * loads successfully but registers <em>zero</em> components, which surfaces only + * at runtime as {@code "Can't find fetcher for id=fsf. Available: []"}. + */ +public abstract class AbstractConfigExamplesTest { + Review Comment: This new test utility class is not referenced anywhere in the repository (no subclasses found via search). It is also unrelated to this PR's stated purpose ("fix potential sax dos"). Consider removing it from this PR and submitting it separately together with the plugin `ConfigExamplesTest` classes that will use it, so the SAX DoS fix can be reviewed and merged on its own. ########## tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.junit.jupiter.api.Test; +import org.xml.sax.InputSource; + +import org.apache.tika.metadata.Metadata; + +/** + * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}. + * <p> + * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML + * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric + * literal would otherwise burn ~25 s of CPU per file in JDK 17's + * {@code BigDecimal(String)} (O(n²)). + */ +public class SAXBasedMetadataExtractorTest { + + private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>" + + "<Properties xmlns=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/custom-properties\"" + + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/docPropsVTypes\">"; + private static final String CUSTOM_FOOTER = "</Properties>"; + + @Test + public void appendCappedTruncatesAtLimit() { + StringBuilder buf = new StringBuilder(); + char[] giant = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000]; + java.util.Arrays.fill(giant, '9'); + + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "buffer must be capped at MAX_TEXT_BUFFER_LENGTH"); + + // Further appends after the cap are silent no-ops. + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "appends past the cap must be silently dropped"); + } + + @Test + public void appendCappedRespectsRemainingRoom() { + StringBuilder buf = new StringBuilder(); + // Pre-fill to one short of the cap; next 3 chars should partially land. + char[] padding = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1]; + java.util.Arrays.fill(padding, 'x'); + buf.append(padding); + + SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 0, 3); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "remaining room (1 char) must be filled; overflow dropped"); + assertEquals('a', buf.charAt(buf.length() - 1)); + } + + @Test + public void normalDecimalIsExtracted() throws Exception { + Metadata m = parseCustomProperties(customProperty("price", "decimal", "1234.56")); + assertEquals("1234.56", m.get("custom:price")); + } + + @Test + public void oversizedDecimalIsSkippedNotParsed() throws Exception { + // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the + // attacker's 1M-digit DoS payload. With the cap in place this should + // complete in milliseconds and the property should NOT be set. + String hugeDigits = "9".repeat(1000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", hugeDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil"), + "oversized decimal must be rejected, not parsed"); + assertTrue(elapsedMs < 2_000, + "parse must complete quickly; took " + elapsedMs + "ms"); + } + + @Test + public void oversizedDecimalAttackPayloadCompletesQuickly() throws Exception { + // Reporter's actual attack shape: 1,000,000 digits. Without the cap + // this takes ~25 s on JDK 17. With the cap the SAX read still has to + // accumulate the buffer (bounded to 64 KB by appendCapped) and the + // decimal-length check then rejects it. + String attackDigits = "9".repeat(1_000_000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", attackDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil")); + assertTrue(elapsedMs < 2_000, + "1M-digit attack payload must not trigger O(n²) BigDecimal; took " + + elapsedMs + "ms"); + } + + @Test + public void oversizedStringIsTruncatedNotRejected() throws Exception { + // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text + // accumulation would still be a memory pressure vector. The buffer + // cap stops accumulation at 64 KB; the truncated value still flows. + String giantString = "a".repeat(200_000); + Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", giantString)); + String got = m.get("custom:bigstr"); + assertNotNull(got, "string-typed property survives truncation"); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, got.length(), + "string value must be capped at MAX_TEXT_BUFFER_LENGTH"); + } + + // ===== helpers ===== + + private static String customProperty(String name, String type, String value) { + return CUSTOM_HEADER + + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" pid=\"2\"" + + " name=\"" + name + "\">" + + "<vt:" + type + ">" + value + "</vt:" + type + ">" + + "</property>" + + CUSTOM_FOOTER; + } + + private static Metadata parseCustomProperties(String xml) throws Exception { + SAXBasedMetadataExtractor.CustomPropertiesHandler handler = + new SAXBasedMetadataExtractor.CustomPropertiesHandler(); + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + SAXParser parser = factory.newSAXParser(); + parser.parse(new InputSource(new ByteArrayInputStream( + xml.getBytes(StandardCharsets.UTF_8))), handler); Review Comment: The test helper uses a plain JAXP `SAXParserFactory` instead of the hardened `XMLReaderUtils.parseSAX(...)` that the production code actually uses. This means tests do not exercise the same parsing path (e.g., `OfflineContentHandler` wrapping, entity expansion limits, secure-processing features) and may diverge from runtime behavior. Consider parsing via `XMLReaderUtils.parseSAX(...)` so the cap defenses are validated under the production parser configuration. ########## tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.junit.jupiter.api.Test; +import org.xml.sax.InputSource; + +import org.apache.tika.metadata.Metadata; + +/** + * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}. + * <p> + * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML + * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric + * literal would otherwise burn ~25 s of CPU per file in JDK 17's + * {@code BigDecimal(String)} (O(n²)). + */ +public class SAXBasedMetadataExtractorTest { + + private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>" + + "<Properties xmlns=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/custom-properties\"" + + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument" + + "/2006/docPropsVTypes\">"; + private static final String CUSTOM_FOOTER = "</Properties>"; + + @Test + public void appendCappedTruncatesAtLimit() { + StringBuilder buf = new StringBuilder(); + char[] giant = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000]; + java.util.Arrays.fill(giant, '9'); + + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "buffer must be capped at MAX_TEXT_BUFFER_LENGTH"); + + // Further appends after the cap are silent no-ops. + SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "appends past the cap must be silently dropped"); + } + + @Test + public void appendCappedRespectsRemainingRoom() { + StringBuilder buf = new StringBuilder(); + // Pre-fill to one short of the cap; next 3 chars should partially land. + char[] padding = new char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1]; + java.util.Arrays.fill(padding, 'x'); + buf.append(padding); + + SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 0, 3); + assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, buf.length(), + "remaining room (1 char) must be filled; overflow dropped"); + assertEquals('a', buf.charAt(buf.length() - 1)); + } + + @Test + public void normalDecimalIsExtracted() throws Exception { + Metadata m = parseCustomProperties(customProperty("price", "decimal", "1234.56")); + assertEquals("1234.56", m.get("custom:price")); + } + + @Test + public void oversizedDecimalIsSkippedNotParsed() throws Exception { + // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the + // attacker's 1M-digit DoS payload. With the cap in place this should + // complete in milliseconds and the property should NOT be set. + String hugeDigits = "9".repeat(1000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", hugeDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil"), + "oversized decimal must be rejected, not parsed"); + assertTrue(elapsedMs < 2_000, + "parse must complete quickly; took " + elapsedMs + "ms"); + } + + @Test + public void oversizedDecimalAttackPayloadCompletesQuickly() throws Exception { + // Reporter's actual attack shape: 1,000,000 digits. Without the cap + // this takes ~25 s on JDK 17. With the cap the SAX read still has to + // accumulate the buffer (bounded to 64 KB by appendCapped) and the + // decimal-length check then rejects it. + String attackDigits = "9".repeat(1_000_000); + long start = System.nanoTime(); + Metadata m = parseCustomProperties(customProperty("evil", "decimal", attackDigits)); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertNull(m.get("custom:evil")); + assertTrue(elapsedMs < 2_000, + "1M-digit attack payload must not trigger O(n²) BigDecimal; took " + + elapsedMs + "ms"); Review Comment: The timing assertion `elapsedMs < 2_000` is a wall-clock assertion that can flake on slow/loaded CI hardware, especially because the test still accumulates the full 1,000,000-char input through SAX `characters()` calls and the 64 KB `appendCapped` buffer. The functional `assertNull(m.get("custom:evil"))` check is sufficient to validate the cap; the time bound risks intermittent failures without providing a strong additional guarantee. Consider dropping the timing assertion or raising the bound substantially. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
