tballison commented on code in PR #2838:
URL: https://github.com/apache/tika/pull/2838#discussion_r3311054930


##########
tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/testutil/AbstractConfigExamplesTest.java:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.core.testutil;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+
+/**
+ * Base class for plugin {@code ConfigExamplesTest} classes.
+ * <p>
+ * Reads JSON examples from {@code src/test/resources/config-examples/} and
+ * verifies that each one loads via {@link TikaLoader} <strong>and</strong> 
that
+ * the {@code fetchers}/{@code emitters}/{@code pipes-iterator} sections have
+ * the shape the pipes loader actually consumes
+ * ({@code {"id": {"type-name": {...config...}}}}).
+ * <p>
+ * Without the shape check, an example using an unsupported array form like
+ * <pre>
+ *     "fetchers": [ {"file-system-fetcher": {"id": "fsf", ...}} ]
+ * </pre>
+ * loads successfully but registers <em>zero</em> components, which surfaces 
only
+ * at runtime as {@code "Can't find fetcher for id=fsf. Available: []"}.
+ */
+public abstract class AbstractConfigExamplesTest {
+

Review Comment:
   y, this is less than ideal and should be pushed to a separate pr. good catch.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java:
##########
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.InputSource;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties 
xmlns=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        // Pre-fill to one short of the cap; next 3 chars should partially 
land.
+        char[] padding = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 
0, 3);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void oversizedDecimalIsSkippedNotParsed() throws Exception {
+        // 1,000 digits is well past MAX_DECIMAL_LENGTH (256) but far below the
+        // attacker's 1M-digit DoS payload. With the cap in place this should
+        // complete in milliseconds and the property should NOT be set.
+        String hugeDigits = "9".repeat(1000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
hugeDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"),
+                "oversized decimal must be rejected, not parsed");
+        assertTrue(elapsedMs < 2_000,
+                "parse must complete quickly; took " + elapsedMs + "ms");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadCompletesQuickly() throws 
Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. Without the cap
+        // this takes ~25 s on JDK 17. With the cap the SAX read still has to
+        // accumulate the buffer (bounded to 64 KB by appendCapped) and the
+        // decimal-length check then rejects it.
+        String attackDigits = "9".repeat(1_000_000);
+        long start = System.nanoTime();
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+        assertNull(m.get("custom:evil"));
+        assertTrue(elapsedMs < 2_000,
+                "1M-digit attack payload must not trigger O(n²) BigDecimal; 
took "
+                        + elapsedMs + "ms");

Review Comment:
   agree. fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to