This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4650-refactor-zip-parser
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d2a22d5869a0b7bbee94ebbf0bcf15398b53701d
Author: tallison <[email protected]>
AuthorDate: Thu Feb 5 08:34:43 2026 -0500

    TIKA-4650 -- add benchmark
---
 .../org/apache/tika/parser/pkg/ZipBenchmark.java   | 145 +++++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
new file mode 100644
index 0000000000..26be54ebda
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+
+/**
+ * Manually-run timing harness: generates ZIP archives of three sizes,
+ * parses each repeatedly with {@link AutoDetectParser} and prints the
+ * average wall-clock time per parse to standard out.
+ */
+public class ZipBenchmark {
+
+    // true: parse through RecursiveParserWrapper; false: DefaultHandler
+    private static final boolean USE_RECURSIVE_PARSER_WRAPPER = true;
+
+    /**
+     * Writes a small, a medium and a large archive into {@code tempDir},
+     * prints their on-disk sizes, then benchmarks them in that order.
+     *
+     * @param tempDir JUnit-provided scratch directory for the archives
+     * @throws Exception if an archive cannot be written or a parse fails
+     */
+    @Test
+    public void benchmarkAutoDetectParser(@TempDir Path tempDir)
+            throws Exception {
+        // Manual switch: change the first argument to false to skip.
+        assumeTrue(true, "Set to true to run");
+
+        final int smallRuns = 40;
+        final int smallWarmups = 6;
+        final long bytesPerMb = 1024 * 1024;
+
+        Path small = tempDir.resolve("small.zip");
+        createBenchmarkZip(small, 10, 1024);
+        long smallKb = Files.size(small) / 1024;
+        System.out.println("Small: " + smallKb + " KB");
+
+        Path medium = tempDir.resolve("medium.zip");
+        createBenchmarkZip(medium, 1000, 100 * 1024);
+        long mediumMb = Files.size(medium) / bytesPerMb;
+        System.out.println("Medium: " + mediumMb + " MB");
+
+        Path large = tempDir.resolve("large.zip");
+        createBenchmarkZip(large, 5000, 500 * 1024);
+        long largeMb = Files.size(large) / bytesPerMb;
+        System.out.println("Large: " + largeMb + " MB");
+
+        String mode;
+        if (USE_RECURSIVE_PARSER_WRAPPER) {
+            mode = "RecursiveParserWrapper";
+        } else {
+            mode = "DefaultHandler";
+        }
+        System.out.println("\n=== ZIP Benchmark ===");
+        System.out.println("Mode: " + mode);
+        System.out.println();
+
+        System.out.println("Small ZIP (10 entries, 10KB):");
+        runBenchmark(small, 10, smallRuns, smallWarmups);
+
+        System.out.println("\nMedium ZIP (1000 entries, ~100MB):");
+        runBenchmark(medium, 1000, 20, 4);
+
+        System.out.println("\nLarge ZIP (5000 entries, ~2.5GB):");
+        runBenchmark(large, 5000, 10, 2);
+    }
+
+    /**
+     * Writes an uncompressed (STORED) archive whose entries all carry the
+     * same pseudo-random payload; entries are named {@code entry<i>.txt}.
+     *
+     * @param zipPath    file to create
+     * @param numEntries number of entries to write
+     * @param entrySize  payload size of every entry, in bytes
+     * @throws Exception if the archive cannot be written
+     */
+    private void createBenchmarkZip(Path zipPath, int numEntries,
+                                    int entrySize) throws Exception {
+        try (java.util.zip.ZipOutputStream archive =
+                     new java.util.zip.ZipOutputStream(
+                             Files.newOutputStream(zipPath))) {
+            archive.setMethod(java.util.zip.ZipOutputStream.STORED);
+
+            // Fixed seed, so the payload bytes are the same on every run.
+            byte[] payload = new byte[entrySize];
+            new java.util.Random(42).nextBytes(payload);
+
+            // One payload for all entries, so one CRC serves them all.
+            java.util.zip.CRC32 checksum = new java.util.zip.CRC32();
+            checksum.update(payload);
+            long payloadCrc = checksum.getValue();
+
+            int index = 0;
+            while (index < numEntries) {
+                String entryName = "entry" + index + ".txt";
+                java.util.zip.ZipEntry stored =
+                        new java.util.zip.ZipEntry(entryName);
+                // Size and CRC are supplied before putNextEntry, as the
+                // STORED method requires.
+                stored.setMethod(java.util.zip.ZipEntry.STORED);
+                stored.setSize(payload.length);
+                stored.setCompressedSize(payload.length);
+                stored.setCrc(payloadCrc);
+                archive.putNextEntry(stored);
+                archive.write(payload);
+                archive.closeEntry();
+                index++;
+            }
+        }
+    }
+
+    /**
+     * Parses {@code zipPath} {@code warmup} times untimed, then
+     * {@code iterations} times timed, and prints the mean time per parse.
+     *
+     * @param zipPath    archive to parse
+     * @param numEntries entry count, used only in the printed summary
+     * @param iterations number of timed parses
+     * @param warmup     number of untimed parses run first
+     * @throws Exception if reading or parsing the archive fails
+     */
+    private void runBenchmark(Path zipPath, int numEntries, int iterations,
+                              int warmup) throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+
+        long kiloBytes = Files.size(zipPath) / 1024;
+        String sizeLabel;
+        if (kiloBytes < 1024) {
+            sizeLabel = kiloBytes + " KB";
+        } else {
+            sizeLabel = (kiloBytes / 1024) + " MB";
+        }
+        System.out.printf(Locale.ROOT, "  Entries: %d, Size: %s%n",
+                numEntries, sizeLabel);
+
+        // Untimed passes before measurement starts.
+        for (int pass = warmup; pass > 0; pass--) {
+            parseOnce(parser, zipPath, context);
+        }
+
+        // Timed passes.
+        long begin = System.nanoTime();
+        for (int pass = iterations; pass > 0; pass--) {
+            parseOnce(parser, zipPath, context);
+        }
+        long elapsedNanos = System.nanoTime() - begin;
+
+        double nanosPerParse = elapsedNanos / (double) iterations;
+        double avgMs = nanosPerParse / 1_000_000.0;
+        System.out.printf(Locale.ROOT, "  Average: %.3f ms%n", avgMs);
+    }
+
+    /**
+     * Opens {@code zipPath} as a fresh stream and parses it once in the
+     * mode selected by {@link #USE_RECURSIVE_PARSER_WRAPPER}.
+     */
+    private void parseOnce(AutoDetectParser parser, Path zipPath,
+                           ParseContext context) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
+            if (!USE_RECURSIVE_PARSER_WRAPPER) {
+                parser.parse(tis, new DefaultHandler(), new Metadata(),
+                        context);
+                return;
+            }
+            parseWithRecursiveWrapper(parser, tis, context);
+        }
+    }
+
+    /**
+     * Parses {@code tis} through a new {@link RecursiveParserWrapper}
+     * around {@code parser}, collecting output with a TEXT handler.
+     *
+     * @param parser  parser to wrap
+     * @param tis     stream to parse; not closed here
+     * @param context parse context handed to the wrapper
+     * @throws Exception if parsing fails
+     */
+    private void parseWithRecursiveWrapper(AutoDetectParser parser,
+                                           TikaInputStream tis,
+                                           ParseContext context)
+            throws Exception {
+        // NOTE(review): -1 is presumably "no write limit" -- confirm
+        // against BasicContentHandlerFactory.
+        RecursiveParserWrapperHandler handler =
+                new RecursiveParserWrapperHandler(
+                        new BasicContentHandlerFactory(
+                                BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                                -1));
+        new RecursiveParserWrapper(parser)
+                .parse(tis, handler, new Metadata(), context);
+    }
+}

Reply via email to