This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4650-3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 20726385cc4b7a39a967ff4f067bfb06f3a01c05
Author: tallison <[email protected]>
AuthorDate: Thu Feb 5 08:35:53 2026 -0500

    TIKA-4650 - improvements for 3.x
---
 .../org/apache/tika/parser/pkg/PackageParser.java  |  81 ++++++++++++
 .../org/apache/tika/parser/pkg/ZipBenchmark.java   | 144 +++++++++++++++++++++
 .../detect/zip/DefaultZipContainerDetector.java    |   7 +-
 3 files changed, 231 insertions(+), 1 deletion(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index d7599bf29e..b6aaff4e9c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -32,6 +32,7 @@ import java.io.InputStream;
 import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.Date;
+import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -53,6 +54,7 @@ import 
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import 
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.xml.sax.ContentHandler;
@@ -265,6 +267,16 @@ public class PackageParser extends 
AbstractEncodingDetectorParser {
     private void _parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
                 ParseContext context, TemporaryResources tmp)
             throws TikaException, IOException, SAXException {
+        // Check if detector already opened a ZipFile and stored it in 
openContainer
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = tis.getOpenContainer();
+            if (container instanceof ZipFile) {
+                parseZipFile((ZipFile) container, handler, metadata, context);
+                return;
+            }
+        }
+
         ArchiveInputStream ais = null;
         String encoding = null;
         try {
@@ -352,6 +364,75 @@ public class PackageParser extends 
AbstractEncodingDetectorParser {
         }
     }
 
+    /**
+     * Parses a {@link ZipFile} that is already open (for example one the detector stored
+     * on the stream), so the archive does not have to be opened a second time. Every
+     * non-directory entry is passed on to {@code parseZipEntry}.
+     * An incoming content type that cannot be parsed is left untouched.
+     */
+    private void parseZipFile(ZipFile zipFile, ContentHandler handler, Metadata metadata,
+                              ParseContext context) throws IOException, SAXException, TikaException {
+        // Fall back to plain zip unless the incoming type is a known specialization
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        MediaType declared = (declaredType == null) ? null : MediaType.parse(declaredType);
+        if (declaredType == null
+                || (declared != null && !PACKAGE_SPECIALIZATIONS.contains(declared))) {
+            metadata.set(Metadata.CONTENT_TYPE, ZIP.toString());
+        }
+
+        EmbeddedDocumentExtractor extractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        try {
+            for (Enumeration<ZipArchiveEntry> e = zipFile.getEntries(); e.hasMoreElements(); ) {
+                ZipArchiveEntry entry = e.nextElement();
+                if (entry.isDirectory()) {
+                    continue;
+                }
+                parseZipEntry(zipFile, entry, extractor, metadata, xhtml);
+            }
+        } finally {
+            // Runs on both normal completion and failure so the document is always ended
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Handles one entry of an open {@link ZipFile}: builds its metadata and, if the extractor
+     * accepts it, streams the entry content to the extractor. With entry-name charset detection
+     * enabled, a long-enough raw name is re-decoded using the detected charset.
+     * NOTE(review): parentMetadata is not read anywhere in this method.
+     */
+    private void parseZipEntry(ZipFile zipFile, ZipArchiveEntry entry,
+                               EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
+                               XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        String name = entry.getName();
+        byte[] rawName = detectCharsetsInEntryNames ? entry.getRawName() : null;
+        if (rawName != null && rawName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
+            Charset detected = getEncodingDetector()
+                    .detect(new UnsynchronizedByteArrayInputStream(rawName), new Metadata());
+            if (detected != null) {
+                name = new String(rawName, detected);
+            }
+        }
+
+        Date created = entry.getCreationTime() == null
+                ? null : new Date(entry.getCreationTime().toMillis());
+        Date modified = entry.getLastModifiedTime() == null
+                ? null : new Date(entry.getLastModifiedTime().toMillis());
+        Metadata entryMetadata = handleEntryMetadata(name, created, modified, entry.getSize(), xhtml);
+        if (!extractor.shouldParseEmbedded(entryMetadata)) {
+            return;
+        }
+        try (InputStream entryStream = zipFile.getInputStream(entry)) {
+            extractor.parseEmbedded(entryStream, xhtml, entryMetadata, true);
+        }
+    }
+
     /**
      * Parse the entries of the zip archive
      *
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
new file mode 100644
index 0000000000..44e3e0761e
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+
+/** Manual, opt-in benchmark of {@link AutoDetectParser} over generated STORED zip files. */
+public class ZipBenchmark {
+
+    // Flip to true locally to run. Left off so routine test runs do not build and
+    // repeatedly parse the fixtures below (the largest is 5000 x 500 KB, ~2.5 GB).
+    private static final boolean RUN_BENCHMARK = false;
+
+    // Toggle this to switch between DefaultHandler and RecursiveParserWrapper
+    private static final boolean USE_RECURSIVE_PARSER_WRAPPER = true;
+
+    @Test
+    public void benchmarkAutoDetectParser(@TempDir Path tempDir) throws Exception {
+        assumeTrue(RUN_BENCHMARK, "Set RUN_BENCHMARK to true to run");
+
+        Path smallZip = tempDir.resolve("small.zip");
+        createBenchmarkZip(smallZip, 10, 1024);
+        System.out.println("Small: " + Files.size(smallZip) / 1024 + " KB");
+
+        Path mediumZip = tempDir.resolve("medium.zip");
+        createBenchmarkZip(mediumZip, 1000, 100 * 1024);
+        System.out.println("Medium: " + Files.size(mediumZip) / (1024 * 1024) + " MB");
+
+        Path largeZip = tempDir.resolve("large.zip");
+        createBenchmarkZip(largeZip, 5000, 500 * 1024);
+        System.out.println("Large: " + Files.size(largeZip) / (1024 * 1024) + " MB");
+
+        System.out.println("\n=== ZIP Benchmark ===");
+        System.out.println("Mode: " + (USE_RECURSIVE_PARSER_WRAPPER ? "RecursiveParserWrapper" : "DefaultHandler"));
+        System.out.println();
+
+        System.out.println("Small ZIP (10 entries, 10KB):");
+        runBenchmark(smallZip, 10, 40, 6);
+
+        System.out.println("\nMedium ZIP (1000 entries, ~100MB):");
+        runBenchmark(mediumZip, 1000, 20, 4);
+
+        System.out.println("\nLarge ZIP (5000 entries, ~2.5GB):");
+        runBenchmark(largeZip, 5000, 10, 2);
+    }
+
+    /** Writes numEntries STORED entries that all share one fixed-seed random payload. */
+    private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception {
+        byte[] content = new byte[entrySize];
+        new java.util.Random(42).nextBytes(content);
+        java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+        crc.update(content);
+        long crcValue = crc.getValue();
+        try (java.util.zip.ZipOutputStream zos =
+                     new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) {
+            zos.setMethod(java.util.zip.ZipOutputStream.STORED);
+            for (int i = 0; i < numEntries; i++) {
+                java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt");
+                entry.setMethod(java.util.zip.ZipEntry.STORED);
+                // STORED entries must carry size, compressed size and CRC before being added
+                entry.setSize(content.length);
+                entry.setCompressedSize(content.length);
+                entry.setCrc(crcValue);
+                zos.putNextEntry(entry);
+                zos.write(content);
+                zos.closeEntry();
+            }
+        }
+    }
+
+    /** Runs warmup untimed parses, then prints the mean time of the timed iterations. */
+    private void runBenchmark(Path zipPath, int numEntries, int iterations, int warmup) throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+
+        long sizeKB = Files.size(zipPath) / 1024;
+        String sizeStr = sizeKB >= 1024 ? (sizeKB / 1024) + " MB" : sizeKB + " KB";
+        System.out.printf(Locale.ROOT, "  Entries: %d, Size: %s%n", numEntries, sizeStr);
+
+        // Warmup (not timed)
+        for (int i = 0; i < warmup; i++) {
+            parseOnce(parser, zipPath, context);
+        }
+
+        // Benchmark
+        long start = System.nanoTime();
+        for (int i = 0; i < iterations; i++) {
+            parseOnce(parser, zipPath, context);
+        }
+        long duration = System.nanoTime() - start;
+        double avgMs = duration / (double) iterations / 1_000_000.0;
+        System.out.printf(Locale.ROOT, "  Average: %.3f ms%n", avgMs);
+    }
+
+    /** Opens a fresh TikaInputStream on zipPath and parses it once in the configured mode. */
+    private void parseOnce(AutoDetectParser parser, Path zipPath, ParseContext context) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
+            if (USE_RECURSIVE_PARSER_WRAPPER) {
+                parseWithRecursiveWrapper(parser, tis, context);
+            } else {
+                parser.parse(tis, new DefaultHandler(), new Metadata(), context);
+            }
+        }
+    }
+
+    private void parseWithRecursiveWrapper(AutoDetectParser parser, TikaInputStream tis,
+                                           ParseContext context) throws Exception {
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
+        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory);
+        wrapper.parse(tis, handler, new Metadata(), context);
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 18f8180616..c47c299a58 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -282,7 +282,12 @@ public class DefaultZipContainerDetector implements 
Detector {
         }
         // Fallback: it's still a zip file, we just don't know what kind of one
         if (zip != null) {
-            IOUtils.closeQuietly(zip);
+            // Store ZipFile in openContainer so parser can reuse it
+            if (tis.getOpenContainer() == null) {
+                tis.setOpenContainer(zip);
+            } else {
+                tis.addCloseableResource(zip);
+            }
             return MediaType.APPLICATION_ZIP;
         }
         if (LOG.isDebugEnabled()) {

Reply via email to