This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 858a7da782 TIKA-4650 - improvements for 3.x (#2585)
858a7da782 is described below
commit 858a7da78221e13214c23870c7a68f73291acef6
Author: Tim Allison <[email protected]>
AuthorDate: Thu Feb 5 11:57:14 2026 -0500
TIKA-4650 - improvements for 3.x (#2585)
---
.../org/apache/tika/parser/pkg/PackageParser.java | 83 ++++++++++++
.../org/apache/tika/parser/pkg/ZipBenchmark.java | 144 +++++++++++++++++++++
.../detect/zip/DefaultZipContainerDetector.java | 7 +-
3 files changed, 233 insertions(+), 1 deletion(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index d7599bf29e..44a6752ea9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -32,6 +32,7 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Date;
+import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
@@ -53,6 +54,7 @@ import
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.ContentHandler;
@@ -265,6 +267,18 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
private void _parse(InputStream stream, ContentHandler handler, Metadata
metadata,
ParseContext context, TemporaryResources tmp)
throws TikaException, IOException, SAXException {
+ // Check if detector already opened a ZipFile and stored it in openContainer
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = tis.getOpenContainer();
+ if (container instanceof ZipFile) {
+ // Ensure the ZipFile gets closed when tmp is closed
+ tmp.addResource((ZipFile) container);
+ parseZipFile((ZipFile) container, handler, metadata, context);
+ return;
+ }
+ }
+
ArchiveInputStream ais = null;
String encoding = null;
try {
@@ -352,6 +366,75 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
}
}
+    /**
+     * Walks the entries of a {@link ZipFile} that is already open (the caller obtains it
+     * from the input stream's open container), so the archive is not opened a second time.
+     * <p>
+     * The container's content type is set to plain zip unless the incoming type is one of
+     * {@code PACKAGE_SPECIALIZATIONS}. Directory entries are skipped; every other entry is
+     * passed to {@code parseZipEntry}. {@code endDocument()} is always emitted, even when
+     * an entry fails, and the failure then propagates to the caller.
+     *
+     * @param zipFile  the already-open zip archive to iterate
+     * @param handler  SAX handler wrapped by the XHTML handler used for output
+     * @param metadata metadata of the container document
+     * @param context  parse context used to look up the embedded document extractor
+     */
+    private void parseZipFile(ZipFile zipFile, ContentHandler handler, Metadata metadata,
+                              ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Only overwrite the content type when it is absent, or when it parses to
+        // something that is not a known package specialization.
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        MediaType parsedType = declaredType == null ? null : MediaType.parse(declaredType);
+        boolean reportAsZip = declaredType == null ||
+                (parsedType != null && !PACKAGE_SPECIALIZATIONS.contains(parsedType));
+        if (reportAsZip) {
+            metadata.set(Metadata.CONTENT_TYPE, ZIP.toString());
+        }
+
+        EmbeddedDocumentExtractor embeddedExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        xhtml.startDocument();
+        try {
+            for (Enumeration<ZipArchiveEntry> all = zipFile.getEntries();
+                    all.hasMoreElements();) {
+                ZipArchiveEntry current = all.nextElement();
+                if (current.isDirectory()) {
+                    continue;
+                }
+                parseZipEntry(zipFile, current, embeddedExtractor, metadata, xhtml);
+            }
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Handles one non-directory entry of an open {@link ZipFile}: builds the entry's
+     * metadata and, if the extractor accepts it, streams the entry's content to the
+     * embedded document extractor.
+     * <p>
+     * When {@code detectCharsetsInEntryNames} is enabled and the raw name has at least
+     * {@code MIN_BYTES_FOR_DETECTING_CHARSET} bytes, the encoding detector is asked for a
+     * charset and, if it finds one, the name is re-decoded from the raw bytes.
+     * <p>
+     * NOTE(review): {@code parentMetadata} is not read by this method, and there is no
+     * check for entries whose data cannot be read (e.g. unsupported features) before
+     * {@code getInputStream} is called - confirm whether that is intended.
+     *
+     * @param zipFile        archive that owns the entry
+     * @param entry          the entry to process
+     * @param extractor      extractor that decides on and performs embedded parsing
+     * @param parentMetadata metadata of the container document (currently unused here)
+     * @param xhtml          handler that receives the output for this entry
+     */
+    private void parseZipEntry(ZipFile zipFile, ZipArchiveEntry entry,
+                               EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
+                               XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        String entryName = entry.getName();
+
+        // Names stored in a non-unicode encoding: try to detect the charset from raw bytes.
+        byte[] rawName = detectCharsetsInEntryNames ? entry.getRawName() : null;
+        if (rawName != null && rawName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
+            Charset detected = getEncodingDetector()
+                    .detect(new UnsynchronizedByteArrayInputStream(rawName), new Metadata());
+            if (detected != null) {
+                entryName = new String(rawName, detected);
+            }
+        }
+
+        Date created = null;
+        if (entry.getCreationTime() != null) {
+            created = new Date(entry.getCreationTime().toMillis());
+        }
+        Date modified = null;
+        if (entry.getLastModifiedTime() != null) {
+            modified = new Date(entry.getLastModifiedTime().toMillis());
+        }
+        Metadata entryMetadata =
+                handleEntryMetadata(entryName, created, modified, entry.getSize(), xhtml);
+
+        if (!extractor.shouldParseEmbedded(entryMetadata)) {
+            return;
+        }
+        try (InputStream content = zipFile.getInputStream(entry)) {
+            extractor.parseEmbedded(content, xhtml, entryMetadata, true);
+        }
+    }
+
/**
* Parse the entries of the zip archive
*
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
new file mode 100644
index 0000000000..44e3e0761e
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+
+/**
+ * Manual benchmark that times {@link AutoDetectParser} over three generated zip files
+ * (10 x 1 KB, 1000 x 100 KB and 5000 x 500 KB stored entries) and prints the average
+ * parse time per file to standard out.
+ * <p>
+ * The benchmark writes roughly 2.6 GB of temporary data and parses it repeatedly, so it
+ * is skipped unless the {@code tika.zip.benchmark} system property is set to
+ * {@code true}.
+ */
+public class ZipBenchmark {
+
+    // System property that must be "true" for the benchmark to run.
+    private static final String ENABLE_PROPERTY = "tika.zip.benchmark";
+
+    // Toggle this to switch between DefaultHandler and RecursiveParserWrapper
+    private static final boolean USE_RECURSIVE_PARSER_WRAPPER = true;
+
+    /**
+     * Generates the three archives in {@code tempDir} and benchmarks each of them.
+     *
+     * @param tempDir JUnit-managed temporary directory that receives the generated zips
+     */
+    @Test
+    public void benchmarkAutoDetectParser(@TempDir Path tempDir) throws Exception {
+        // Opt-in only: an unconditional assumeTrue(true, ...) never skips, which would
+        // make every execution of this class generate and parse the large archives.
+        assumeTrue(Boolean.getBoolean(ENABLE_PROPERTY),
+                "Set -D" + ENABLE_PROPERTY + "=true to run");
+
+        int iterations = 40;
+        int warmupIterations = 6;
+
+        Path smallZip = tempDir.resolve("small.zip");
+        createBenchmarkZip(smallZip, 10, 1024);
+        System.out.println("Small: " + Files.size(smallZip) / 1024 + " KB");
+
+        Path mediumZip = tempDir.resolve("medium.zip");
+        createBenchmarkZip(mediumZip, 1000, 100 * 1024);
+        System.out.println("Medium: " + Files.size(mediumZip) / (1024 * 1024) + " MB");
+
+        Path largeZip = tempDir.resolve("large.zip");
+        createBenchmarkZip(largeZip, 5000, 500 * 1024);
+        System.out.println("Large: " + Files.size(largeZip) / (1024 * 1024) + " MB");
+
+        System.out.println("\n=== ZIP Benchmark ===");
+        System.out.println("Mode: " +
+                (USE_RECURSIVE_PARSER_WRAPPER ? "RecursiveParserWrapper" : "DefaultHandler"));
+        System.out.println();
+
+        System.out.println("Small ZIP (10 entries, 10KB):");
+        runBenchmark(smallZip, 10, iterations, warmupIterations);
+
+        // Fewer rounds for the bigger archives to keep the total run time bounded.
+        System.out.println("\nMedium ZIP (1000 entries, ~100MB):");
+        runBenchmark(mediumZip, 1000, 20, 4);
+
+        System.out.println("\nLarge ZIP (5000 entries, ~2.5GB):");
+        runBenchmark(largeZip, 5000, 10, 2);
+    }
+
+    /**
+     * Writes a zip with {@code numEntries} uncompressed (STORED) entries named
+     * {@code entry<i>.txt}, all sharing the same pseudo-random payload (fixed seed 42).
+     *
+     * @param zipPath    file to create
+     * @param numEntries number of entries to write
+     * @param entrySize  size of each entry's payload in bytes
+     */
+    private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize)
+            throws Exception {
+        try (java.util.zip.ZipOutputStream zos =
+                     new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) {
+            zos.setMethod(java.util.zip.ZipOutputStream.STORED);
+            java.util.Random random = new java.util.Random(42);
+            byte[] content = new byte[entrySize];
+            random.nextBytes(content);
+
+            // The payload is identical for every entry, so its CRC is computed once.
+            java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+            crc.update(content);
+            long crcValue = crc.getValue();
+
+            for (int i = 0; i < numEntries; i++) {
+                java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt");
+                // STORED entries need size, compressed size and CRC set before writing.
+                entry.setMethod(java.util.zip.ZipEntry.STORED);
+                entry.setSize(content.length);
+                entry.setCompressedSize(content.length);
+                entry.setCrc(crcValue);
+                zos.putNextEntry(entry);
+                zos.write(content);
+                zos.closeEntry();
+            }
+        }
+    }
+
+    /**
+     * Parses {@code zipPath} {@code warmup} times untimed, then {@code iterations} times
+     * timed, and prints the average wall-clock time per timed parse.
+     *
+     * @param zipPath    archive to parse
+     * @param numEntries entry count, used for the printed summary only
+     * @param iterations number of timed parses
+     * @param warmup     number of untimed parses run first
+     */
+    private void runBenchmark(Path zipPath, int numEntries, int iterations, int warmup)
+            throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+
+        long sizeKB = Files.size(zipPath) / 1024;
+        String sizeStr = sizeKB >= 1024 ? (sizeKB / 1024) + " MB" : sizeKB + " KB";
+        System.out.printf(Locale.ROOT, " Entries: %d, Size: %s%n", numEntries, sizeStr);
+
+        // Warmup
+        for (int i = 0; i < warmup; i++) {
+            parseOnce(parser, zipPath, context);
+        }
+
+        // Benchmark
+        long start = System.nanoTime();
+        for (int i = 0; i < iterations; i++) {
+            parseOnce(parser, zipPath, context);
+        }
+        long duration = System.nanoTime() - start;
+
+        double avgMs = duration / (double) iterations / 1_000_000.0;
+        System.out.printf(Locale.ROOT, " Average: %.3f ms%n", avgMs);
+    }
+
+    /**
+     * Opens a fresh {@link TikaInputStream} on {@code zipPath} and parses it once in the
+     * mode selected by {@code USE_RECURSIVE_PARSER_WRAPPER}.
+     */
+    private void parseOnce(AutoDetectParser parser, Path zipPath, ParseContext context)
+            throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
+            if (USE_RECURSIVE_PARSER_WRAPPER) {
+                parseWithRecursiveWrapper(parser, tis, context);
+            } else {
+                parser.parse(tis, new DefaultHandler(), new Metadata(), context);
+            }
+        }
+    }
+
+    /**
+     * Parses the stream through a new {@link RecursiveParserWrapper} around
+     * {@code parser}, using a text content handler factory with write limit -1.
+     */
+    private void parseWithRecursiveWrapper(AutoDetectParser parser, TikaInputStream tis,
+                                           ParseContext context) throws Exception {
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
+        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory);
+        wrapper.parse(tis, handler, new Metadata(), context);
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 18f8180616..c47c299a58 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -282,7 +282,12 @@ public class DefaultZipContainerDetector implements
Detector {
}
// Fallback: it's still a zip file, we just don't know what kind of one
if (zip != null) {
- IOUtils.closeQuietly(zip);
+ // Store ZipFile in openContainer so parser can reuse it
+ if (tis.getOpenContainer() == null) {
+ tis.setOpenContainer(zip);
+ } else {
+ tis.addCloseableResource(zip);
+ }
return MediaType.APPLICATION_ZIP;
}
if (LOG.isDebugEnabled()) {