This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4650-3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 20726385cc4b7a39a967ff4f067bfb06f3a01c05 Author: tallison <[email protected]> AuthorDate: Thu Feb 5 08:35:53 2026 -0500 TIKA-4650 - improvements for 3.x --- .../org/apache/tika/parser/pkg/PackageParser.java | 81 ++++++++++++ .../org/apache/tika/parser/pkg/ZipBenchmark.java | 144 +++++++++++++++++++++ .../detect/zip/DefaultZipContainerDetector.java | 7 +- 3 files changed, 231 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index d7599bf29e..b6aaff4e9c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -32,6 +32,7 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.util.Collections; import java.util.Date; +import java.util.Enumeration; import java.util.HashSet; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -53,6 +54,7 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; @@ -265,6 +267,16 @@ public class PackageParser extends AbstractEncodingDetectorParser { private void 
_parse(InputStream stream, ContentHandler handler, Metadata metadata,
                         ParseContext context, TemporaryResources tmp)
             throws TikaException, IOException, SAXException {
+        // Check if detector already opened a ZipFile and stored it in openContainer
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = tis.getOpenContainer();
+            if (container instanceof ZipFile) {
+                parseZipFile((ZipFile) container, handler, metadata, context);
+                return;
+            }
+        }
+
         ArchiveInputStream ais = null;
         String encoding = null;
         try {
@@ -352,6 +364,99 @@ public class PackageParser extends AbstractEncodingDetectorParser {
         }
     }
 
+    /**
+     * Parse a ZipFile that was already opened by the detector.
+     * This avoids the overhead of re-opening the file.
+     */
+    private void parseZipFile(ZipFile zipFile, ContentHandler handler, Metadata metadata,
+                              ParseContext context) throws IOException, SAXException, TikaException {
+        // Update media type if not already set to a specialization
+        String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE);
+        if (incomingContentTypeString == null) {
+            metadata.set(Metadata.CONTENT_TYPE, ZIP.toString());
+        } else {
+            MediaType incomingMediaType = MediaType.parse(incomingContentTypeString);
+            if (incomingMediaType != null && !PACKAGE_SPECIALIZATIONS.contains(incomingMediaType)) {
+                metadata.set(Metadata.CONTENT_TYPE, ZIP.toString());
+            }
+        }
+
+        EmbeddedDocumentExtractor extractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        try {
+            Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
+            while (entries.hasMoreElements()) {
+                ZipArchiveEntry entry = entries.nextElement();
+                if (!entry.isDirectory()) {
+                    parseZipEntry(zipFile, entry, extractor, metadata, xhtml);
+                }
+            }
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Parse a single entry from a ZipFile.
+     * <p>
+     * Entries whose data cannot be read (for example encrypted entries) are recorded
+     * as embedded stream exceptions on {@code parentMetadata} and skipped, instead of
+     * letting {@link ZipFile#getInputStream(ZipArchiveEntry)} abort the whole parse.
+     */
+    private void parseZipEntry(ZipFile zipFile, ZipArchiveEntry entry,
+                               EmbeddedDocumentExtractor extractor, Metadata parentMetadata,
+                               XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        String name = entry.getName();
+
+        // Try to detect charset of archive entry in case of non-unicode filename is used
+        if (detectCharsetsInEntryNames) {
+            byte[] entryName = entry.getRawName();
+            if (entryName != null && entryName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
+                Charset charset = getEncodingDetector().detect(
+                        new UnsynchronizedByteArrayInputStream(entryName), new Metadata());
+                if (charset != null) {
+                    name = new String(entryName, charset);
+                }
+            }
+        }
+
+        // canReadEntryData is false for entries ZipFile cannot decode (encryption or an
+        // unsupported compression method); report those on the parent and keep going.
+        if (!zipFile.canReadEntryData(entry)) {
+            String safeName = (name == null) ? "" : name;
+            if (entry.getGeneralPurposeBit().usesEncryption()) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(
+                        new org.apache.tika.exception.EncryptedDocumentException(
+                                "stream (" + safeName + ") is encrypted"),
+                        parentMetadata);
+            } else {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(
+                        new TikaException("Can't read archive stream (" + safeName + ")"),
+                        parentMetadata);
+            }
+            if (!safeName.isEmpty()) {
+                xhtml.element("p", safeName);
+            }
+            return;
+        }
+
+        Metadata entryMetadata = handleEntryMetadata(name,
+                entry.getCreationTime() != null ? new Date(entry.getCreationTime().toMillis()) : null,
+                entry.getLastModifiedTime() != null ? new Date(entry.getLastModifiedTime().toMillis()) : null,
+                entry.getSize(), xhtml);
+
+        if (extractor.shouldParseEmbedded(entryMetadata)) {
+            try (InputStream entryStream = zipFile.getInputStream(entry)) {
+                extractor.parseEmbedded(entryStream, xhtml, entryMetadata, true);
+            }
+        }
+    }
+
     /**
      * Parse the entries of the zip archive
      *
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
new file mode 100644
index 0000000000..44e3e0761e
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipBenchmark.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+
+public class ZipBenchmark {
+
+    // true: parse through RecursiveParserWrapper; false: AutoDetectParser with a DefaultHandler
+    private static final boolean USE_RECURSIVE_PARSER_WRAPPER = true;
+
+    @Test
+    public void benchmarkAutoDetectParser(@TempDir Path tempDir) throws Exception {
+        // Disabled by default: this writes roughly 2.5 GB of temp ZIPs. Set to true to run locally.
+        assumeTrue(false, "Set to true to run");
+
+        int iterations = 40;
+        int warmupIterations = 6;
+
+        Path smallZip = tempDir.resolve("small.zip");
+        createBenchmarkZip(smallZip, 10, 1024);
+        System.out.println("Small: " + Files.size(smallZip) / 1024 + " KB");
+
+        Path mediumZip = tempDir.resolve("medium.zip");
+        createBenchmarkZip(mediumZip, 1000, 100 * 1024);
+        System.out.println("Medium: " + Files.size(mediumZip) / (1024 * 1024) + " MB");
+
+        Path largeZip = tempDir.resolve("large.zip");
+        createBenchmarkZip(largeZip, 5000, 500 * 1024);
+        System.out.println("Large: " + Files.size(largeZip) / (1024 * 1024) + " MB");
+
+        System.out.println("\n=== ZIP Benchmark ===");
+        System.out.println("Mode: " + (USE_RECURSIVE_PARSER_WRAPPER ? "RecursiveParserWrapper" : "DefaultHandler"));
+        System.out.println();
+
+        System.out.println("Small ZIP (10 entries, 10KB):");
+        runBenchmark(smallZip, 10, iterations, warmupIterations);
+
+        System.out.println("\nMedium ZIP (1000 entries, ~100MB):");
+        runBenchmark(mediumZip, 1000, 20, 4);
+
+        System.out.println("\nLarge ZIP (5000 entries, ~2.5GB):");
+        runBenchmark(largeZip, 5000, 10, 2);
+    }
+
+    private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception {
+        try (java.util.zip.ZipOutputStream zos =
+                new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) {
+            zos.setMethod(java.util.zip.ZipOutputStream.STORED);
+            java.util.Random random = new java.util.Random(42);
+            byte[] content = new byte[entrySize];
+            random.nextBytes(content);
+            java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+            crc.update(content);
+            long crcValue = crc.getValue();
+
+            for (int i = 0; i < numEntries; i++) {
+                java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt");
+                entry.setMethod(java.util.zip.ZipEntry.STORED);
+                entry.setSize(content.length);
+                entry.setCompressedSize(content.length);
+                entry.setCrc(crcValue);
+                zos.putNextEntry(entry);
+                zos.write(content);
+                zos.closeEntry();
+            }
+        }
+    }
+
+    private void runBenchmark(Path zipPath, int numEntries, int iterations, int warmup) throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+
+        long sizeKB = Files.size(zipPath) / 1024;
+        String sizeStr = sizeKB >= 1024 ? (sizeKB / 1024) + " MB" : sizeKB + " KB";
+        System.out.printf(Locale.ROOT, "  Entries: %d, Size: %s%n", numEntries, sizeStr);
+
+        // Warmup: untimed passes before the measured loop
+        for (int i = 0; i < warmup; i++) {
+            try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
+                if (USE_RECURSIVE_PARSER_WRAPPER) {
+                    parseWithRecursiveWrapper(parser, tis, context);
+                } else {
+                    parser.parse(tis, new DefaultHandler(), new Metadata(), context);
+                }
+            }
+        }
+
+        // Benchmark: time all iterations together, then report the mean per parse
+        long start = System.nanoTime();
+        for (int i = 0; i < iterations; i++) {
+            try (TikaInputStream tis = TikaInputStream.get(zipPath)) {
+                if (USE_RECURSIVE_PARSER_WRAPPER) {
+                    parseWithRecursiveWrapper(parser, tis, context);
+                } else {
+                    parser.parse(tis, new DefaultHandler(), new Metadata(), context);
+                }
+            }
+        }
+        long duration = System.nanoTime() - start;
+
+        double avgMs = duration / (double) iterations / 1_000_000.0;
+        System.out.printf(Locale.ROOT, "  Average: %.3f ms%n", avgMs);
+    }
+
+    private void parseWithRecursiveWrapper(AutoDetectParser parser, TikaInputStream tis,
+                                           ParseContext context) throws Exception {
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
+        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory);
+        wrapper.parse(tis, handler, new Metadata(), context);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 18f8180616..c47c299a58 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -282,7 +282,12 @@ public class DefaultZipContainerDetector implements Detector { } // Fallback: it's still a zip file, we just don't know what kind of one if (zip != null) { - IOUtils.closeQuietly(zip); + // Store ZipFile in openContainer so parser can reuse it + if (tis.getOpenContainer() == null) { + tis.setOpenContainer(zip); + } else { + tis.addCloseableResource(zip); + } return MediaType.APPLICATION_ZIP; } if (LOG.isDebugEnabled()) {
