This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4650-refactor-zip-parser
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 79a636ed65b8b2f4b55052ca088f2971570fec11 Author: tallison <[email protected]> AuthorDate: Thu Feb 5 06:54:19 2026 -0500 improve zip detection + parsing, WIP --- .../main/java/org/apache/tika/metadata/Zip.java | 130 ++++ .../microsoft/ooxml/OOXMLExtractorFactory.java | 10 +- .../org/apache/tika/parser/epub/EpubParser.java | 2 +- .../tika/parser/pkg/AbstractArchiveParser.java | 84 +++ .../org/apache/tika/parser/pkg/PackageParser.java | 456 ++------------ .../java/org/apache/tika/parser/pkg/RarParser.java | 6 +- .../org/apache/tika/parser/pkg/SevenZParser.java | 164 +++++ .../java/org/apache/tika/parser/pkg/ZipParser.java | 690 +++++++++++++++++++++ .../apache/tika/parser/pkg/ZipParserConfig.java | 105 ++++ .../apache/tika/parser/pkg/PackageParserTest.java | 45 +- .../org/apache/tika/parser/pkg/ZipParserTest.java | 456 ++++++++++++-- .../detect/zip/DefaultZipContainerDetector.java | 72 ++- .../org/apache/tika/zip/utils/ZipSalvager.java | 212 +++++-- 13 files changed, 1858 insertions(+), 574 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Zip.java b/tika-core/src/main/java/org/apache/tika/metadata/Zip.java new file mode 100644 index 0000000000..4715a85383 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/Zip.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * ZIP file properties collection. + * + * @since Apache Tika 4.0 + */ +public interface Zip { + + String ZIP_PREFIX = "zip" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + + // ==================== Detector Hints ==================== + // These are set by the detector to communicate state to the parser. + // The detector MUST always set these when detecting a ZIP file, + // overwriting any user-provided values. + + /** + * Set by the detector to indicate whether it successfully opened the ZIP as a ZipFile. + * If true, the ZipFile is available via TikaInputStream.getOpenContainer(). + * If false, ZipFile failed to open (truncated, corrupt, etc.) and parser should use streaming. + */ + Property DETECTOR_ZIPFILE_OPENED = + Property.internalBoolean(ZIP_PREFIX + "detectorZipFileOpened"); + + /** + * Set by the detector to indicate whether streaming required DATA_DESCRIPTOR support. + * If true, parser should start streaming with allowStoredEntriesWithDataDescriptor=true. + */ + Property DETECTOR_DATA_DESCRIPTOR_REQUIRED = + Property.internalBoolean(ZIP_PREFIX + "detectorDataDescriptorRequired"); + + /** + * Set to true if the ZIP file was salvaged (rebuilt from a corrupt/truncated original). + * This indicates that the ZIP could not be opened directly and was repaired by + * streaming through the local headers and reconstructing a valid ZIP structure. 
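The detector-to-parser handshake documented above is easiest to see from the consuming side. A minimal sketch, assuming only the property names defined in this interface (the helper class itself is hypothetical, not part of this commit; it mirrors the checks ZipParser.parse makes later in this diff):

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.Zip;

    public class DetectorHintConsumer {
        // "false" is authoritative: the detector always overwrites these
        // hints, so ZipFile already failed to open once and the parser
        // should go straight to streaming.
        static boolean shouldStreamDirectly(Metadata metadata) {
            return "false".equals(metadata.get(Zip.DETECTOR_ZIPFILE_OPENED));
        }

        static boolean needsDataDescriptorSupport(Metadata metadata) {
            return "true".equals(metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED));
        }
    }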
+ */ + Property SALVAGED = Property.internalBoolean(ZIP_PREFIX + "salvaged"); + + // ==================== Entry Metadata ==================== + // These are set on embedded document metadata for each ZIP entry. + + /** + * Comment associated with a ZIP entry. + */ + Property COMMENT = Property.externalText(ZIP_PREFIX + "comment"); + + /** + * Compression method used for the entry (0=stored, 8=deflated, etc.). + */ + Property COMPRESSION_METHOD = Property.externalInteger(ZIP_PREFIX + "compressionMethod"); + + /** + * Compressed size of the entry in bytes. + */ + Property COMPRESSED_SIZE = Property.externalText(ZIP_PREFIX + "compressedSize"); + + /** + * Uncompressed size of the entry in bytes. + */ + Property UNCOMPRESSED_SIZE = Property.externalText(ZIP_PREFIX + "uncompressedSize"); + + /** + * CRC-32 checksum of the uncompressed entry data. + */ + Property CRC32 = Property.externalText(ZIP_PREFIX + "crc32"); + + /** + * Unix file mode/permissions for the entry. + */ + Property UNIX_MODE = Property.externalInteger(ZIP_PREFIX + "unixMode"); + + /** + * Platform that created the entry (0=MS-DOS, 3=Unix, etc.). + */ + Property PLATFORM = Property.externalInteger(ZIP_PREFIX + "platform"); + + /** + * Version of ZIP specification used to create the entry. + */ + Property VERSION_MADE_BY = Property.externalInteger(ZIP_PREFIX + "versionMadeBy"); + + /** + * Whether the entry is encrypted. + */ + Property ENCRYPTED = Property.externalBoolean(ZIP_PREFIX + "encrypted"); + + // ==================== Integrity Check Results ==================== + // These are set on the parent document metadata after integrity checking. + + /** + * Result of the integrity check comparing central directory to local headers. + * Values: "PASS" (no issues), "FAIL" (issues found), "PARTIAL" (only duplicate check done). + */ + Property INTEGRITY_CHECK_RESULT = Property.internalText(ZIP_PREFIX + "integrityCheckResult"); + + /** + * Entry names that appear multiple times in the local headers (streaming). + * Duplicate entries are a potential attack vector. + */ + Property DUPLICATE_ENTRY_NAMES = Property.internalTextBag(ZIP_PREFIX + "duplicateEntryNames"); + + /** + * Entry names that exist in central directory but not in local headers. + */ + Property CENTRAL_DIRECTORY_ONLY_ENTRIES = + Property.internalTextBag(ZIP_PREFIX + "centralDirectoryOnlyEntries"); + + /** + * Entry names that exist in local headers but not in central directory. + * These are "hidden" entries that some tools won't see. 
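For context, a hedged sketch of how downstream code might interpret these integrity results. Only the property names and the documented "PASS"/"FAIL"/"PARTIAL" values come from this interface; the reporting code is illustrative:

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.Zip;

    public class IntegrityReport {
        static void report(Metadata metadata) {
            String result = metadata.get(Zip.INTEGRITY_CHECK_RESULT);
            if ("FAIL".equals(result)) {
                // Duplicates and local-header-only ("hidden") entries are the
                // usual smuggling vectors to inspect first.
                for (String name : metadata.getValues(Zip.DUPLICATE_ENTRY_NAMES)) {
                    System.err.println("duplicate entry: " + name);
                }
                for (String name : metadata.getValues(Zip.LOCAL_HEADER_ONLY_ENTRIES)) {
                    System.err.println("hidden entry: " + name);
                }
            } else if ("PARTIAL".equals(result)) {
                System.err.println("streamed only; central directory not compared");
            }
        }
    }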
+ */ + Property LOCAL_HEADER_ONLY_ENTRIES = + Property.internalTextBag(ZIP_PREFIX + "localHeaderOnlyEntries"); +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index ebe10300cb..0d6ff8ec2b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.util.Locale; import org.apache.poi.extractor.ExtractorFactory; @@ -101,10 +102,13 @@ public class OOXMLExtractorFactory { pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage(); } else { try { - pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); + pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ); } catch (InvalidOperationException e) { - tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); - ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy); + Path tmpRepairedPath = Files.createTempFile("tika-ooxml-repair-", ""); + tmpRepairedCopy = tmpRepairedPath.toFile(); + tis.enableRewind(); + ZipSalvager.salvageCopy(tis, tmpRepairedPath, false); + tis.rewind(); pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); } tis.setOpenContainer(new OPCPackageWrapper(pkg)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index fb8fbce040..9779008017 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -171,7 +171,7 @@ public class EpubParser implements Parser { try (TemporaryResources resources = new TemporaryResources()) { Path salvaged = resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString())); - ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile()); + ZipSalvager.salvageCopy(brokenZip, salvaged); try (ZipFile zipFile = ZipFile.builder().setFile(salvaged.toFile()).get()) { return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false); } catch (EpubZipException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java new file mode 100644 index 0000000000..a599c91cd4 --- /dev/null +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.IOException; +import java.util.Date; + +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AbstractEncodingDetectorParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Abstract base class for archive parsers that provides common functionality + * for handling embedded documents within archives. + */ +public abstract class AbstractArchiveParser extends AbstractEncodingDetectorParser { + + public AbstractArchiveParser() { + super(); + } + + public AbstractArchiveParser(EncodingDetector encodingDetector) { + super(encodingDetector); + } + + /** + * Handles metadata for an archive entry and writes appropriate XHTML elements. 
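As an aside, a sketch of the call pattern this shared helper is designed for (RarParser, further down in this commit, calls it the same way; the literal values here are illustrative):

    import java.util.Date;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pkg.AbstractArchiveParser;
    import org.apache.tika.sax.XHTMLContentHandler;

    public class EntryMetadataUsage {
        static Metadata describeEntry(XHTMLContentHandler xhtml, ParseContext context)
                throws Exception {
            return AbstractArchiveParser.handleEntryMetadata(
                    "docs\\readme.txt", // backslashes are normalized to '/'
                    null,               // creation date unknown
                    new Date(),         // last-modified
                    1024L,              // uncompressed size in bytes
                    xhtml, context);
        }
    }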
+ * + * @param name the entry name + * @param createAt creation date (may be null) + * @param modifiedAt modification date (may be null) + * @param size entry size (may be null) + * @param xhtml the XHTML content handler + * @param context the parse context + * @return metadata object populated with entry information + */ + public static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, + Long size, XHTMLContentHandler xhtml, + ParseContext context) + throws SAXException, IOException, TikaException { + Metadata entrydata = Metadata.newInstance(context); + if (createAt != null) { + entrydata.set(TikaCoreProperties.CREATED, createAt); + } + if (modifiedAt != null) { + entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt); + } + if (size != null) { + entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); + } + if (name != null && name.length() > 0) { + name = name.replace("\\", "/"); + entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + entrydata.set(TikaCoreProperties.INTERNAL_PATH, name); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", name); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + return entrydata; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 0d8528fea8..ab1bfa65c0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -16,389 +16,108 @@ */ package org.apache.tika.parser.pkg; - import static org.apache.tika.detect.zip.PackageConstants.AR; import static org.apache.tika.detect.zip.PackageConstants.ARJ; import static org.apache.tika.detect.zip.PackageConstants.CPIO; import static org.apache.tika.detect.zip.PackageConstants.DUMP; -import static org.apache.tika.detect.zip.PackageConstants.JAR; -import static org.apache.tika.detect.zip.PackageConstants.SEVENZ; +import static org.apache.tika.detect.zip.PackageConstants.GTAR; import static org.apache.tika.detect.zip.PackageConstants.TAR; -import static org.apache.tika.detect.zip.PackageConstants.ZIP; import java.io.IOException; -import java.nio.charset.Charset; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.zip.ZipEntry; -import org.apache.commons.compress.PasswordRequiredException; import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; -import org.apache.commons.compress.archivers.StreamingNotSupportedException; import org.apache.commons.compress.archivers.ar.ArArchiveInputStream; +import org.apache.commons.compress.archivers.arj.ArjArchiveInputStream; import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream; import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream; 
-import org.apache.commons.compress.archivers.jar.JarArchiveInputStream; -import org.apache.commons.compress.archivers.sevenz.SevenZFile; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; -import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.config.TikaComponent; -import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.PasswordProvider; import org.apache.tika.sax.XHTMLContentHandler; /** - * Parser for various packaging formats. Package entries will be written to - * the XHTML event stream as <div class="package-entry"> elements that - * contain the (optional) entry name as a <h1> element and the full - * structured body content of the parsed entry. + * Parser for streaming archive formats: AR, ARJ, CPIO, DUMP, TAR. + * <p> + * Package entries will be written to the XHTML event stream as + * <div class="package-entry"> elements that contain the (optional) + * entry name as a <h1> element and the full structured body content + * of the parsed entry. * <p> - * User must have JCE Unlimited Strength jars installed for encryption to - * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars - * are not installed, an IOException will be thrown, and potentially - * wrapped in a TikaException. + * For ZIP/JAR archives, see {@link ZipParser}. + * For 7z archives, see {@link SevenZParser}. */ @TikaComponent -public class PackageParser extends AbstractEncodingDetectorParser { +public class PackageParser extends AbstractArchiveParser { - //We used to avoid overwriting file types if the file type - //was a specialization of zip/tar. We determined specialization of zip - //via TikaConfig at parse time. - //The following is an inelegant hack, but until we can serialize TikaConfig, - //or dramatically rework the ForkParser to avoid serialization - //of parsers, this is what we have. - //There is at least a test in PackageParserTest that makes sure that we - //keep this list updated. 
- //This is now legacy behavior: TODO figure out if we can go make to using the - //mime registry - static final Set<MediaType> PACKAGE_SPECIALIZATIONS = loadPackageSpecializations(); - /** - * Serial version UID - */ private static final long serialVersionUID = -5331043266963888708L; - private static final Set<MediaType> SUPPORTED_TYPES = - MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ); - - // The number of bytes of entry name to detect charset properly - private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100; - - - static final Set<MediaType> loadPackageSpecializations() { - Set<MediaType> zipSpecializations = new HashSet<>(); - for (String mediaTypeString : new String[]{ - //specializations of ZIP - "application/bizagi-modeler", "application/epub+zip", - "application/hwp+zip", - "application/java-archive", - "application/vnd.adobe.air-application-installer-package+zip", - "application/vnd.android.package-archive", "application/vnd.apple.iwork", - "application/vnd.apple.keynote", "application/vnd.apple.numbers", - "application/vnd.apple.pages", "application/vnd.apple.unknown.13", - "application/vnd.etsi.asic-e+zip", "application/vnd.etsi.asic-s+zip", - "application/vnd.google-earth.kmz", "application/vnd.mindjet.mindmanager", - "application/vnd.ms-excel.addin.macroenabled.12", - "application/vnd.ms-excel.sheet.binary.macroenabled.12", - "application/vnd.ms-excel.sheet.macroenabled.12", - "application/vnd.ms-excel.template.macroenabled.12", - "application/vnd.ms-powerpoint.addin.macroenabled.12", - "application/vnd.ms-powerpoint.presentation.macroenabled.12", - "application/vnd.ms-powerpoint.slide.macroenabled.12", - "application/vnd.ms-powerpoint.slideshow.macroenabled.12", - "application/vnd.ms-powerpoint.template.macroenabled.12", - "application/vnd.ms-visio.drawing", - "application/vnd.ms-visio.drawing.macroenabled.12", - "application/vnd.ms-visio.stencil", - "application/vnd.ms-visio.stencil.macroenabled.12", - "application/vnd.ms-visio.template", - "application/vnd.ms-visio.template.macroenabled.12", - "application/vnd.ms-word.document.macroenabled.12", - "application/vnd.ms-word.template.macroenabled.12", - "application/vnd.ms-xpsdocument", "application/vnd.oasis.opendocument.formula", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.presentationml.slide", - "application/vnd.openxmlformats-officedocument.presentationml.slideshow", - "application/vnd.openxmlformats-officedocument.presentationml.template", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.openxmlformats-officedocument.spreadsheetml.template", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - "application/x-ibooks+zip", "application/x-itunes-ipa", - "application/x-tika-iworks-protected", "application/x-tika-java-enterprise-archive", - "application/x-tika-java-web-archive", "application/x-tika-ooxml", - "application/x-tika-visio-ooxml", "application/x-xliff+zip", "application/x-xmind", - "model/vnd.dwfx+xps", "application/vnd.sun.xml.calc", - "application/vnd.sun.xml.writer", "application/vnd.sun.xml.writer.template", - "application/vnd.sun.xml.draw", "application/vnd.sun.xml.impress", - "application/vnd.openofficeorg.autotext", - "application/vnd.oasis.opendocument.graphics-template", - "application/vnd.oasis.opendocument.text-web", - 
"application/vnd.oasis.opendocument.spreadsheet-template", - "application/vnd.oasis.opendocument.graphics", - "application/vnd.oasis.opendocument.image-template", - "application/vnd.oasis.opendocument.text", - "application/vnd.oasis.opendocument.text-template", - "application/vnd.oasis.opendocument.presentation", - "application/vnd.oasis.opendocument.chart", - "application/vnd.openofficeorg.extension", - "application/vnd.oasis.opendocument.spreadsheet", - "application/vnd.oasis.opendocument.image", - "application/vnd.oasis.opendocument.formula-template", - "application/vnd.oasis.opendocument.presentation-template", - "application/vnd.oasis.opendocument.chart-template", - "application/vnd.oasis.opendocument.text-master", - "application/vnd.adobe.indesign-idml-package", - "application/x-gtar", //specialization of tar - "application/x-wacz", "application/x-vnd.datapackage+zip" - }) { - zipSpecializations.add(MediaType.parse(mediaTypeString)); - } - return Collections.unmodifiableSet(zipSpecializations); - } - - //not clear what we should use instead? - @Deprecated - static MediaType getMediaType(ArchiveInputStream stream) { - if (stream instanceof JarArchiveInputStream) { - return JAR; - } else if (stream instanceof ZipArchiveInputStream) { - return ZIP; - } else if (stream instanceof ArArchiveInputStream) { - return AR; - } else if (stream instanceof CpioArchiveInputStream) { - return CPIO; - } else if (stream instanceof DumpArchiveInputStream) { - return DUMP; - } else if (stream instanceof TarArchiveInputStream) { - return TAR; - } else if (stream instanceof SevenZWrapper) { - return SEVENZ; - } else { - return MediaType.OCTET_STREAM; - } - } - protected static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, - Long size, XHTMLContentHandler xhtml, - ParseContext context) - throws SAXException, IOException, TikaException { - Metadata entrydata = Metadata.newInstance(context); - if (createAt != null) { - entrydata.set(TikaCoreProperties.CREATED, createAt); - } - if (modifiedAt != null) { - entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt); - } - if (size != null) { - entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); - } - if (name != null && name.length() > 0) { - name = name.replace("\\", "/"); - entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); - entrydata.set(TikaCoreProperties.INTERNAL_PATH, name); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", name); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - } - return entrydata; - } - - private boolean detectCharsetsInEntryNames = true; + private static final Set<MediaType> SUPPORTED_TYPES = + MediaType.set(AR, ARJ, CPIO, DUMP, TAR); public PackageParser() { super(); } - public PackageParser(EncodingDetector encodingDetector) { - super(encodingDetector); - } - + @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } + @Override public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - // Enable rewind capability since we may need to re-read for 7z or data descriptor handling - tis.enableRewind(); - - TemporaryResources tmp = new TemporaryResources(); - // Shield the TikaInputStream from being closed when we close archive streams. - // This allows us to reset and re-read the stream for data descriptor handling. 
tis.setCloseShield(); try { - _parse(tis, handler, metadata, context, tmp); + doParse(tis, handler, metadata, context); } finally { tis.removeCloseShield(); } } - private void _parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, - ParseContext context, TemporaryResources tmp) - throws TikaException, IOException, SAXException { - ArchiveInputStream ais = null; - String encoding = null; + private void doParse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws TikaException, IOException, SAXException { + ArchiveInputStream ais; try { ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory()); - encoding = factory.getEntryEncoding(); - // At the end we want to close the archive stream to release - // any associated resources, but the underlying document stream - // should not be closed - //TODO -- we've probably already detected the stream by here. We should - //rely on that detection and not re-detect. - encoding = factory.getEntryEncoding(); - // At the end we want to close the archive stream to release - // any associated resources, but the underlying document stream - // should not be closed ais = factory.createArchiveInputStream(tis); - - } catch (StreamingNotSupportedException sne) { - // Most archive formats work on streams, but a few need files - if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) { - // Rework as a file, and wrap - tis.rewind(); - - // Seven Zip suports passwords, was one given? - String password = null; - PasswordProvider provider = context.get(PasswordProvider.class); - if (provider != null) { - password = provider.getPassword(metadata); - } - - SevenZFile sevenz; - try { - SevenZFile.Builder builder = new SevenZFile.Builder().setFile(tis.getFile()); - if (password == null) { - sevenz = builder.get(); - } else { - sevenz = builder.setPassword(password.toCharArray()).get(); - } - } catch (PasswordRequiredException e) { - throw new EncryptedDocumentException(e); - } - - // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty - ais = new SevenZWrapper(sevenz); - } else { - throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne); - } } catch (ArchiveException e) { throw new TikaException("Unable to unpack document stream", e); } updateMediaType(ais, metadata); - // Use the delegate parser to parse the contained document + EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); - // mark before we start parsing entries for potential reset - //needed for mutable int by ref, not for thread safety. - //this keeps track of how many entries were processed. 
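The entry-count bookkeeping deserves a standalone sketch: on a data-descriptor retry the stream is rewound and re-read from the start, and the counter says how many entries were already handled and must be drained. A generic illustration of just the skip logic (not Tika API):

    import java.util.Iterator;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.function.Consumer;

    public class RereadSkip {
        // First pass: alreadyDone counts each fully processed entry.
        // Retry pass (after rewinding): drain that many entries before
        // resuming, which is what the AtomicInteger is carried around for.
        static <E> void processSkipping(Iterator<E> entries, AtomicInteger alreadyDone,
                                        Consumer<E> process) {
            while (entries.hasNext()) {
                E entry = entries.next();
                if (alreadyDone.get() > 0) {
                    alreadyDone.decrementAndGet();
                    continue;
                }
                process.accept(entry);
            }
        }
    }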
- AtomicInteger entryCnt = new AtomicInteger(); - try { - parseEntries(ais, metadata, extractor, xhtml, false, entryCnt, context); - } catch (UnsupportedZipFeatureException zfe) { - // If this is a zip archive which requires a data descriptor, parse it again - if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { - // Close archive input stream and create a new one that could handle data descriptor - ais.close(); - tis.rewind(); - ais = new ZipArchiveInputStream(tis, encoding, true, true); - parseEntries(ais, metadata, extractor, xhtml, true, entryCnt, context); - } - } finally { - ais.close(); - xhtml.endDocument(); - } - } - - /** - * Parse the entries of the zip archive - * - * @param ais archive input stream - * @param metadata document metadata (input and output) - * @param extractor the delegate parser - * @param xhtml the xhtml handler - * @param shouldUseDataDescriptor indicates if a data descriptor is required or not - * @param entryCnt index of the entry - * @throws TikaException if the document could not be parsed - * @throws IOException if a UnsupportedZipFeatureException is met - * @throws SAXException if the SAX events could not be processed - */ - private void parseEntries(ArchiveInputStream ais, Metadata metadata, - EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml, - boolean shouldUseDataDescriptor, AtomicInteger entryCnt, - ParseContext context) - throws TikaException, IOException, SAXException { try { ArchiveEntry entry = ais.getNextEntry(); while (entry != null) { - if (shouldUseDataDescriptor && entryCnt.get() > 0) { - // With shouldUseDataDescriptor being true, we are reading - // the zip once again. The number of entryCnt entries have - // already been parsed in the last time, so we can just - // skip these entries. - entryCnt.decrementAndGet(); - entry = ais.getNextEntry(); - continue; - } - if (!entry.isDirectory()) { parseEntry(ais, entry, extractor, metadata, xhtml, context); } - - if (!shouldUseDataDescriptor) { - // Record the number of entries we have read, this is used - // for zip archives using Data Descriptor. 
It's used for - // skipping the entries we have already read - entryCnt.incrementAndGet(); - } - entry = ais.getNextEntry(); } - } catch (UnsupportedZipFeatureException zfe) { - - // If it's an encrypted document of unknown password, report as such - if (zfe.getFeature() == Feature.ENCRYPTION) { - throw new EncryptedDocumentException(zfe); - } - - if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { - throw zfe; - } - // Otherwise throw the exception - throw new TikaException("UnsupportedZipFeature", zfe); - } catch (PasswordRequiredException pre) { - throw new EncryptedDocumentException(pre); + } finally { + ais.close(); + xhtml.endDocument(); } } @@ -408,59 +127,52 @@ public class PackageParser extends AbstractEncodingDetectorParser { return; } - //now see if the user or an earlier step has passed in a content type String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE); if (incomingContentTypeString == null) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); return; } - MediaType incomingMediaType = MediaType.parse(incomingContentTypeString); if (incomingMediaType == null) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); return; } - if (!PACKAGE_SPECIALIZATIONS.contains(incomingMediaType)) { + // Don't overwrite if incoming type is a TAR specialization (e.g., gtar) + if (!incomingMediaType.equals(GTAR)) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } } + private static MediaType getMediaType(ArchiveInputStream stream) { + if (stream instanceof ArArchiveInputStream) { + return AR; + } else if (stream instanceof ArjArchiveInputStream) { + return ARJ; + } else if (stream instanceof CpioArchiveInputStream) { + return CPIO; + } else if (stream instanceof DumpArchiveInputStream) { + return DUMP; + } else if (stream instanceof TarArchiveInputStream) { + return TAR; + } else { + return MediaType.OCTET_STREAM; + } + } + private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml, ParseContext context) throws SAXException, IOException, TikaException { + String name = entry.getName(); - - //Try to detect charset of archive entry in case of non-unicode filename is used - if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) { - // Extend short entry name to improve accuracy of charset detection - byte[] entryName = ((ZipArchiveEntry) entry).getRawName(); - byte[] extendedEntryName = entryName; - if (0 < entryName.length && entryName.length < MIN_BYTES_FOR_DETECTING_CHARSET) { - int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length); - extendedEntryName = new byte[len]; - for (int i = 0; i < len; i++) { - extendedEntryName[i] = entryName[i % entryName.length]; - } - } - try (TikaInputStream tis = TikaInputStream.get(extendedEntryName)) { - Charset candidate = getEncodingDetector().detect(tis, parentMetadata, context); - if (candidate != null) { - name = new String(((ZipArchiveEntry) entry).getRawName(), candidate); - } - } - } - if (archive.canReadEntryData(entry)) { - // Fetch the metadata on the entry contained in the archive - Metadata entrydata = - handleEntryMetadata(name, null, entry.getLastModifiedDate(), entry.getSize(), - xhtml, context); + Metadata entrydata = handleEntryMetadata( + name, null, entry.getLastModifiedDate(), entry.getSize(), + xhtml, context); - // Recurse into the entry if desired if (extractor.shouldParseEmbedded(entrydata)) { TemporaryResources tmp = new TemporaryResources(); try { @@ -471,82 
+183,12 @@ public class PackageParser extends AbstractEncodingDetectorParser { } } } else { - name = (name == null) ? "" : name; - if (entry instanceof ZipArchiveEntry) { - ZipArchiveEntry zipArchiveEntry = (ZipArchiveEntry) entry; - boolean usesEncryption = zipArchiveEntry.getGeneralPurposeBit().usesEncryption(); - if (usesEncryption) { - EmbeddedDocumentUtil.recordEmbeddedStreamException( - new EncryptedDocumentException("stream (" + name + ") is encrypted"), - parentMetadata); - } - - // do not write to the handler if - // UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR - // is met, we will catch this exception and read the zip archive once again - boolean usesDataDescriptor = - zipArchiveEntry.getGeneralPurposeBit().usesDataDescriptor(); - if (usesDataDescriptor && zipArchiveEntry.getMethod() == ZipEntry.STORED) { - throw new UnsupportedZipFeatureException( - UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR, - zipArchiveEntry); - } - } else { - EmbeddedDocumentUtil.recordEmbeddedStreamException( - new TikaException("Can't read archive stream (" + name + ")"), - parentMetadata); - } - if (name.length() > 0) { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new TikaException("Can't read archive stream (" + name + ")"), + parentMetadata); + if (name != null && !name.isEmpty()) { xhtml.element("p", name); } } } - - // Pending a fix for COMPRESS-269, we have to wrap ourselves - private static class SevenZWrapper extends ArchiveInputStream { - private SevenZFile file; - - private SevenZWrapper(SevenZFile file) { - this.file = file; - } - - @Override - public int read() throws IOException { - return file.read(); - } - - @Override - public int read(byte[] b) throws IOException { - return file.read(b); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - return file.read(b, off, len); - } - - @Override - public ArchiveEntry getNextEntry() throws IOException { - return file.getNextEntry(); - } - - @Override - public void close() throws IOException { - file.close(); - } - } - - /** - * Whether or not to run the default charset detector against entry - * names in ZipFiles. The default is <code>true</code>. 
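For reference, the name-extension trick that this removed setting controlled (ZipParser keeps the same approach): short raw entry names are tiled up to a minimum byte count, since statistical charset detectors are unreliable on very short input. A self-contained sketch mirroring the removed code:

    public class EntryNameExtension {
        private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100;

        // Repeat a short raw name until it is close to 100 bytes long,
        // then hand the result to the charset detector.
        static byte[] extend(byte[] rawName) {
            if (rawName.length == 0 || rawName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
                return rawName;
            }
            int len = rawName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / rawName.length);
            byte[] extended = new byte[len];
            for (int i = 0; i < len; i++) {
                extended[i] = rawName[i % rawName.length];
            }
            return extended;
        }
    }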
- * - * @param detectCharsetsInEntryNames - */ - public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) { - this.detectCharsetsInEntryNames = detectCharsetsInEntryNames; - } - - public boolean isDetectCharsetsInEntryNames() { - return detectCharsetsInEntryNames; - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java index 96785a7b5d..36a96be8f1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java @@ -83,9 +83,9 @@ public class RarParser implements Parser { FileHeader header = rar.nextFileHeader(); while (header != null && !Thread.currentThread().isInterrupted()) { if (!header.isDirectory()) { - Metadata entrydata = PackageParser.handleEntryMetadata(header.getFileName(), - header.getCTime(), header.getMTime(), header.getFullUnpackSize(), - xhtml, context); + Metadata entrydata = AbstractArchiveParser.handleEntryMetadata( + header.getFileName(), header.getCTime(), header.getMTime(), + header.getFullUnpackSize(), xhtml, context); try (TikaInputStream rarTis = TikaInputStream.get(rar.getInputStream(header))) { if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(rarTis, handler, entrydata, context, true); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java new file mode 100644 index 0000000000..192cfbd497 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.apache.tika.detect.zip.PackageConstants.SEVENZ; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.compress.PasswordRequiredException; +import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; +import org.apache.commons.compress.archivers.sevenz.SevenZFile; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Parser for 7z (Seven Zip) archives. + * <p> + * This parser requires file-based access (not streaming) because + * the 7z format requires random access to the archive. + * <p> + * User must have JCE Unlimited Strength jars installed for encryption + * to work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars + * are not installed, an IOException will be thrown, and potentially + * wrapped in a TikaException. + */ +@TikaComponent +public class SevenZParser extends AbstractArchiveParser { + + private static final long serialVersionUID = -5331043266963888710L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(SEVENZ); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + // Seven Zip supports passwords, was one given? + String password = null; + PasswordProvider provider = context.get(PasswordProvider.class); + if (provider != null) { + password = provider.getPassword(metadata); + } + + SevenZFile sevenZFile; + try { + SevenZFile.Builder builder = new SevenZFile.Builder().setFile(tis.getFile()); + if (password == null) { + sevenZFile = builder.get(); + } else { + sevenZFile = builder.setPassword(password.toCharArray()).get(); + } + } catch (PasswordRequiredException e) { + throw new EncryptedDocumentException(e); + } + + metadata.set(Metadata.CONTENT_TYPE, SEVENZ.toString()); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + try { + SevenZArchiveEntry entry = sevenZFile.getNextEntry(); + while (entry != null) { + if (!entry.isDirectory()) { + parseEntry(sevenZFile, entry, extractor, metadata, xhtml, context); + } + entry = sevenZFile.getNextEntry(); + } + } finally { + sevenZFile.close(); + xhtml.endDocument(); + } + } + + private void parseEntry(SevenZFile sevenZFile, SevenZArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context) + throws SAXException, IOException, TikaException { + + String name = entry.getName(); + Metadata entrydata = handleEntryMetadata( + name, + entry.getHasCreationDate() ? 
entry.getCreationDate() : null, + entry.getHasLastModifiedDate() ? entry.getLastModifiedDate() : null, + entry.getSize(), + xhtml, + context); + + if (extractor.shouldParseEmbedded(entrydata)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get( + new SevenZEntryInputStream(sevenZFile), tmp, entrydata); + extractor.parseEmbedded(tis, xhtml, entrydata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + /** + * InputStream wrapper for reading the current entry from a SevenZFile. + */ + private static class SevenZEntryInputStream extends InputStream { + private final SevenZFile file; + + SevenZEntryInputStream(SevenZFile file) { + this.file = file; + } + + @Override + public int read() throws IOException { + return file.read(); + } + + @Override + public int read(byte[] b) throws IOException { + return file.read(b); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return file.read(b, off, len); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java new file mode 100644 index 0000000000..0c48731058 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -0,0 +1,690 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.apache.tika.detect.zip.PackageConstants.JAR; +import static org.apache.tika.detect.zip.PackageConstants.ZIP; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.file.attribute.FileTime; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.Zip; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.zip.utils.ZipSalvager; + +/** + * Parser for ZIP and JAR archives using file-based access for complete metadata extraction. + * <p> + * This parser handles: + * <ul> + * <li>Standard ZIP archives</li> + * <li>JAR (Java Archive) files</li> + * <li>Archive and entry comments</li> + * <li>Unix permissions and file attributes</li> + * <li>Charset detection for non-Unicode entry names</li> + * <li>Encryption detection</li> + * </ul> + * <p> + * This parser prefers file-based access (ZipFile) for complete metadata extraction, + * but falls back to streaming (ZipArchiveInputStream) for edge-case ZIPs that + * cannot be read as files (e.g., those with data descriptors that overlap the + * central directory). + */ +@TikaComponent() +public class ZipParser extends AbstractArchiveParser { + + /** + * Set of media types that are specializations of ZIP (e.g., Office documents, EPUB, APK). + * Used to avoid overwriting more specific media types with generic "application/zip". + */ + public static final Set<MediaType> ZIP_SPECIALIZATIONS = loadZipSpecializations(); + + private static final long serialVersionUID = -5331043266963888709L; + + private static final Set<MediaType> SUPPORTED_TYPES = MediaType.set(ZIP, JAR); + + private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100; + + /** + * Maximum number of entries to record in integrity check metadata fields. + * Prevents excessive metadata in ZIPs with many discrepancies. 
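A minimal sketch of the bounded bookkeeping this cap enables during streaming, with the same logic as parseStreamEntries below, restructured around Set.add's return value:

    import java.util.ArrayList;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class BoundedDuplicateTracker {
        private static final int MAX_RECORDED = 100;

        private final Set<String> seen = new LinkedHashSet<>();
        private final List<String> duplicates = new ArrayList<>();

        // Set.add returns false for a repeat; record it only while under
        // the cap so a hostile ZIP cannot bloat the output metadata.
        void offer(String entryName) {
            if (!seen.add(entryName) && duplicates.size() < MAX_RECORDED) {
                duplicates.add(entryName);
            }
        }

        List<String> getDuplicates() {
            return duplicates;
        }
    }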
+ */ + private static final int MAX_INTEGRITY_CHECK_ENTRIES = 100; + + private final ZipParserConfig defaultConfig; + + private static Set<MediaType> loadZipSpecializations() { + Set<MediaType> zipSpecializations = new HashSet<>(); + for (String mediaTypeString : new String[]{ + //specializations of ZIP + "application/bizagi-modeler", "application/epub+zip", + "application/hwp+zip", + "application/java-archive", + "application/vnd.adobe.air-application-installer-package+zip", + "application/vnd.android.package-archive", "application/vnd.apple.iwork", + "application/vnd.apple.keynote", "application/vnd.apple.numbers", + "application/vnd.apple.pages", "application/vnd.apple.unknown.13", + "application/vnd.etsi.asic-e+zip", "application/vnd.etsi.asic-s+zip", + "application/vnd.google-earth.kmz", "application/vnd.mindjet.mindmanager", + "application/vnd.ms-excel.addin.macroenabled.12", + "application/vnd.ms-excel.sheet.binary.macroenabled.12", + "application/vnd.ms-excel.sheet.macroenabled.12", + "application/vnd.ms-excel.template.macroenabled.12", + "application/vnd.ms-powerpoint.addin.macroenabled.12", + "application/vnd.ms-powerpoint.presentation.macroenabled.12", + "application/vnd.ms-powerpoint.slide.macroenabled.12", + "application/vnd.ms-powerpoint.slideshow.macroenabled.12", + "application/vnd.ms-powerpoint.template.macroenabled.12", + "application/vnd.ms-visio.drawing", + "application/vnd.ms-visio.drawing.macroenabled.12", + "application/vnd.ms-visio.stencil", + "application/vnd.ms-visio.stencil.macroenabled.12", + "application/vnd.ms-visio.template", + "application/vnd.ms-visio.template.macroenabled.12", + "application/vnd.ms-word.document.macroenabled.12", + "application/vnd.ms-word.template.macroenabled.12", + "application/vnd.ms-xpsdocument", "application/vnd.oasis.opendocument.formula", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.presentationml.slide", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + "application/vnd.openxmlformats-officedocument.presentationml.template", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.template", + "application/x-ibooks+zip", "application/x-itunes-ipa", + "application/x-tika-iworks-protected", "application/x-tika-java-enterprise-archive", + "application/x-tika-java-web-archive", "application/x-tika-ooxml", + "application/x-tika-visio-ooxml", "application/x-xliff+zip", "application/x-xmind", + "model/vnd.dwfx+xps", "application/vnd.sun.xml.calc", + "application/vnd.sun.xml.writer", "application/vnd.sun.xml.writer.template", + "application/vnd.sun.xml.draw", "application/vnd.sun.xml.impress", + "application/vnd.openofficeorg.autotext", + "application/vnd.oasis.opendocument.graphics-template", + "application/vnd.oasis.opendocument.text-web", + "application/vnd.oasis.opendocument.spreadsheet-template", + "application/vnd.oasis.opendocument.graphics", + "application/vnd.oasis.opendocument.image-template", + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.text-template", + "application/vnd.oasis.opendocument.presentation", + "application/vnd.oasis.opendocument.chart", + "application/vnd.openofficeorg.extension", + "application/vnd.oasis.opendocument.spreadsheet", + 
"application/vnd.oasis.opendocument.image", + "application/vnd.oasis.opendocument.formula-template", + "application/vnd.oasis.opendocument.presentation-template", + "application/vnd.oasis.opendocument.chart-template", + "application/vnd.oasis.opendocument.text-master", + "application/vnd.adobe.indesign-idml-package", + "application/x-wacz", "application/x-vnd.datapackage+zip" + }) { + zipSpecializations.add(MediaType.parse(mediaTypeString)); + } + return Collections.unmodifiableSet(zipSpecializations); + } + + public ZipParser() { + super(); + this.defaultConfig = new ZipParserConfig(); + } + + public ZipParser(ZipParserConfig config) { + super(); + this.defaultConfig = config; + } + + /** + * Constructor for JSON-based configuration. + */ + public ZipParser(JsonConfig jsonConfig) throws TikaConfigException { + this(ConfigDeserializer.buildConfig(jsonConfig, ZipParserConfig.class)); + } + + public ZipParser(EncodingDetector encodingDetector) { + super(encodingDetector); + this.defaultConfig = new ZipParserConfig(); + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + ZipParserConfig config = context.get(ZipParserConfig.class, defaultConfig); + + if (tis.getOpenContainer() instanceof ZipFile) { + // detectEntryName handles charset decoding from raw bytes, no need to reopen + parseWithZipFile((ZipFile) tis.getOpenContainer(), tis, handler, metadata, context, config); + return; + } + + // Check detector hints - if detector already tried ZipFile and failed, go straight to streaming + String detectorZipFileOpened = metadata.get(Zip.DETECTOR_ZIPFILE_OPENED); + if ("false".equals(detectorZipFileOpened)) { + // Detector already tried and failed - skip ZipFile, use streaming + // Enable rewind for DATA_DESCRIPTOR retry in parseWithStream + tis.enableRewind(); + String dataDescriptorRequired = metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED); + parseWithStream(tis, handler, metadata, context, config, + "true".equals(dataDescriptorRequired)); + return; + } + + // No detector hint - try to open ZipFile (with salvaging fallback) + // This likely means that the user didn't apply a detector first or the zip detector was not in the chain + ZipFile zipFile = ZipSalvager.tryToOpenZipFile(tis, metadata, config.getEntryEncoding()); + + if (zipFile != null) { + // ZipFile opened (directly or via salvaging) - use file-based parsing + parseWithZipFile(zipFile, tis, handler, metadata, context, config); + } else { + // ZipFile and salvaging both failed - use streaming + // Enable rewind for DATA_DESCRIPTOR retry in parseWithStream + // (may be redundant if tryToOpenZipFile already called it, but that's safe) + tis.enableRewind(); + parseWithStream(tis, handler, metadata, context, config, false); + } + } + + /** + * Parses using a pre-opened ZipFile passed from the detector. 
+ * + * @param zipFile the pre-opened ZipFile from detector + * @param tis the TikaInputStream (for integrity check rewind) + * @param handler the content handler + * @param metadata the metadata + * @param context the parse context + * @param config the parser configuration + */ + private void parseWithZipFile(ZipFile zipFile, TikaInputStream tis, ContentHandler handler, + Metadata metadata, ParseContext context, ZipParserConfig config) + throws IOException, SAXException, TikaException { + + // Collect entry names from central directory for integrity check + Set<String> centralDirectoryEntries = config.isIntegrityCheck() + ? new LinkedHashSet<>() : null; + + // Don't close the ZipFile - it was passed from the detector and will be closed + // when TikaInputStream is closed (it's set as the openContainer) + updateMediaType(zipFile, metadata); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + try { + Enumeration<ZipArchiveEntry> entries = zipFile.getEntries(); + while (entries.hasMoreElements()) { + ZipArchiveEntry entry = entries.nextElement(); + if (centralDirectoryEntries != null) { + centralDirectoryEntries.add(entry.getName()); + } + if (!entry.isDirectory()) { + parseZipFileEntry(zipFile, entry, extractor, metadata, xhtml, context, config); + } + } + } finally { + xhtml.endDocument(); + } + + // Perform integrity check if enabled + if (config.isIntegrityCheck()) { + tis.enableRewind(); + tis.rewind(); + performIntegrityCheck(tis, metadata, centralDirectoryEntries, config); + } + } + + /** + * Parses using streaming with optional initial data descriptor support. + * + * @param tis the TikaInputStream + * @param handler the content handler + * @param metadata the metadata + * @param context the parse context + * @param config the parser configuration + * @param startWithDataDescriptor whether to start with data descriptor support enabled + */ + private void parseWithStream(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context, ZipParserConfig config, + boolean startWithDataDescriptor) + throws IOException, SAXException, TikaException { + + // Track entry names for duplicate detection during streaming + Set<String> seenEntryNames = config.isIntegrityCheck() + ? new LinkedHashSet<>() : null; + List<String> duplicates = config.isIntegrityCheck() + ? new ArrayList<>() : null; + + String encoding = config.getEntryEncoding() != null + ? 
config.getEntryEncoding().name() + : null; + ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, startWithDataDescriptor); + + updateMediaType(metadata); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + AtomicInteger entryCnt = new AtomicInteger(); + try { + parseStreamEntries(zis, metadata, extractor, xhtml, false, entryCnt, context, config, + seenEntryNames, duplicates); + } catch (UnsupportedZipFeatureException zfe) { + if (zfe.getFeature() == Feature.DATA_DESCRIPTOR && !startWithDataDescriptor) { + // Re-read with data descriptor support + zis.close(); + tis.rewind(); + zis = new ZipArchiveInputStream(tis, encoding, true, true); + parseStreamEntries(zis, metadata, extractor, xhtml, true, entryCnt, context, config, + seenEntryNames, duplicates); + } else { + throw zfe; + } + } finally { + zis.close(); + xhtml.endDocument(); + } + + // Record integrity check results (streaming only = can't compare to central directory) + if (config.isIntegrityCheck()) { + if (duplicates.isEmpty()) { + // No duplicates found, but we couldn't compare to central directory + metadata.set(Zip.INTEGRITY_CHECK_RESULT, "PARTIAL"); + } else { + metadata.set(Zip.INTEGRITY_CHECK_RESULT, "FAIL"); + for (String dup : duplicates) { + metadata.add(Zip.DUPLICATE_ENTRY_NAMES, dup); + } + } + } + } + + private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata, + EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml, + boolean shouldUseDataDescriptor, AtomicInteger entryCnt, + ParseContext context, ZipParserConfig config, + Set<String> seenEntryNames, List<String> duplicates) + throws TikaException, IOException, SAXException { + + try { + ArchiveEntry entry = zis.getNextEntry(); + while (entry != null) { + if (shouldUseDataDescriptor && entryCnt.get() > 0) { + // Skip already-processed entries on re-read + entryCnt.decrementAndGet(); + entry = zis.getNextEntry(); + continue; + } + + if (!entry.isDirectory() && entry instanceof ZipArchiveEntry) { + parseStreamEntry(zis, (ZipArchiveEntry) entry, extractor, metadata, + xhtml, context, config); + + // Track duplicates AFTER successful processing + // (if DATA_DESCRIPTOR exception occurs, we'll re-read this entry) + if (seenEntryNames != null && duplicates != null) { + String name = entry.getName(); + if (seenEntryNames.contains(name)) { + if (duplicates.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + duplicates.add(name); + } + } else { + seenEntryNames.add(name); + } + } + } + + // Increment AFTER successful processing + if (!shouldUseDataDescriptor) { + entryCnt.incrementAndGet(); + } + + entry = zis.getNextEntry(); + } + } catch (UnsupportedZipFeatureException zfe) { + if (zfe.getFeature() == Feature.ENCRYPTION) { + throw new EncryptedDocumentException(zfe); + } + if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { + throw zfe; + } + throw new TikaException("UnsupportedZipFeature", zfe); + } + } + + private void updateMediaType(ZipFile zipFile, Metadata metadata) { + MediaType type = ZIP; + Enumeration<ZipArchiveEntry> entries = zipFile.getEntries(); + if (entries.hasMoreElements()) { + ZipArchiveEntry first = entries.nextElement(); + if ("META-INF/MANIFEST.MF".equals(first.getName())) { + type = JAR; + } + } + setMediaTypeIfNotSpecialization(metadata, type); + } + + private void updateMediaType(Metadata metadata) { + setMediaTypeIfNotSpecialization(metadata, ZIP); 
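The data-descriptor retry above is the one place the parser deliberately re-reads the stream. The pattern in isolation (a sketch; walkEntries() is a hypothetical stand-in for the entry loop, and a null encoding means the platform default):

    tis.enableRewind();
    ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, null, true, false);
    try {
        walkEntries(zis);
    } catch (UnsupportedZipFeatureException e) {
        if (e.getFeature() != UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
            throw e;
        }
        // STORED entries with data descriptors can't be read on the first pass;
        // rewind and re-stream with allowStoredEntriesWithDataDescriptor=true.
        zis.close();
        tis.rewind();
        zis = new ZipArchiveInputStream(tis, null, true, true);
        walkEntries(zis);
    } finally {
        zis.close();
    }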
+ } + + private void setMediaTypeIfNotSpecialization(Metadata metadata, MediaType type) { + String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE); + if (incomingContentTypeString == null) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + return; + } + + MediaType incomingMediaType = MediaType.parse(incomingContentTypeString); + if (incomingMediaType == null) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + return; + } + + if (!ZIP_SPECIALIZATIONS.contains(incomingMediaType)) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + } + } + + private void parseZipFileEntry(ZipFile zipFile, ZipArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context, + ZipParserConfig config) + throws SAXException, IOException, TikaException { + + String name = detectEntryName(entry, parentMetadata, context, config); + + if (entry.getGeneralPurposeBit().usesEncryption()) { + handleEncryptedEntry(name, parentMetadata, xhtml); + return; + } + + Metadata entryMetadata = buildEntryMetadata(entry, name, context); + + writeEntryXhtml(name, xhtml); + + if (extractor.shouldParseEmbedded(entryMetadata)) { + TemporaryResources tmp = new TemporaryResources(); + try (InputStream entryStream = zipFile.getInputStream(entry)) { + TikaInputStream tis = TikaInputStream.get(entryStream, tmp, entryMetadata); + extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + private void parseStreamEntry(ZipArchiveInputStream zis, ZipArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context, + ZipParserConfig config) + throws SAXException, IOException, TikaException { + + String name = detectEntryName(entry, parentMetadata, context, config); + + if (!zis.canReadEntryData(entry)) { + if (entry.getGeneralPurposeBit().usesEncryption()) { + handleEncryptedEntry(name, parentMetadata, xhtml); + } else if (entry.getGeneralPurposeBit().usesDataDescriptor() + && entry.getMethod() == java.util.zip.ZipEntry.STORED) { + throw new UnsupportedZipFeatureException(Feature.DATA_DESCRIPTOR, entry); + } else { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new TikaException("Can't read archive stream (" + name + ")"), + parentMetadata); + if (name != null && !name.isEmpty()) { + xhtml.element("p", name); + } + } + return; + } + + Metadata entryMetadata = buildEntryMetadata(entry, name, context); + + writeEntryXhtml(name, xhtml); + + if (extractor.shouldParseEmbedded(entryMetadata)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(zis, tmp, entryMetadata); + extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + private String detectEntryName(ZipArchiveEntry entry, Metadata parentMetadata, + ParseContext context, ZipParserConfig config) throws IOException { + // If user specified an encoding, decode raw bytes with that charset + // This avoids needing to reopen the ZipFile with a different charset + if (config.getEntryEncoding() != null) { + return new String(entry.getRawName(), config.getEntryEncoding()); + } + + // If charset detection is enabled, try to detect and decode + if (config.isDetectCharsetsInEntryNames()) { + byte[] entryName = entry.getRawName(); + byte[] extendedEntryName = entryName; + if (0 < entryName.length && entryName.length < 
MIN_BYTES_FOR_DETECTING_CHARSET) { + int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length); + extendedEntryName = new byte[len]; + for (int i = 0; i < len; i++) { + extendedEntryName[i] = entryName[i % entryName.length]; + } + } + + try (TikaInputStream detectStream = TikaInputStream.get(extendedEntryName)) { + Charset candidate = getEncodingDetector().detect(detectStream, parentMetadata, context); + if (candidate != null) { + return new String(entry.getRawName(), candidate); + } + } + } + + // Fall back to default decoding + return entry.getName(); + } + + private void handleEncryptedEntry(String name, Metadata parentMetadata, + XHTMLContentHandler xhtml) throws SAXException { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new EncryptedDocumentException("stream (" + name + ") is encrypted"), + parentMetadata); + if (name != null && !name.isEmpty()) { + xhtml.element("p", name); + } + } + + private Metadata buildEntryMetadata(ZipArchiveEntry entry, String name, ParseContext context) + throws IOException, TikaException, SAXException { + Metadata entryMetadata = Metadata.newInstance(context); + + if (name != null && name.length() > 0) { + name = name.replace("\\", "/"); + entryMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + entryMetadata.set(TikaCoreProperties.INTERNAL_PATH, name); + } + + FileTime creationTime = entry.getCreationTime(); + if (creationTime != null) { + entryMetadata.set(TikaCoreProperties.CREATED, creationTime.toInstant().toString()); + } + FileTime modifiedTime = entry.getLastModifiedTime(); + if (modifiedTime != null) { + entryMetadata.set(TikaCoreProperties.MODIFIED, modifiedTime.toInstant().toString()); + } + + long size = entry.getSize(); + if (size >= 0) { + entryMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); + entryMetadata.set(Zip.UNCOMPRESSED_SIZE, Long.toString(size)); + } + long compressedSize = entry.getCompressedSize(); + if (compressedSize >= 0) { + entryMetadata.set(Zip.COMPRESSED_SIZE, Long.toString(compressedSize)); + } + + entryMetadata.set(Zip.COMPRESSION_METHOD, entry.getMethod()); + + long crc = entry.getCrc(); + if (crc >= 0) { + entryMetadata.set(Zip.CRC32, Long.toString(crc)); + } + + int unixMode = entry.getUnixMode(); + if (unixMode != 0) { + entryMetadata.set(Zip.UNIX_MODE, unixMode); + } + + entryMetadata.set(Zip.PLATFORM, entry.getPlatform()); + entryMetadata.set(Zip.VERSION_MADE_BY, entry.getVersionMadeBy()); + + String entryComment = entry.getComment(); + if (entryComment != null && !entryComment.isEmpty()) { + entryMetadata.set(Zip.COMMENT, entryComment); + } + + return entryMetadata; + } + + private void writeEntryXhtml(String name, XHTMLContentHandler xhtml) throws SAXException { + if (name != null && name.length() > 0) { + org.xml.sax.helpers.AttributesImpl attributes = new org.xml.sax.helpers.AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", name); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + + /** + * Performs integrity check by streaming through the ZIP and comparing + * local file headers against the central directory entries. 
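One subtlety in detectEntryName above: charset detectors are unreliable on very short inputs, so the raw name bytes are tiled up toward MIN_BYTES_FOR_DETECTING_CHARSET before detection runs. Note the integer division, which means the extended buffer can still land short of the minimum (7 bytes tile to 14 when the minimum is 20). The tiling step in isolation, with the minimum assumed to be 20 (the real constant lives in ZipParser):

    private static byte[] extendForDetection(byte[] entryName) {
        final int MIN_BYTES_FOR_DETECTING_CHARSET = 20; // assumed value for illustration
        if (entryName.length == 0 || entryName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
            return entryName;
        }
        int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length);
        byte[] extended = new byte[len];
        for (int i = 0; i < len; i++) {
            // repeat the name to give the detector more signal to work with
            extended[i] = entryName[i % entryName.length];
        }
        return extended;
    }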
+ * + * @param tis the TikaInputStream (must be rewound) + * @param metadata the parent metadata to record results + * @param centralDirectoryEntries entry names from the central directory + * @param config the parser configuration + */ + private void performIntegrityCheck(TikaInputStream tis, Metadata metadata, + Set<String> centralDirectoryEntries, + ZipParserConfig config) throws IOException { + + String encoding = config.getEntryEncoding() != null + ? config.getEntryEncoding().name() + : null; + + Set<String> seenInStream = new LinkedHashSet<>(); + List<String> duplicates = new ArrayList<>(); + List<String> localHeaderOnly = new ArrayList<>(); + + try (ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, true)) { + ZipArchiveEntry entry; + while ((entry = zis.getNextZipEntry()) != null) { + String name = entry.getName(); + + // Check for duplicates + if (seenInStream.contains(name)) { + if (duplicates.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + duplicates.add(name); + } + } else { + seenInStream.add(name); + } + + // Check for entries not in central directory + if (!centralDirectoryEntries.contains(name)) { + if (localHeaderOnly.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + localHeaderOnly.add(name); + } + } + } + } catch (IOException e) { + // If streaming fails, we still record what we found + } + + // Find entries in central directory but not in local headers + List<String> centralOnly = new ArrayList<>(); + for (String cdEntry : centralDirectoryEntries) { + if (!seenInStream.contains(cdEntry)) { + if (centralOnly.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + centralOnly.add(cdEntry); + } + } + } + + // Record results + boolean passed = duplicates.isEmpty() && localHeaderOnly.isEmpty() && centralOnly.isEmpty(); + metadata.set(Zip.INTEGRITY_CHECK_RESULT, passed ? "PASS" : "FAIL"); + + for (String dup : duplicates) { + metadata.add(Zip.DUPLICATE_ENTRY_NAMES, dup); + } + for (String local : localHeaderOnly) { + metadata.add(Zip.LOCAL_HEADER_ONLY_ENTRIES, local); + } + for (String cd : centralOnly) { + metadata.add(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES, cd); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java new file mode 100644 index 0000000000..9bc53aaca9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import java.io.Serializable; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Configuration for {@link ZipParser}. + */ +public class ZipParserConfig implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * Whether to run charset detection on entry names to handle + * non-Unicode filenames. Default is true. + */ + private boolean detectCharsetsInEntryNames = true; + + /** + * The charset to use for reading entry names. If null, the parser + * will use the platform default or auto-detect based on + * {@link #detectCharsetsInEntryNames}. + */ + private Charset entryEncoding = null; + + /** + * Whether to perform integrity checking by comparing the central directory + * (read via file-based access) against local file headers (read via streaming). + * This can detect: + * <ul> + * <li>Duplicate entry names (potential attack vector)</li> + * <li>Entries in central directory but not in local headers</li> + * <li>Entries in local headers but not in central directory</li> + * </ul> + * Default is true. When enabled, the ZIP is parsed twice if file-based access + * succeeds. If only streaming is possible, duplicate detection is still performed + * but central directory comparison is skipped (result will be "PARTIAL" if no + * duplicates are found). + */ + private boolean integrityCheck = true; + + public ZipParserConfig() { + } + + public boolean isDetectCharsetsInEntryNames() { + return detectCharsetsInEntryNames; + } + + public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) { + this.detectCharsetsInEntryNames = detectCharsetsInEntryNames; + } + + public Charset getEntryEncoding() { + return entryEncoding; + } + + public void setEntryEncoding(Charset entryEncoding) { + this.entryEncoding = entryEncoding; + } + + /** + * Set the entry encoding from a string (for JSON deserialization). 
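The setter whose javadoc begins here (body continues below) is the JSON-facing way to carry a charset as a plain string. Its validation behavior, sketched: Charset.forName either resolves the name or throws UnsupportedCharsetException, which the setter wraps in a TikaConfigException. Note that an IllegalCharsetNameException (a syntactically invalid name) would propagate uncaught as written.

    ZipParserConfig config = new ZipParserConfig();
    config.setEntryEncodingName("SJIS");             // resolves via Charset.forName("SJIS")
    try {
        config.setEntryEncodingName("no-such-charset");
    } catch (TikaConfigException expected) {
        // "Unsupported charset: no-such-charset"
    }
    config.setEntryEncodingName("");                 // empty or null resets entryEncoding to null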
+ * + * @param charsetName the charset name + * @throws TikaConfigException if the charset is not supported + */ + public void setEntryEncodingName(String charsetName) throws TikaConfigException { + if (charsetName == null || charsetName.isEmpty()) { + this.entryEncoding = null; + return; + } + try { + this.entryEncoding = Charset.forName(charsetName); + } catch (UnsupportedCharsetException e) { + throw new TikaConfigException("Unsupported charset: " + charsetName, e); + } + } + + public boolean isIntegrityCheck() { + return integrityCheck; + } + + public void setIntegrityCheck(boolean integrityCheck) { + this.integrityCheck = integrityCheck; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java index bac9a1bb75..f209ec0189 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java @@ -38,45 +38,52 @@ public class PackageParserTest extends TikaTest { @Test public void testCoverage() throws Exception { - //test that the package parser covers all inputstreams handled - //by ArchiveStreamFactory. When we update commons-compress, and they add - //a new stream type, we want to make sure that we're handling it. + // Test that the archive parsers collectively cover all input streams handled + // by ArchiveStreamFactory. When we update commons-compress, and they add + // a new stream type, we want to make sure that we're handling it. ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory(StandardCharsets.UTF_8.name()); + PackageParser packageParser = new PackageParser(); + ZipParser zipParser = new ZipParser(); + SevenZParser sevenZParser = new SevenZParser(); ParseContext parseContext = new ParseContext(); + + // Combine supported types from all archive parsers + Set<MediaType> allSupportedTypes = new HashSet<>(); + allSupportedTypes.addAll(packageParser.getSupportedTypes(parseContext)); + allSupportedTypes.addAll(zipParser.getSupportedTypes(parseContext)); + allSupportedTypes.addAll(sevenZParser.getSupportedTypes(parseContext)); + for (String name : archiveStreamFactory.getInputStreamArchiveNames()) { MediaType mt = PackageConstants.getMediaType(name); - //use this instead of assertNotEquals so that we report the - //name of the missing stream + // Use this instead of assertNotEquals so that we report the + // name of the missing stream if (mt.equals(MediaType.OCTET_STREAM)) { fail("getting octet-stream for: " + name); } - if (!packageParser.getSupportedTypes(parseContext).contains(mt)) { - fail("PackageParser should support: " + mt.toString()); + if (!allSupportedTypes.contains(mt)) { + fail("Archive parsers should support: " + mt.toString()); } } } @Test - public void testSpecializations() throws Exception { - //Test that our manually constructed list of children of zip and tar - //in PackageParser is current with TikaLoader's media type registry. + public void testZipSpecializations() throws Exception { + // Test that our manually constructed list of ZIP specializations + // in ZipParser is current with TikaLoader's media type registry. 
MediaTypeRegistry mediaTypeRegistry = TikaLoader.getMediaTypeRegistry(); - Set<MediaType> currentSpecializations = new HashSet<>(); - MediaType tar = MediaType.parse("application/x-tar"); + Set<MediaType> currentZipSpecializations = new HashSet<>(); for (MediaType type : mediaTypeRegistry.getTypes()) { - if (mediaTypeRegistry.isSpecializationOf(type, MediaType.APPLICATION_ZIP) || - mediaTypeRegistry.isSpecializationOf(type, tar)) { - currentSpecializations.add(type); -// System.out.println("\""+type.toString()+"\","); + if (mediaTypeRegistry.isSpecializationOf(type, MediaType.APPLICATION_ZIP)) { + currentZipSpecializations.add(type); } } - for (MediaType mediaType : currentSpecializations) { - assertTrue(PackageParser.PACKAGE_SPECIALIZATIONS.contains(mediaType), + for (MediaType mediaType : currentZipSpecializations) { + assertTrue(ZipParser.ZIP_SPECIALIZATIONS.contains(mediaType), "missing: " + mediaType); } - assertEquals(currentSpecializations.size(), PackageParser.PACKAGE_SPECIALIZATIONS.size()); + assertEquals(currentZipSpecializations.size(), ZipParser.ZIP_SPECIALIZATIONS.size()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java index 47292d27e2..9652f05643 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java @@ -17,23 +17,29 @@ package org.apache.tika.parser.pkg; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import java.io.ByteArrayOutputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + import org.apache.commons.codec.binary.Base64; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; +import org.junit.jupiter.api.io.TempDir; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.Zip; +import org.apache.tika.parser.ParseContext; /** * Test case for parsing zip files. 
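For readers unfamiliar with the registry calls used in testZipSpecializations above, the check reduces to this loop (the same API calls as the test; printing instead of asserting, which is handy for regenerating the hard-coded list in ZipParser):

    MediaTypeRegistry registry = TikaLoader.getMediaTypeRegistry();
    for (MediaType type : registry.getTypes()) {
        if (registry.isSpecializationOf(type, MediaType.APPLICATION_ZIP)) {
            System.out.println("\"" + type.toString() + "\",");
        }
    }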
@@ -46,40 +52,32 @@ public class ZipParserTest extends AbstractPkgTest { */ @Test public void testEmbedded() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip"); - try (TikaInputStream tis = getResourceAsStream("/test-documents/test-documents.zip")) { - AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext); - } + // First metadata is the container, rest are embedded documents + // With recursive parsing, we get more than 10 entries due to nested documents + // (e.g., ODT, PPT, DOC contain embedded resources) + assertTrue(metadataList.size() >= 10, "Expected at least 10 metadata entries"); - // Should have found all 9 documents - assertEquals(9, tracker.filenames.size()); - assertEquals(9, tracker.mediatypes.size()); - assertEquals(9, tracker.modifiedAts.size()); - - // Should have names and modified dates, but not content types, - // as zip doesn't store the content types - assertEquals("testEXCEL.xls", tracker.filenames.get(0)); - assertEquals("testHTML.html", tracker.filenames.get(1)); - assertEquals("testOpenOffice2.odt", tracker.filenames.get(2)); - assertEquals("testPDF.pdf", tracker.filenames.get(3)); - assertEquals("testPPT.ppt", tracker.filenames.get(4)); - assertEquals("testRTF.rtf", tracker.filenames.get(5)); - assertEquals("testTXT.txt", tracker.filenames.get(6)); - assertEquals("testWORD.doc", tracker.filenames.get(7)); - assertEquals("testXML.xml", tracker.filenames.get(8)); - - for (String type : tracker.mediatypes) { - assertNull(type); - } - for (String crt : tracker.createdAts) { - assertNull(crt); - } - for (String mod : tracker.modifiedAts) { - assertNotNull(mod); - assertTrue(mod.startsWith("20"), "Modified at " + mod); + // Collect all resource names for verification + List<String> resourceNames = new java.util.ArrayList<>(); + for (Metadata m : metadataList) { + String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (name != null) { + resourceNames.add(name); + } } + + // Should contain all 9 direct embedded files from the ZIP + assertContains("testEXCEL.xls", resourceNames); + assertContains("testHTML.html", resourceNames); + assertContains("testOpenOffice2.odt", resourceNames); + assertContains("testPDF.pdf", resourceNames); + assertContains("testPPT.ppt", resourceNames); + assertContains("testRTF.rtf", resourceNames); + assertContains("testTXT.txt", resourceNames); + assertContains("testWORD.doc", resourceNames); + assertContains("testXML.xml", resourceNames); } /** @@ -98,19 +96,50 @@ public class ZipParserTest extends AbstractPkgTest { @Test // TIKA-936 public void testCustomEncoding() throws Exception { - ArchiveStreamFactory factory = new ArchiveStreamFactory("SJIS"); - trackingContext.set(ArchiveStreamFactory.class, factory); + ZipParserConfig config = new ZipParserConfig(); + config.setEntryEncoding(Charset.forName("SJIS")); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList; + try (TikaInputStream tis = TikaInputStream.get(Base64.decodeBase64( + "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { + metadataList = getRecursiveMetadata(tis, new Metadata(), context, false); + } + // Container + 1 embedded document + assertEquals(2, 
metadataList.size()); + assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", + metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + } + + @Test + public void testCharsetAutoDetectionDisabled() throws Exception { + // Test that disabling charset detection leaves non-UTF8 names as-is (garbled) + ZipParserConfig config = new ZipParserConfig(); + config.setDetectCharsetsInEntryNames(false); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList; try (TikaInputStream tis = TikaInputStream.get(Base64.decodeBase64( "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { - AUTO_DETECT_PARSER.parse(tis, new DefaultHandler(), new Metadata(), trackingContext); + metadataList = getRecursiveMetadata(tis, new Metadata(), context, false); } - assertEquals(1, tracker.filenames.size()); - assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", tracker.filenames.get(0)); + // Container + 1 embedded document + assertEquals(2, metadataList.size()); + String name = metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY); + // With detection disabled, the SJIS bytes are interpreted as default charset (garbled) + // The correct Japanese name is 日本語メモ.txt - verify we DON'T get that + assertTrue(!"\u65E5\u672C\u8A9E\u30E1\u30E2.txt".equals(name), + "With detection disabled, SJIS name should NOT be correctly decoded"); } @Test @@ -138,23 +167,334 @@ public class ZipParserTest extends AbstractPkgTest { @Test public void testZipUsingStoredWithDataDescriptor() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - try (TikaInputStream tis = getResourceAsStream( - "/test-documents/testZip_with_DataDescriptor.zip")) { - AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext); - - assertEquals(5, tracker.filenames.size()); - assertEquals("en0", tracker.filenames.get(0)); - assertEquals("en1", tracker.filenames.get(1)); - assertEquals("en2", tracker.filenames.get(2)); - assertEquals("en3", tracker.filenames.get(3)); - assertEquals("en4", tracker.filenames.get(4)); - assertEquals(1, tracker.lastSeenStart[0]); - assertEquals(2, tracker.lastSeenStart[1]); - assertEquals(3, tracker.lastSeenStart[2]); - assertEquals(4, tracker.lastSeenStart[3]); + List<Metadata> metadataList = getRecursiveMetadata("testZip_with_DataDescriptor.zip"); + + // Container + 5 embedded documents + assertEquals(6, metadataList.size()); + assertEquals("en0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en1", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en2", metadataList.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en3", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en4", metadataList.get(5).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + + // This ZIP with DATA_DESCRIPTOR is salvaged and parsed with file-based access + // Integrity check can compare central directory vs local headers + Metadata containerMetadata = metadataList.get(0); + assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + } + + @Test + public void testIntegrityCheckPass() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip"); + + // Normal ZIP with file-based access should 
pass integrity check + Metadata containerMetadata = metadataList.get(0); + assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + assertNull(containerMetadata.get(Zip.DUPLICATE_ENTRY_NAMES)); + assertNull(containerMetadata.get(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES)); + assertNull(containerMetadata.get(Zip.LOCAL_HEADER_ONLY_ENTRIES)); + } + + @Test + public void testIntegrityCheckDisabled() throws Exception { + ZipParserConfig config = new ZipParserConfig(); + config.setIntegrityCheck(false); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip", context); + + // Integrity check disabled - no result should be set + Metadata containerMetadata = metadataList.get(0); + assertNull(containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + } + + @Test + public void testIntegrityCheckHiddenEntry(@TempDir Path tempDir) throws Exception { + // Create a ZIP with a hidden entry (in local headers but not central directory) + Path zipPath = tempDir.resolve("hidden-entry.zip"); + byte[] zipBytes = createZipWithHiddenEntry(); + Files.write(zipPath, zipBytes); + + List<Metadata> metadataList = getRecursiveMetadata(zipPath, false); + + Metadata containerMetadata = metadataList.get(0); + assertEquals("FAIL", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + String[] localOnly = containerMetadata.getValues(Zip.LOCAL_HEADER_ONLY_ENTRIES); + assertEquals(1, localOnly.length); + assertEquals("hidden.txt", localOnly[0]); + } + + /** + * Creates a ZIP file with an entry that exists in local headers but not in the + * central directory. This simulates a hidden/smuggled entry attack. + */ + private byte[] createZipWithHiddenEntry() throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + // Entry 1: visible.txt (will be in both local header and central directory) + byte[] visible = "visible content".getBytes(StandardCharsets.UTF_8); + // Entry 2: hidden.txt (will be in local header ONLY - not in central directory) + byte[] hidden = "hidden content".getBytes(StandardCharsets.UTF_8); + + // Local file header for visible.txt + int visibleLocalOffset = baos.size(); + writeLocalFileHeader(baos, "visible.txt", visible); + + // Local file header for hidden.txt (this won't have a central directory entry) + writeLocalFileHeader(baos, "hidden.txt", hidden); + + // Central directory - only includes visible.txt + int centralDirOffset = baos.size(); + writeCentralDirectoryEntry(baos, "visible.txt", visible, visibleLocalOffset); + + // End of central directory + int centralDirSize = baos.size() - centralDirOffset; + writeEndOfCentralDirectory(baos, 1, centralDirSize, centralDirOffset); + + return baos.toByteArray(); + } + + private void writeLocalFileHeader(ByteArrayOutputStream baos, String name, byte[] content) + throws Exception { + byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + + // Local file header signature + writeInt(baos, 0x04034b50); + // Version needed + writeShort(baos, 10); + // General purpose bit flag + writeShort(baos, 0); + // Compression method (0 = stored) + writeShort(baos, 0); + // Last mod time/date + writeShort(baos, 0); + writeShort(baos, 0); + // CRC-32 + writeInt(baos, (int) computeCrc32(content)); + // Compressed size + writeInt(baos, content.length); + // Uncompressed size + writeInt(baos, content.length); + // File name length + writeShort(baos, nameBytes.length); + // Extra field length + writeShort(baos, 0); + // File name + 
baos.write(nameBytes); + // File data + baos.write(content); + } + + private void writeCentralDirectoryEntry(ByteArrayOutputStream baos, String name, + byte[] content, int localHeaderOffset) throws Exception { + byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + + // Central directory file header signature + writeInt(baos, 0x02014b50); + // Version made by + writeShort(baos, 20); + // Version needed + writeShort(baos, 10); + // General purpose bit flag + writeShort(baos, 0); + // Compression method + writeShort(baos, 0); + // Last mod time/date + writeShort(baos, 0); + writeShort(baos, 0); + // CRC-32 + writeInt(baos, (int) computeCrc32(content)); + // Compressed size + writeInt(baos, content.length); + // Uncompressed size + writeInt(baos, content.length); + // File name length + writeShort(baos, nameBytes.length); + // Extra field length + writeShort(baos, 0); + // File comment length + writeShort(baos, 0); + // Disk number start + writeShort(baos, 0); + // Internal file attributes + writeShort(baos, 0); + // External file attributes + writeInt(baos, 0); + // Relative offset of local header + writeInt(baos, localHeaderOffset); + // File name + baos.write(nameBytes); + } + + private void writeEndOfCentralDirectory(ByteArrayOutputStream baos, int numEntries, + int centralDirSize, int centralDirOffset) { + // End of central directory signature + writeInt(baos, 0x06054b50); + // Disk number + writeShort(baos, 0); + // Disk number with central directory + writeShort(baos, 0); + // Number of entries on this disk + writeShort(baos, numEntries); + // Total number of entries + writeShort(baos, numEntries); + // Size of central directory + writeInt(baos, centralDirSize); + // Offset of central directory + writeInt(baos, centralDirOffset); + // Comment length + writeShort(baos, 0); + } + + private void writeInt(ByteArrayOutputStream baos, int value) { + baos.write(value & 0xff); + baos.write((value >> 8) & 0xff); + baos.write((value >> 16) & 0xff); + baos.write((value >> 24) & 0xff); + } + + private void writeShort(ByteArrayOutputStream baos, int value) { + baos.write(value & 0xff); + baos.write((value >> 8) & 0xff); + } + + private long computeCrc32(byte[] data) { + java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(data); + return crc.getValue(); + } + + /** + * Microbenchmark to measure the performance impact of integrity checking. + * This test is disabled by default - remove the assumeTrue to run it. + * + * WARNING: The large ZIP test creates a multi-GB file and takes significant time. 
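The little-endian helpers above are enough to assemble a minimal well-formed archive. A sketch that round-trips a single entry through java.util.zip to sanity-check the record layout (uses the write* helpers defined above):

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] content = "hello".getBytes(StandardCharsets.UTF_8);
    writeLocalFileHeader(baos, "hello.txt", content);              // local header at offset 0
    int cdOffset = baos.size();
    writeCentralDirectoryEntry(baos, "hello.txt", content, 0);
    writeEndOfCentralDirectory(baos, 1, baos.size() - cdOffset, cdOffset);

    Path zip = Files.createTempFile("roundtrip", ".zip");
    Files.write(zip, baos.toByteArray());
    try (java.util.zip.ZipFile zf = new java.util.zip.ZipFile(zip.toFile())) {
        assertEquals("hello.txt", zf.entries().nextElement().getName());
    }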
+ */ + @Test + public void benchmarkIntegrityCheck(@TempDir Path tempDir) throws Exception { + // Skip by default - set this to true to run the benchmark + assumeTrue(false, "Benchmark disabled by default - set to true to run"); + + int iterations = 20; + int warmupIterations = 3; + + // Create small ZIP (10 entries, ~1KB each) - ~10KB total + Path smallZip = tempDir.resolve("small.zip"); + System.out.println("Creating small ZIP (10 entries, ~10KB)..."); + createBenchmarkZip(smallZip, 10, 1024); + System.out.println(" Created: " + Files.size(smallZip) / 1024 + " KB"); + + // Create medium ZIP (1000 entries, ~100KB each) - ~100MB total + Path mediumZip = tempDir.resolve("medium.zip"); + System.out.println("Creating medium ZIP (1000 entries, ~100MB)..."); + createBenchmarkZip(mediumZip, 1000, 100 * 1024); + System.out.println(" Created: " + Files.size(mediumZip) / (1024 * 1024) + " MB"); + + // Create large ZIP (5000 entries, ~500KB each) - ~2.5GB total + Path largeZip = tempDir.resolve("large.zip"); + System.out.println("Creating large ZIP (5000 entries, ~2.5GB)..."); + createBenchmarkZip(largeZip, 5000, 500 * 1024); + System.out.println(" Created: " + Files.size(largeZip) / (1024 * 1024) + " MB"); + + System.out.println(); + System.out.println("=== Integrity Check Benchmark ==="); + System.out.println("Iterations: " + iterations + " (warmup: " + warmupIterations + ")"); + System.out.println(); + + // Benchmark small ZIP + System.out.println("Small ZIP (10 entries, ~10KB):"); + runBenchmark(smallZip, iterations, warmupIterations); + + System.out.println(); + + // Benchmark medium ZIP + System.out.println("Medium ZIP (1000 entries, ~100MB):"); + runBenchmark(mediumZip, 10, 2); + + System.out.println(); + + // Benchmark large ZIP + System.out.println("Large ZIP (5000 entries, ~2.5GB):"); + runBenchmark(largeZip, 5, 1); + } + + private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception { + try (java.util.zip.ZipOutputStream zos = + new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) { + // Use STORED to avoid compression - we want actual file size + zos.setMethod(java.util.zip.ZipOutputStream.STORED); + + // Use random data to prevent any accidental compression + java.util.Random random = new java.util.Random(42); + byte[] content = new byte[entrySize]; + random.nextBytes(content); + + for (int i = 0; i < numEntries; i++) { + java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt"); + entry.setMethod(java.util.zip.ZipEntry.STORED); + entry.setSize(content.length); + entry.setCompressedSize(content.length); + entry.setCrc(computeCrc32(content)); + zos.putNextEntry(entry); + zos.write(content); + zos.closeEntry(); + } + } + } + + private void runBenchmark(Path zipPath, int iterations, int warmupIterations) throws Exception { + ZipParser parser = new ZipParser(); + + // Config with integrity check enabled + ZipParserConfig configWithCheck = new ZipParserConfig(); + configWithCheck.setIntegrityCheck(true); + + // Config with integrity check disabled + ZipParserConfig configWithoutCheck = new ZipParserConfig(); + configWithoutCheck.setIntegrityCheck(false); + + // Warmup - with integrity check + for (int i = 0; i < warmupIterations; i++) { + parseZip(parser, zipPath, configWithCheck); + } + + // Warmup - without integrity check + for (int i = 0; i < warmupIterations; i++) { + parseZip(parser, zipPath, configWithoutCheck); + } + + // Benchmark with integrity check + long startWithCheck = System.nanoTime(); + for (int i = 0; i < 
iterations; i++) { + parseZip(parser, zipPath, configWithCheck); + } + long durationWithCheck = System.nanoTime() - startWithCheck; + + // Benchmark without integrity check + long startWithoutCheck = System.nanoTime(); + for (int i = 0; i < iterations; i++) { + parseZip(parser, zipPath, configWithoutCheck); + } + long durationWithoutCheck = System.nanoTime() - startWithoutCheck; + + double avgWithCheck = durationWithCheck / (double) iterations / 1_000_000.0; + double avgWithoutCheck = durationWithoutCheck / (double) iterations / 1_000_000.0; + double overhead = avgWithCheck - avgWithoutCheck; + double overheadPercent = (overhead / avgWithoutCheck) * 100; + + System.out.printf(" Without integrity check: %.3f ms/parse%n", avgWithoutCheck); + System.out.printf(" With integrity check: %.3f ms/parse%n", avgWithCheck); + System.out.printf(" Overhead: %.3f ms (%.1f%%)%n", overhead, overheadPercent); + } + + private void parseZip(ZipParser parser, Path zipPath, ZipParserConfig config) throws Exception { + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + try (TikaInputStream tis = TikaInputStream.get(zipPath)) { + Metadata metadata = new Metadata(); + parser.parse(tis, new org.xml.sax.helpers.DefaultHandler(), metadata, context); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 23464be288..f23063409f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -42,8 +42,10 @@ import org.apache.tika.detect.DetectHelper; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Zip; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; +import org.apache.tika.zip.utils.ZipSalvager; /** * This class is designed to detect subtypes of zip-based file formats. @@ -198,48 +200,52 @@ public class DefaultZipContainerDetector implements Detector { * This will call TikaInputStream's getFile(). If there are no exceptions, * it will place the ZipFile in TikaInputStream's openContainer and leave it * open. 
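The hand-off described in this javadoc is exactly what the new detector-hint properties formalize. A downstream parser consumes them like so (a sketch mirroring the dispatch in ZipParser.parse; ZipFile here is the commons-compress class):

    if (tis.getOpenContainer() instanceof ZipFile) {
        ZipFile zip = (ZipFile) tis.getOpenContainer();  // reuse, never reopen
        // ... file-based parsing ...
    } else if ("false".equals(metadata.get(Zip.DETECTOR_ZIPFILE_OPENED))) {
        boolean needDataDescriptor =
                "true".equals(metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED));
        // ... go straight to streaming, with data-descriptor support if hinted ...
    }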
+ * <p> + * Sets detector hints in metadata for the parser: + * <ul> + * <li>{@link Zip#DETECTOR_ZIPFILE_OPENED} - true if ZipFile opened successfully</li> + * <li>{@link Zip#DETECTOR_DATA_DESCRIPTOR_REQUIRED} - true if streaming needed data descriptor support</li> + * </ul> * - * @param tis - * @return + * @param tis the TikaInputStream + * @param metadata the metadata (will be updated with detector hints) + * @param parseContext the parse context + * @return the detected media type */ private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata, ParseContext parseContext) { - ZipFile zip = null; - try { - zip = ZipFile.builder().setFile(tis.getFile()).get(); - - for (ZipContainerDetector zipDetector : getDetectors()) { - MediaType type = zipDetector.detect(zip, tis); - if (type != null) { - if (LOG.isDebugEnabled()) { - LOG.debug("{} detected {}", zipDetector.getClass(), - type.toString()); - } - //e.g. if OPCPackage has already been set - //don't overwrite it with the zip - if (tis.getOpenContainer() == null) { - tis.setOpenContainer(zip); + // Try to open ZipFile (with salvaging fallback) + ZipFile zip = ZipSalvager.tryToOpenZipFile(tis, metadata); + + if (zip != null) { + // ZipFile opened (directly or via salvaging) - run file-based detection + try { + for (ZipContainerDetector zipDetector : getDetectors()) { + MediaType type = zipDetector.detect(zip, tis); + if (type != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("{} detected {}", zipDetector.getClass(), type.toString()); + } + return type; } else { - tis.addCloseableResource(zip); - } - return type; - } else { - if (LOG.isDebugEnabled()) { - LOG.debug("{} detected null", zipDetector.getClass()); + if (LOG.isDebugEnabled()) { + LOG.debug("{} detected null", zipDetector.getClass()); + } } } + } catch (IOException e) { + // Detection failed - fall through to return plain ZIP + if (LOG.isDebugEnabled()) { + LOG.debug("Detection failed on opened ZipFile", e); + } } - } catch (IOException e) { - //do nothing - } - // Fallback: it's still a zip file, we just don't know what kind of one - if (zip != null) { - IOUtils.closeQuietly(zip); + // No specific type detected - it's a plain ZIP return MediaType.APPLICATION_ZIP; } + + // ZipFile failed to open even after salvaging - fall back to streaming detection if (LOG.isDebugEnabled()) { - LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise"); + LOG.debug("ZipFile and salvaging both failed; falling back to streaming detection"); } - //problem opening zip file (truncated?) 
try { return detectStreamingFromPath(tis.getPath(), metadata, false); } catch (IOException e) { @@ -265,6 +271,8 @@ public class DefaultZipContainerDetector implements Detector { } catch (UnsupportedZipFeatureException zfe) { if (allowStoredEntries == false && zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + // Set hint for parser that DATA_DESCRIPTOR support is required + metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true); input.reset(); return detectStreaming(input, metadata, true); } @@ -295,6 +303,8 @@ public class DefaultZipContainerDetector implements Detector { } catch (UnsupportedZipFeatureException zfe) { if (allowStoredEntries == false && zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + // Set hint for parser that DATA_DESCRIPTOR support is required + metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true); return detectStreamingFromPath(p, metadata, true); } } catch (SecurityException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java index 52391bbf8c..1a5542895a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java @@ -17,81 +17,195 @@ package org.apache.tika.zip.utils; import java.io.EOFException; -import java.io.File; import java.io.IOException; -import java.io.InputStream; +import java.nio.charset.Charset; import java.nio.file.Files; +import java.nio.file.Path; import java.util.zip.ZipException; import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Zip; public class ZipSalvager { private static final Logger LOG = LoggerFactory.getLogger(ZipSalvager.class); /** - * This streams the broken zip and rebuilds a new zip that - * is at least a valid zip file. The contents of the final stream - * may be truncated, but the result should be a valid zip file. + * Tries to open a ZipFile from the TikaInputStream. If direct opening fails, + * attempts to salvage the ZIP and open the salvaged version. * <p> - * This does nothing fancy to fix the underlying broken zip. 
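The salvager's contract is deliberately modest, as the javadoc says: it rebuilds a structurally valid zip, possibly with truncated contents. In use it is a one-call repair, shown here with the Path-based convenience overload added at the end of this patch (file names illustrative):

    Path broken = Paths.get("truncated.zip");
    Path repaired = Files.createTempFile("salvaged", ".zip");
    ZipSalvager.salvageCopy(broken, repaired);  // streams local headers, writes a valid zip
    try (ZipFile zf = ZipFile.builder().setPath(repaired).get()) {
        zf.getEntries().asIterator().forEachRemaining(e -> System.out.println(e.getName()));
    }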
- * <p> - * This will close the inputstream + * On success: + * <ul> + * <li>Sets {@link Zip#DETECTOR_ZIPFILE_OPENED} to true in metadata</li> + * <li>Stores the ZipFile in tis.openContainer (if not already set)</li> + * <li>Returns the opened ZipFile</li> + * </ul> + * On failure: + * <ul> + * <li>Sets {@link Zip#DETECTOR_ZIPFILE_OPENED} to false in metadata</li> + * <li>Returns null</li> + * </ul> * - * @param brokenZip - * @param salvagedZip - * @param allowStoredEntries - * @throws java.io.IOException + * @param tis the TikaInputStream (must be file-backed) + * @param metadata the metadata to update with hints + * @param charset optional charset for entry names (may be null) + * @return the opened ZipFile, or null if opening and salvaging both failed */ - public static void salvageCopy(InputStream brokenZip, File salvagedZip, - boolean allowStoredEntries) throws IOException { + public static ZipFile tryToOpenZipFile(TikaInputStream tis, Metadata metadata, Charset charset) { + // First, try direct open + try { + ZipFile.Builder builder = new ZipFile.Builder().setFile(tis.getFile()); + if (charset != null) { + builder.setCharset(charset); + } + ZipFile zipFile = builder.get(); + + // Direct open succeeded + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, true); + if (tis.getOpenContainer() == null) { + tis.setOpenContainer(zipFile); + } else { + tis.addCloseableResource(zipFile); + } + return zipFile; + } catch (IOException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("ZipFile failed to open directly; attempting to salvage", e); + } + } - TikaInputStream tis = TikaInputStream.get(brokenZip); - // Enable rewind capability for retry on DATA_DESCRIPTOR feature - tis.enableRewind(); + // Direct open failed - try salvaging try { - try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip); - ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream( - CloseShieldInputStream.wrap(tis), "UTF8", false, - allowStoredEntries)) { - ZipArchiveEntry zae = zipArchiveInputStream.getNextEntry(); + final Path salvagedPath = Files.createTempFile("tika-salvaged-", ".zip"); + tis.enableRewind(); + salvageCopy(tis, salvagedPath, false); + tis.rewind(); + + ZipFile.Builder builder = new ZipFile.Builder().setPath(salvagedPath); + if (charset != null) { + builder.setCharset(charset); + } + ZipFile salvagedZip = builder.get(); + + // Salvaging succeeded + if (LOG.isDebugEnabled()) { + LOG.debug("Successfully salvaged ZIP to {}", salvagedPath); + } + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, true); + metadata.set(Zip.SALVAGED, true); + + // Add file deletion FIRST so it runs AFTER ZipFile is closed + // (TemporaryResources uses LIFO order) + tis.addCloseableResource(() -> { try { - processZAE(zae, zipArchiveInputStream, outputStream); - } catch (UnsupportedZipFeatureException uzfe) { - if (uzfe.getFeature() == - UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { - //percolate up to allow for retry - throw uzfe; - } - //else swallow - } catch (ZipException | EOFException e) { - //swallow + Files.deleteIfExists(salvagedPath); + } catch (IOException e) { + LOG.warn("Failed to delete salvaged temp file: {}", salvagedPath, e); + salvagedPath.toFile().deleteOnExit(); } - outputStream.flush(); - outputStream.finish(); - } catch (UnsupportedZipFeatureException e) { - //now retry - if (allowStoredEntries == false && - e.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { - tis.rewind(); - salvageCopy(tis, salvagedZip, true); - } else { - throw e; + }); + + // 
Then add ZipFile (will be closed before file deletion runs) + if (tis.getOpenContainer() == null) { + tis.setOpenContainer(salvagedZip); + } else { + tis.addCloseableResource(salvagedZip); + } + return salvagedZip; + } catch (IOException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("Salvaging failed", e); + } + } + + // Both direct open and salvaging failed + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, false); + return null; + } + + /** + * Tries to open a ZipFile from the TikaInputStream using default charset. + * + * @see #tryToOpenZipFile(TikaInputStream, Metadata, Charset) + */ + public static ZipFile tryToOpenZipFile(TikaInputStream tis, Metadata metadata) { + return tryToOpenZipFile(tis, metadata, null); + } + + /** + * Streams the broken zip and rebuilds a new zip that is at least a valid zip file. + * The contents of the final stream may be truncated, but the result should be a valid zip file. + * <p> + * This does nothing fancy to fix the underlying broken zip. + * <p> + * This method does NOT close the TikaInputStream - the caller owns it. + * The caller should call {@code tis.enableRewind()} before calling this method + * if retry on DATA_DESCRIPTOR is needed. + * + * @param tis the TikaInputStream to read from (not closed by this method) + * @param salvagedZip the output path for the salvaged ZIP + * @param allowStoredEntries whether to allow stored entries with data descriptors + * @throws IOException if salvaging fails + */ + public static void salvageCopy(TikaInputStream tis, Path salvagedZip, + boolean allowStoredEntries) throws IOException { + try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(Files.newOutputStream(salvagedZip)); + ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream( + CloseShieldInputStream.wrap(tis), "UTF8", false, + allowStoredEntries)) { + ZipArchiveEntry zae = zipArchiveInputStream.getNextEntry(); + try { + processZAE(zae, zipArchiveInputStream, outputStream); + } catch (UnsupportedZipFeatureException uzfe) { + if (uzfe.getFeature() == + UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + //percolate up to allow for retry + throw uzfe; } - } catch (IOException e) { - LOG.warn("problem fixing zip", e); + //else swallow + } catch (ZipException | EOFException e) { + //swallow } - } finally { - tis.close(); + outputStream.flush(); + outputStream.finish(); + } catch (UnsupportedZipFeatureException e) { + //now retry with data descriptor support + if (!allowStoredEntries && + e.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + tis.rewind(); + salvageCopy(tis, salvagedZip, true); + } else { + throw e; + } + } catch (IOException e) { + LOG.warn("problem fixing zip", e); + } + } + + /** + * Streams a broken zip from a Path and rebuilds a valid zip file. + * <p> + * This is a convenience method that creates a TikaInputStream internally. 
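One detail worth underlining from tryToOpenZipFile above: temp-file cleanup depends on TikaInputStream releasing its closeable resources in LIFO order, so the registration order is load-bearing (sketch of the addCloseableResource branch):

    // Registered first => runs last: delete the salvaged temp file.
    tis.addCloseableResource(() -> Files.deleteIfExists(salvagedPath));
    // Registered second => runs first: close the ZipFile, releasing its
    // handle on salvagedPath so the deletion that follows can succeed.
    tis.addCloseableResource(salvagedZip);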
+ * + * @param brokenZip the path to the broken ZIP file + * @param salvagedZip the path for the salvaged ZIP output + * @throws IOException if salvaging fails + */ + public static void salvageCopy(Path brokenZip, Path salvagedZip) throws IOException { + try (TikaInputStream tis = TikaInputStream.get(brokenZip)) { + tis.enableRewind(); + salvageCopy(tis, salvagedZip, false); } } @@ -122,10 +236,4 @@ public class ZipSalvager { zae = zipArchiveInputStream.getNextEntry(); } } - - public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException { - try (InputStream is = Files.newInputStream(brokenZip.toPath())) { - salvageCopy(is, salvagedZip, false); - } - } }
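Putting the patch together end to end, for an archive that may be corrupt (file name illustrative):

    try (TikaInputStream tis = TikaInputStream.get(Paths.get("maybe-corrupt.zip"))) {
        Metadata metadata = new Metadata();
        // With no prior detection, ZipParser.parse() calls
        // ZipSalvager.tryToOpenZipFile() itself: direct open, then salvage,
        // then streaming as the last resort.
        new ZipParser().parse(tis, new org.xml.sax.helpers.DefaultHandler(),
                metadata, new ParseContext());
        System.out.println("salvaged=" + metadata.get(Zip.SALVAGED)
                + " integrity=" + metadata.get(Zip.INTEGRITY_CHECK_RESULT));
    }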