This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4650-refactor-zip-parser
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 79a636ed65b8b2f4b55052ca088f2971570fec11 Author: tallison <[email protected]> AuthorDate: Thu Feb 5 06:54:19 2026 -0500 improve zip detection + parsing, WIP --- .../main/java/org/apache/tika/metadata/Zip.java | 130 ++++ .../microsoft/ooxml/OOXMLExtractorFactory.java | 10 +- .../org/apache/tika/parser/epub/EpubParser.java | 2 +- .../tika/parser/pkg/AbstractArchiveParser.java | 84 +++ .../org/apache/tika/parser/pkg/PackageParser.java | 456 ++------------ .../java/org/apache/tika/parser/pkg/RarParser.java | 6 +- .../org/apache/tika/parser/pkg/SevenZParser.java | 164 +++++ .../java/org/apache/tika/parser/pkg/ZipParser.java | 690 +++++++++++++++++++++ .../apache/tika/parser/pkg/ZipParserConfig.java | 105 ++++ .../apache/tika/parser/pkg/PackageParserTest.java | 45 +- .../org/apache/tika/parser/pkg/ZipParserTest.java | 456 ++++++++++++-- .../detect/zip/DefaultZipContainerDetector.java | 72 ++- .../org/apache/tika/zip/utils/ZipSalvager.java | 212 +++++-- 13 files changed, 1858 insertions(+), 574 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Zip.java b/tika-core/src/main/java/org/apache/tika/metadata/Zip.java new file mode 100644 index 0000000000..4715a85383 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/Zip.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * ZIP file properties collection. + * + * @since Apache Tika 4.0 + */ +public interface Zip { + + String ZIP_PREFIX = "zip" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + + // ==================== Detector Hints ==================== + // These are set by the detector to communicate state to the parser. + // The detector MUST always set these when detecting a ZIP file, + // overwriting any user-provided values. + + /** + * Set by the detector to indicate whether it successfully opened the ZIP as a ZipFile. + * If true, the ZipFile is available via TikaInputStream.getOpenContainer(). + * If false, ZipFile failed to open (truncated, corrupt, etc.) and parser should use streaming. + */ + Property DETECTOR_ZIPFILE_OPENED = + Property.internalBoolean(ZIP_PREFIX + "detectorZipFileOpened"); + + /** + * Set by the detector to indicate whether streaming required DATA_DESCRIPTOR support. + * If true, parser should start streaming with allowStoredEntriesWithDataDescriptor=true. + */ + Property DETECTOR_DATA_DESCRIPTOR_REQUIRED = + Property.internalBoolean(ZIP_PREFIX + "detectorDataDescriptorRequired"); + + /** + * Set to true if the ZIP file was salvaged (rebuilt from a corrupt/truncated original). + * This indicates that the ZIP could not be opened directly and was repaired by + * streaming through the local headers and reconstructing a valid ZIP structure. 
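The detector-to-parser handshake documented above is easiest to see from the consuming side. A minimal sketch, assuming only the property names defined in this interface (the helper class itself is hypothetical, not part of this commit; it mirrors the checks ZipParser.parse makes later in this diff):

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.Zip;

    public class DetectorHintConsumer {
        // "false" is authoritative: the detector always overwrites these
        // hints, so ZipFile already failed to open once and the parser
        // should go straight to streaming.
        static boolean shouldStreamDirectly(Metadata metadata) {
            return "false".equals(metadata.get(Zip.DETECTOR_ZIPFILE_OPENED));
        }

        static boolean needsDataDescriptorSupport(Metadata metadata) {
            return "true".equals(metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED));
        }
    }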
+ */ + Property SALVAGED = Property.internalBoolean(ZIP_PREFIX + "salvaged"); + + // ==================== Entry Metadata ==================== + // These are set on embedded document metadata for each ZIP entry. + + /** + * Comment associated with a ZIP entry. + */ + Property COMMENT = Property.externalText(ZIP_PREFIX + "comment"); + + /** + * Compression method used for the entry (0=stored, 8=deflated, etc.). + */ + Property COMPRESSION_METHOD = Property.externalInteger(ZIP_PREFIX + "compressionMethod"); + + /** + * Compressed size of the entry in bytes. + */ + Property COMPRESSED_SIZE = Property.externalText(ZIP_PREFIX + "compressedSize"); + + /** + * Uncompressed size of the entry in bytes. + */ + Property UNCOMPRESSED_SIZE = Property.externalText(ZIP_PREFIX + "uncompressedSize"); + + /** + * CRC-32 checksum of the uncompressed entry data. + */ + Property CRC32 = Property.externalText(ZIP_PREFIX + "crc32"); + + /** + * Unix file mode/permissions for the entry. + */ + Property UNIX_MODE = Property.externalInteger(ZIP_PREFIX + "unixMode"); + + /** + * Platform that created the entry (0=MS-DOS, 3=Unix, etc.). + */ + Property PLATFORM = Property.externalInteger(ZIP_PREFIX + "platform"); + + /** + * Version of ZIP specification used to create the entry. + */ + Property VERSION_MADE_BY = Property.externalInteger(ZIP_PREFIX + "versionMadeBy"); + + /** + * Whether the entry is encrypted. + */ + Property ENCRYPTED = Property.externalBoolean(ZIP_PREFIX + "encrypted"); + + // ==================== Integrity Check Results ==================== + // These are set on the parent document metadata after integrity checking. + + /** + * Result of the integrity check comparing central directory to local headers. + * Values: "PASS" (no issues), "FAIL" (issues found), "PARTIAL" (only duplicate check done). + */ + Property INTEGRITY_CHECK_RESULT = Property.internalText(ZIP_PREFIX + "integrityCheckResult"); + + /** + * Entry names that appear multiple times in the local headers (streaming). + * Duplicate entries are a potential attack vector. + */ + Property DUPLICATE_ENTRY_NAMES = Property.internalTextBag(ZIP_PREFIX + "duplicateEntryNames"); + + /** + * Entry names that exist in central directory but not in local headers. + */ + Property CENTRAL_DIRECTORY_ONLY_ENTRIES = + Property.internalTextBag(ZIP_PREFIX + "centralDirectoryOnlyEntries"); + + /** + * Entry names that exist in local headers but not in central directory. + * These are "hidden" entries that some tools won't see. 
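For context, a hedged sketch of how downstream code might interpret these integrity results. Only the property names and the documented "PASS"/"FAIL"/"PARTIAL" values come from this interface; the reporting code is illustrative:

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.Zip;

    public class IntegrityReport {
        static void report(Metadata metadata) {
            String result = metadata.get(Zip.INTEGRITY_CHECK_RESULT);
            if ("FAIL".equals(result)) {
                // Duplicates and local-header-only ("hidden") entries are the
                // usual smuggling vectors to inspect first.
                for (String name : metadata.getValues(Zip.DUPLICATE_ENTRY_NAMES)) {
                    System.err.println("duplicate entry: " + name);
                }
                for (String name : metadata.getValues(Zip.LOCAL_HEADER_ONLY_ENTRIES)) {
                    System.err.println("hidden entry: " + name);
                }
            } else if ("PARTIAL".equals(result)) {
                System.err.println("streamed only; central directory not compared");
            }
        }
    }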
+ */ + Property LOCAL_HEADER_ONLY_ENTRIES = + Property.internalTextBag(ZIP_PREFIX + "localHeaderOnlyEntries"); +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index ebe10300cb..0d6ff8ec2b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.util.Locale; import org.apache.poi.extractor.ExtractorFactory; @@ -101,10 +102,13 @@ public class OOXMLExtractorFactory { pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage(); } else { try { - pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); + pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ); } catch (InvalidOperationException e) { - tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile(); - ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy); + Path tmpRepairedPath = Files.createTempFile("tika-ooxml-repair-", ""); + tmpRepairedCopy = tmpRepairedPath.toFile(); + tis.enableRewind(); + ZipSalvager.salvageCopy(tis, tmpRepairedPath, false); + tis.rewind(); pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); } tis.setOpenContainer(new OPCPackageWrapper(pkg)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index fb8fbce040..9779008017 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -171,7 +171,7 @@ public class EpubParser implements Parser { try (TemporaryResources resources = new TemporaryResources()) { Path salvaged = resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString())); - ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile()); + ZipSalvager.salvageCopy(brokenZip, salvaged); try (ZipFile zipFile = ZipFile.builder().setFile(salvaged.toFile()).get()) { return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false); } catch (EpubZipException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java new file mode 100644 index 0000000000..a599c91cd4 --- /dev/null +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/AbstractArchiveParser.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.IOException; +import java.util.Date; + +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AbstractEncodingDetectorParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Abstract base class for archive parsers that provides common functionality + * for handling embedded documents within archives. + */ +public abstract class AbstractArchiveParser extends AbstractEncodingDetectorParser { + + public AbstractArchiveParser() { + super(); + } + + public AbstractArchiveParser(EncodingDetector encodingDetector) { + super(encodingDetector); + } + + /** + * Handles metadata for an archive entry and writes appropriate XHTML elements. 
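As an aside, a sketch of the call pattern this shared helper is designed for (RarParser, further down in this commit, calls it the same way; the literal values here are illustrative):

    import java.util.Date;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pkg.AbstractArchiveParser;
    import org.apache.tika.sax.XHTMLContentHandler;

    public class EntryMetadataUsage {
        static Metadata describeEntry(XHTMLContentHandler xhtml, ParseContext context)
                throws Exception {
            return AbstractArchiveParser.handleEntryMetadata(
                    "docs\\readme.txt", // backslashes are normalized to '/'
                    null,               // creation date unknown
                    new Date(),         // last-modified
                    1024L,              // uncompressed size in bytes
                    xhtml, context);
        }
    }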
+ * + * @param name the entry name + * @param createAt creation date (may be null) + * @param modifiedAt modification date (may be null) + * @param size entry size (may be null) + * @param xhtml the XHTML content handler + * @param context the parse context + * @return metadata object populated with entry information + */ + public static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, + Long size, XHTMLContentHandler xhtml, + ParseContext context) + throws SAXException, IOException, TikaException { + Metadata entrydata = Metadata.newInstance(context); + if (createAt != null) { + entrydata.set(TikaCoreProperties.CREATED, createAt); + } + if (modifiedAt != null) { + entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt); + } + if (size != null) { + entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); + } + if (name != null && name.length() > 0) { + name = name.replace("\\", "/"); + entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + entrydata.set(TikaCoreProperties.INTERNAL_PATH, name); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", name); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + return entrydata; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 0d8528fea8..ab1bfa65c0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -16,389 +16,108 @@ */ package org.apache.tika.parser.pkg; - import static org.apache.tika.detect.zip.PackageConstants.AR; import static org.apache.tika.detect.zip.PackageConstants.ARJ; import static org.apache.tika.detect.zip.PackageConstants.CPIO; import static org.apache.tika.detect.zip.PackageConstants.DUMP; -import static org.apache.tika.detect.zip.PackageConstants.JAR; -import static org.apache.tika.detect.zip.PackageConstants.SEVENZ; +import static org.apache.tika.detect.zip.PackageConstants.GTAR; import static org.apache.tika.detect.zip.PackageConstants.TAR; -import static org.apache.tika.detect.zip.PackageConstants.ZIP; import java.io.IOException; -import java.nio.charset.Charset; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.zip.ZipEntry; -import org.apache.commons.compress.PasswordRequiredException; import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; -import org.apache.commons.compress.archivers.StreamingNotSupportedException; import org.apache.commons.compress.archivers.ar.ArArchiveInputStream; +import org.apache.commons.compress.archivers.arj.ArjArchiveInputStream; import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream; import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream; 
-import org.apache.commons.compress.archivers.jar.JarArchiveInputStream; -import org.apache.commons.compress.archivers.sevenz.SevenZFile; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; -import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; -import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.config.TikaComponent; -import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.PasswordProvider; import org.apache.tika.sax.XHTMLContentHandler; /** - * Parser for various packaging formats. Package entries will be written to - * the XHTML event stream as <div class="package-entry"> elements that - * contain the (optional) entry name as a <h1> element and the full - * structured body content of the parsed entry. + * Parser for streaming archive formats: AR, ARJ, CPIO, DUMP, TAR. + * <p> + * Package entries will be written to the XHTML event stream as + * <div class="package-entry"> elements that contain the (optional) + * entry name as a <h1> element and the full structured body content + * of the parsed entry. * <p> - * User must have JCE Unlimited Strength jars installed for encryption to - * work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars - * are not installed, an IOException will be thrown, and potentially - * wrapped in a TikaException. + * For ZIP/JAR archives, see {@link ZipParser}. + * For 7z archives, see {@link SevenZParser}. */ @TikaComponent -public class PackageParser extends AbstractEncodingDetectorParser { +public class PackageParser extends AbstractArchiveParser { - //We used to avoid overwriting file types if the file type - //was a specialization of zip/tar. We determined specialization of zip - //via TikaConfig at parse time. - //The following is an inelegant hack, but until we can serialize TikaConfig, - //or dramatically rework the ForkParser to avoid serialization - //of parsers, this is what we have. - //There is at least a test in PackageParserTest that makes sure that we - //keep this list updated. 
- //This is now legacy behavior: TODO figure out if we can go make to using the - //mime registry - static final Set<MediaType> PACKAGE_SPECIALIZATIONS = loadPackageSpecializations(); - /** - * Serial version UID - */ private static final long serialVersionUID = -5331043266963888708L; - private static final Set<MediaType> SUPPORTED_TYPES = - MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ); - - // The number of bytes of entry name to detect charset properly - private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100; - - - static final Set<MediaType> loadPackageSpecializations() { - Set<MediaType> zipSpecializations = new HashSet<>(); - for (String mediaTypeString : new String[]{ - //specializations of ZIP - "application/bizagi-modeler", "application/epub+zip", - "application/hwp+zip", - "application/java-archive", - "application/vnd.adobe.air-application-installer-package+zip", - "application/vnd.android.package-archive", "application/vnd.apple.iwork", - "application/vnd.apple.keynote", "application/vnd.apple.numbers", - "application/vnd.apple.pages", "application/vnd.apple.unknown.13", - "application/vnd.etsi.asic-e+zip", "application/vnd.etsi.asic-s+zip", - "application/vnd.google-earth.kmz", "application/vnd.mindjet.mindmanager", - "application/vnd.ms-excel.addin.macroenabled.12", - "application/vnd.ms-excel.sheet.binary.macroenabled.12", - "application/vnd.ms-excel.sheet.macroenabled.12", - "application/vnd.ms-excel.template.macroenabled.12", - "application/vnd.ms-powerpoint.addin.macroenabled.12", - "application/vnd.ms-powerpoint.presentation.macroenabled.12", - "application/vnd.ms-powerpoint.slide.macroenabled.12", - "application/vnd.ms-powerpoint.slideshow.macroenabled.12", - "application/vnd.ms-powerpoint.template.macroenabled.12", - "application/vnd.ms-visio.drawing", - "application/vnd.ms-visio.drawing.macroenabled.12", - "application/vnd.ms-visio.stencil", - "application/vnd.ms-visio.stencil.macroenabled.12", - "application/vnd.ms-visio.template", - "application/vnd.ms-visio.template.macroenabled.12", - "application/vnd.ms-word.document.macroenabled.12", - "application/vnd.ms-word.template.macroenabled.12", - "application/vnd.ms-xpsdocument", "application/vnd.oasis.opendocument.formula", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.presentationml.slide", - "application/vnd.openxmlformats-officedocument.presentationml.slideshow", - "application/vnd.openxmlformats-officedocument.presentationml.template", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.openxmlformats-officedocument.spreadsheetml.template", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - "application/x-ibooks+zip", "application/x-itunes-ipa", - "application/x-tika-iworks-protected", "application/x-tika-java-enterprise-archive", - "application/x-tika-java-web-archive", "application/x-tika-ooxml", - "application/x-tika-visio-ooxml", "application/x-xliff+zip", "application/x-xmind", - "model/vnd.dwfx+xps", "application/vnd.sun.xml.calc", - "application/vnd.sun.xml.writer", "application/vnd.sun.xml.writer.template", - "application/vnd.sun.xml.draw", "application/vnd.sun.xml.impress", - "application/vnd.openofficeorg.autotext", - "application/vnd.oasis.opendocument.graphics-template", - "application/vnd.oasis.opendocument.text-web", - 
"application/vnd.oasis.opendocument.spreadsheet-template", - "application/vnd.oasis.opendocument.graphics", - "application/vnd.oasis.opendocument.image-template", - "application/vnd.oasis.opendocument.text", - "application/vnd.oasis.opendocument.text-template", - "application/vnd.oasis.opendocument.presentation", - "application/vnd.oasis.opendocument.chart", - "application/vnd.openofficeorg.extension", - "application/vnd.oasis.opendocument.spreadsheet", - "application/vnd.oasis.opendocument.image", - "application/vnd.oasis.opendocument.formula-template", - "application/vnd.oasis.opendocument.presentation-template", - "application/vnd.oasis.opendocument.chart-template", - "application/vnd.oasis.opendocument.text-master", - "application/vnd.adobe.indesign-idml-package", - "application/x-gtar", //specialization of tar - "application/x-wacz", "application/x-vnd.datapackage+zip" - }) { - zipSpecializations.add(MediaType.parse(mediaTypeString)); - } - return Collections.unmodifiableSet(zipSpecializations); - } - - //not clear what we should use instead? - @Deprecated - static MediaType getMediaType(ArchiveInputStream stream) { - if (stream instanceof JarArchiveInputStream) { - return JAR; - } else if (stream instanceof ZipArchiveInputStream) { - return ZIP; - } else if (stream instanceof ArArchiveInputStream) { - return AR; - } else if (stream instanceof CpioArchiveInputStream) { - return CPIO; - } else if (stream instanceof DumpArchiveInputStream) { - return DUMP; - } else if (stream instanceof TarArchiveInputStream) { - return TAR; - } else if (stream instanceof SevenZWrapper) { - return SEVENZ; - } else { - return MediaType.OCTET_STREAM; - } - } - protected static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, - Long size, XHTMLContentHandler xhtml, - ParseContext context) - throws SAXException, IOException, TikaException { - Metadata entrydata = Metadata.newInstance(context); - if (createAt != null) { - entrydata.set(TikaCoreProperties.CREATED, createAt); - } - if (modifiedAt != null) { - entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt); - } - if (size != null) { - entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); - } - if (name != null && name.length() > 0) { - name = name.replace("\\", "/"); - entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); - entrydata.set(TikaCoreProperties.INTERNAL_PATH, name); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", name); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - } - return entrydata; - } - - private boolean detectCharsetsInEntryNames = true; + private static final Set<MediaType> SUPPORTED_TYPES = + MediaType.set(AR, ARJ, CPIO, DUMP, TAR); public PackageParser() { super(); } - public PackageParser(EncodingDetector encodingDetector) { - super(encodingDetector); - } - + @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } + @Override public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - // Enable rewind capability since we may need to re-read for 7z or data descriptor handling - tis.enableRewind(); - - TemporaryResources tmp = new TemporaryResources(); - // Shield the TikaInputStream from being closed when we close archive streams. - // This allows us to reset and re-read the stream for data descriptor handling. 
tis.setCloseShield(); try { - _parse(tis, handler, metadata, context, tmp); + doParse(tis, handler, metadata, context); } finally { tis.removeCloseShield(); } } - private void _parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, - ParseContext context, TemporaryResources tmp) - throws TikaException, IOException, SAXException { - ArchiveInputStream ais = null; - String encoding = null; + private void doParse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws TikaException, IOException, SAXException { + ArchiveInputStream ais; try { ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory()); - encoding = factory.getEntryEncoding(); - // At the end we want to close the archive stream to release - // any associated resources, but the underlying document stream - // should not be closed - //TODO -- we've probably already detected the stream by here. We should - //rely on that detection and not re-detect. - encoding = factory.getEntryEncoding(); - // At the end we want to close the archive stream to release - // any associated resources, but the underlying document stream - // should not be closed ais = factory.createArchiveInputStream(tis); - - } catch (StreamingNotSupportedException sne) { - // Most archive formats work on streams, but a few need files - if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) { - // Rework as a file, and wrap - tis.rewind(); - - // Seven Zip suports passwords, was one given? - String password = null; - PasswordProvider provider = context.get(PasswordProvider.class); - if (provider != null) { - password = provider.getPassword(metadata); - } - - SevenZFile sevenz; - try { - SevenZFile.Builder builder = new SevenZFile.Builder().setFile(tis.getFile()); - if (password == null) { - sevenz = builder.get(); - } else { - sevenz = builder.setPassword(password.toCharArray()).get(); - } - } catch (PasswordRequiredException e) { - throw new EncryptedDocumentException(e); - } - - // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty - ais = new SevenZWrapper(sevenz); - } else { - throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne); - } } catch (ArchiveException e) { throw new TikaException("Unable to unpack document stream", e); } updateMediaType(ais, metadata); - // Use the delegate parser to parse the contained document + EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); - // mark before we start parsing entries for potential reset - //needed for mutable int by ref, not for thread safety. - //this keeps track of how many entries were processed. 
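The entry-count bookkeeping deserves a standalone sketch: on a data-descriptor retry the stream is rewound and re-read from the start, and the counter says how many entries were already handled and must be drained. A generic illustration of just the skip logic (not Tika API):

    import java.util.Iterator;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.function.Consumer;

    public class RereadSkip {
        // First pass: alreadyDone counts each fully processed entry.
        // Retry pass (after rewinding): drain that many entries before
        // resuming, which is what the AtomicInteger is carried around for.
        static <E> void processSkipping(Iterator<E> entries, AtomicInteger alreadyDone,
                                        Consumer<E> process) {
            while (entries.hasNext()) {
                E entry = entries.next();
                if (alreadyDone.get() > 0) {
                    alreadyDone.decrementAndGet();
                    continue;
                }
                process.accept(entry);
            }
        }
    }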
- AtomicInteger entryCnt = new AtomicInteger(); - try { - parseEntries(ais, metadata, extractor, xhtml, false, entryCnt, context); - } catch (UnsupportedZipFeatureException zfe) { - // If this is a zip archive which requires a data descriptor, parse it again - if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { - // Close archive input stream and create a new one that could handle data descriptor - ais.close(); - tis.rewind(); - ais = new ZipArchiveInputStream(tis, encoding, true, true); - parseEntries(ais, metadata, extractor, xhtml, true, entryCnt, context); - } - } finally { - ais.close(); - xhtml.endDocument(); - } - } - - /** - * Parse the entries of the zip archive - * - * @param ais archive input stream - * @param metadata document metadata (input and output) - * @param extractor the delegate parser - * @param xhtml the xhtml handler - * @param shouldUseDataDescriptor indicates if a data descriptor is required or not - * @param entryCnt index of the entry - * @throws TikaException if the document could not be parsed - * @throws IOException if a UnsupportedZipFeatureException is met - * @throws SAXException if the SAX events could not be processed - */ - private void parseEntries(ArchiveInputStream ais, Metadata metadata, - EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml, - boolean shouldUseDataDescriptor, AtomicInteger entryCnt, - ParseContext context) - throws TikaException, IOException, SAXException { try { ArchiveEntry entry = ais.getNextEntry(); while (entry != null) { - if (shouldUseDataDescriptor && entryCnt.get() > 0) { - // With shouldUseDataDescriptor being true, we are reading - // the zip once again. The number of entryCnt entries have - // already been parsed in the last time, so we can just - // skip these entries. - entryCnt.decrementAndGet(); - entry = ais.getNextEntry(); - continue; - } - if (!entry.isDirectory()) { parseEntry(ais, entry, extractor, metadata, xhtml, context); } - - if (!shouldUseDataDescriptor) { - // Record the number of entries we have read, this is used - // for zip archives using Data Descriptor. 
It's used for - // skipping the entries we have already read - entryCnt.incrementAndGet(); - } - entry = ais.getNextEntry(); } - } catch (UnsupportedZipFeatureException zfe) { - - // If it's an encrypted document of unknown password, report as such - if (zfe.getFeature() == Feature.ENCRYPTION) { - throw new EncryptedDocumentException(zfe); - } - - if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { - throw zfe; - } - // Otherwise throw the exception - throw new TikaException("UnsupportedZipFeature", zfe); - } catch (PasswordRequiredException pre) { - throw new EncryptedDocumentException(pre); + } finally { + ais.close(); + xhtml.endDocument(); } } @@ -408,59 +127,52 @@ public class PackageParser extends AbstractEncodingDetectorParser { return; } - //now see if the user or an earlier step has passed in a content type String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE); if (incomingContentTypeString == null) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); return; } - MediaType incomingMediaType = MediaType.parse(incomingContentTypeString); if (incomingMediaType == null) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); return; } - if (!PACKAGE_SPECIALIZATIONS.contains(incomingMediaType)) { + // Don't overwrite if incoming type is a TAR specialization (e.g., gtar) + if (!incomingMediaType.equals(GTAR)) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } } + private static MediaType getMediaType(ArchiveInputStream stream) { + if (stream instanceof ArArchiveInputStream) { + return AR; + } else if (stream instanceof ArjArchiveInputStream) { + return ARJ; + } else if (stream instanceof CpioArchiveInputStream) { + return CPIO; + } else if (stream instanceof DumpArchiveInputStream) { + return DUMP; + } else if (stream instanceof TarArchiveInputStream) { + return TAR; + } else { + return MediaType.OCTET_STREAM; + } + } + private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml, ParseContext context) throws SAXException, IOException, TikaException { + String name = entry.getName(); - - //Try to detect charset of archive entry in case of non-unicode filename is used - if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) { - // Extend short entry name to improve accuracy of charset detection - byte[] entryName = ((ZipArchiveEntry) entry).getRawName(); - byte[] extendedEntryName = entryName; - if (0 < entryName.length && entryName.length < MIN_BYTES_FOR_DETECTING_CHARSET) { - int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length); - extendedEntryName = new byte[len]; - for (int i = 0; i < len; i++) { - extendedEntryName[i] = entryName[i % entryName.length]; - } - } - try (TikaInputStream tis = TikaInputStream.get(extendedEntryName)) { - Charset candidate = getEncodingDetector().detect(tis, parentMetadata, context); - if (candidate != null) { - name = new String(((ZipArchiveEntry) entry).getRawName(), candidate); - } - } - } - if (archive.canReadEntryData(entry)) { - // Fetch the metadata on the entry contained in the archive - Metadata entrydata = - handleEntryMetadata(name, null, entry.getLastModifiedDate(), entry.getSize(), - xhtml, context); + Metadata entrydata = handleEntryMetadata( + name, null, entry.getLastModifiedDate(), entry.getSize(), + xhtml, context); - // Recurse into the entry if desired if (extractor.shouldParseEmbedded(entrydata)) { TemporaryResources tmp = new TemporaryResources(); try { @@ -471,82 
+183,12 @@ public class PackageParser extends AbstractEncodingDetectorParser { } } } else { - name = (name == null) ? "" : name; - if (entry instanceof ZipArchiveEntry) { - ZipArchiveEntry zipArchiveEntry = (ZipArchiveEntry) entry; - boolean usesEncryption = zipArchiveEntry.getGeneralPurposeBit().usesEncryption(); - if (usesEncryption) { - EmbeddedDocumentUtil.recordEmbeddedStreamException( - new EncryptedDocumentException("stream (" + name + ") is encrypted"), - parentMetadata); - } - - // do not write to the handler if - // UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR - // is met, we will catch this exception and read the zip archive once again - boolean usesDataDescriptor = - zipArchiveEntry.getGeneralPurposeBit().usesDataDescriptor(); - if (usesDataDescriptor && zipArchiveEntry.getMethod() == ZipEntry.STORED) { - throw new UnsupportedZipFeatureException( - UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR, - zipArchiveEntry); - } - } else { - EmbeddedDocumentUtil.recordEmbeddedStreamException( - new TikaException("Can't read archive stream (" + name + ")"), - parentMetadata); - } - if (name.length() > 0) { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new TikaException("Can't read archive stream (" + name + ")"), + parentMetadata); + if (name != null && !name.isEmpty()) { xhtml.element("p", name); } } } - - // Pending a fix for COMPRESS-269, we have to wrap ourselves - private static class SevenZWrapper extends ArchiveInputStream { - private SevenZFile file; - - private SevenZWrapper(SevenZFile file) { - this.file = file; - } - - @Override - public int read() throws IOException { - return file.read(); - } - - @Override - public int read(byte[] b) throws IOException { - return file.read(b); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - return file.read(b, off, len); - } - - @Override - public ArchiveEntry getNextEntry() throws IOException { - return file.getNextEntry(); - } - - @Override - public void close() throws IOException { - file.close(); - } - } - - /** - * Whether or not to run the default charset detector against entry - * names in ZipFiles. The default is <code>true</code>. 
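For reference, the name-extension trick that this removed setting controlled (ZipParser keeps the same approach): short raw entry names are tiled up to a minimum byte count, since statistical charset detectors are unreliable on very short input. A self-contained sketch mirroring the removed code:

    public class EntryNameExtension {
        private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100;

        // Repeat a short raw name until it is close to 100 bytes long,
        // then hand the result to the charset detector.
        static byte[] extend(byte[] rawName) {
            if (rawName.length == 0 || rawName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
                return rawName;
            }
            int len = rawName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / rawName.length);
            byte[] extended = new byte[len];
            for (int i = 0; i < len; i++) {
                extended[i] = rawName[i % rawName.length];
            }
            return extended;
        }
    }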
- * - * @param detectCharsetsInEntryNames - */ - public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) { - this.detectCharsetsInEntryNames = detectCharsetsInEntryNames; - } - - public boolean isDetectCharsetsInEntryNames() { - return detectCharsetsInEntryNames; - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java index 96785a7b5d..36a96be8f1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java @@ -83,9 +83,9 @@ public class RarParser implements Parser { FileHeader header = rar.nextFileHeader(); while (header != null && !Thread.currentThread().isInterrupted()) { if (!header.isDirectory()) { - Metadata entrydata = PackageParser.handleEntryMetadata(header.getFileName(), - header.getCTime(), header.getMTime(), header.getFullUnpackSize(), - xhtml, context); + Metadata entrydata = AbstractArchiveParser.handleEntryMetadata( + header.getFileName(), header.getCTime(), header.getMTime(), + header.getFullUnpackSize(), xhtml, context); try (TikaInputStream rarTis = TikaInputStream.get(rar.getInputStream(header))) { if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(rarTis, handler, entrydata, context, true); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java new file mode 100644 index 0000000000..192cfbd497 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/SevenZParser.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.apache.tika.detect.zip.PackageConstants.SEVENZ; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.compress.PasswordRequiredException; +import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; +import org.apache.commons.compress.archivers.sevenz.SevenZFile; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Parser for 7z (Seven Zip) archives. + * <p> + * This parser requires file-based access (not streaming) because + * the 7z format requires random access to the archive. + * <p> + * User must have JCE Unlimited Strength jars installed for encryption + * to work with 7Z files (see: COMPRESS-299 and TIKA-1521). If the jars + * are not installed, an IOException will be thrown, and potentially + * wrapped in a TikaException. + */ +@TikaComponent +public class SevenZParser extends AbstractArchiveParser { + + private static final long serialVersionUID = -5331043266963888710L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(SEVENZ); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + // Seven Zip supports passwords, was one given? + String password = null; + PasswordProvider provider = context.get(PasswordProvider.class); + if (provider != null) { + password = provider.getPassword(metadata); + } + + SevenZFile sevenZFile; + try { + SevenZFile.Builder builder = new SevenZFile.Builder().setFile(tis.getFile()); + if (password == null) { + sevenZFile = builder.get(); + } else { + sevenZFile = builder.setPassword(password.toCharArray()).get(); + } + } catch (PasswordRequiredException e) { + throw new EncryptedDocumentException(e); + } + + metadata.set(Metadata.CONTENT_TYPE, SEVENZ.toString()); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + try { + SevenZArchiveEntry entry = sevenZFile.getNextEntry(); + while (entry != null) { + if (!entry.isDirectory()) { + parseEntry(sevenZFile, entry, extractor, metadata, xhtml, context); + } + entry = sevenZFile.getNextEntry(); + } + } finally { + sevenZFile.close(); + xhtml.endDocument(); + } + } + + private void parseEntry(SevenZFile sevenZFile, SevenZArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context) + throws SAXException, IOException, TikaException { + + String name = entry.getName(); + Metadata entrydata = handleEntryMetadata( + name, + entry.getHasCreationDate() ? 
entry.getCreationDate() : null, + entry.getHasLastModifiedDate() ? entry.getLastModifiedDate() : null, + entry.getSize(), + xhtml, + context); + + if (extractor.shouldParseEmbedded(entrydata)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get( + new SevenZEntryInputStream(sevenZFile), tmp, entrydata); + extractor.parseEmbedded(tis, xhtml, entrydata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + /** + * InputStream wrapper for reading the current entry from a SevenZFile. + */ + private static class SevenZEntryInputStream extends InputStream { + private final SevenZFile file; + + SevenZEntryInputStream(SevenZFile file) { + this.file = file; + } + + @Override + public int read() throws IOException { + return file.read(); + } + + @Override + public int read(byte[] b) throws IOException { + return file.read(b); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return file.read(b, off, len); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java new file mode 100644 index 0000000000..0c48731058 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -0,0 +1,690 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.apache.tika.detect.zip.PackageConstants.JAR; +import static org.apache.tika.detect.zip.PackageConstants.ZIP; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.file.attribute.FileTime; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.Feature; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.Zip; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.zip.utils.ZipSalvager; + +/** + * Parser for ZIP and JAR archives using file-based access for complete metadata extraction. + * <p> + * This parser handles: + * <ul> + * <li>Standard ZIP archives</li> + * <li>JAR (Java Archive) files</li> + * <li>Archive and entry comments</li> + * <li>Unix permissions and file attributes</li> + * <li>Charset detection for non-Unicode entry names</li> + * <li>Encryption detection</li> + * </ul> + * <p> + * This parser prefers file-based access (ZipFile) for complete metadata extraction, + * but falls back to streaming (ZipArchiveInputStream) for edge-case ZIPs that + * cannot be read as files (e.g., those with data descriptors that overlap the + * central directory). + */ +@TikaComponent() +public class ZipParser extends AbstractArchiveParser { + + /** + * Set of media types that are specializations of ZIP (e.g., Office documents, EPUB, APK). + * Used to avoid overwriting more specific media types with generic "application/zip". + */ + public static final Set<MediaType> ZIP_SPECIALIZATIONS = loadZipSpecializations(); + + private static final long serialVersionUID = -5331043266963888709L; + + private static final Set<MediaType> SUPPORTED_TYPES = MediaType.set(ZIP, JAR); + + private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100; + + /** + * Maximum number of entries to record in integrity check metadata fields. + * Prevents excessive metadata in ZIPs with many discrepancies. 
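A minimal sketch of the bounded bookkeeping this cap enables during streaming, with the same logic as parseStreamEntries below, restructured around Set.add's return value:

    import java.util.ArrayList;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class BoundedDuplicateTracker {
        private static final int MAX_RECORDED = 100;

        private final Set<String> seen = new LinkedHashSet<>();
        private final List<String> duplicates = new ArrayList<>();

        // Set.add returns false for a repeat; record it only while under
        // the cap so a hostile ZIP cannot bloat the output metadata.
        void offer(String entryName) {
            if (!seen.add(entryName) && duplicates.size() < MAX_RECORDED) {
                duplicates.add(entryName);
            }
        }

        List<String> getDuplicates() {
            return duplicates;
        }
    }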
+ */ + private static final int MAX_INTEGRITY_CHECK_ENTRIES = 100; + + private final ZipParserConfig defaultConfig; + + private static Set<MediaType> loadZipSpecializations() { + Set<MediaType> zipSpecializations = new HashSet<>(); + for (String mediaTypeString : new String[]{ + //specializations of ZIP + "application/bizagi-modeler", "application/epub+zip", + "application/hwp+zip", + "application/java-archive", + "application/vnd.adobe.air-application-installer-package+zip", + "application/vnd.android.package-archive", "application/vnd.apple.iwork", + "application/vnd.apple.keynote", "application/vnd.apple.numbers", + "application/vnd.apple.pages", "application/vnd.apple.unknown.13", + "application/vnd.etsi.asic-e+zip", "application/vnd.etsi.asic-s+zip", + "application/vnd.google-earth.kmz", "application/vnd.mindjet.mindmanager", + "application/vnd.ms-excel.addin.macroenabled.12", + "application/vnd.ms-excel.sheet.binary.macroenabled.12", + "application/vnd.ms-excel.sheet.macroenabled.12", + "application/vnd.ms-excel.template.macroenabled.12", + "application/vnd.ms-powerpoint.addin.macroenabled.12", + "application/vnd.ms-powerpoint.presentation.macroenabled.12", + "application/vnd.ms-powerpoint.slide.macroenabled.12", + "application/vnd.ms-powerpoint.slideshow.macroenabled.12", + "application/vnd.ms-powerpoint.template.macroenabled.12", + "application/vnd.ms-visio.drawing", + "application/vnd.ms-visio.drawing.macroenabled.12", + "application/vnd.ms-visio.stencil", + "application/vnd.ms-visio.stencil.macroenabled.12", + "application/vnd.ms-visio.template", + "application/vnd.ms-visio.template.macroenabled.12", + "application/vnd.ms-word.document.macroenabled.12", + "application/vnd.ms-word.template.macroenabled.12", + "application/vnd.ms-xpsdocument", "application/vnd.oasis.opendocument.formula", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.presentationml.slide", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + "application/vnd.openxmlformats-officedocument.presentationml.template", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.template", + "application/x-ibooks+zip", "application/x-itunes-ipa", + "application/x-tika-iworks-protected", "application/x-tika-java-enterprise-archive", + "application/x-tika-java-web-archive", "application/x-tika-ooxml", + "application/x-tika-visio-ooxml", "application/x-xliff+zip", "application/x-xmind", + "model/vnd.dwfx+xps", "application/vnd.sun.xml.calc", + "application/vnd.sun.xml.writer", "application/vnd.sun.xml.writer.template", + "application/vnd.sun.xml.draw", "application/vnd.sun.xml.impress", + "application/vnd.openofficeorg.autotext", + "application/vnd.oasis.opendocument.graphics-template", + "application/vnd.oasis.opendocument.text-web", + "application/vnd.oasis.opendocument.spreadsheet-template", + "application/vnd.oasis.opendocument.graphics", + "application/vnd.oasis.opendocument.image-template", + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.text-template", + "application/vnd.oasis.opendocument.presentation", + "application/vnd.oasis.opendocument.chart", + "application/vnd.openofficeorg.extension", + "application/vnd.oasis.opendocument.spreadsheet", + 
"application/vnd.oasis.opendocument.image", + "application/vnd.oasis.opendocument.formula-template", + "application/vnd.oasis.opendocument.presentation-template", + "application/vnd.oasis.opendocument.chart-template", + "application/vnd.oasis.opendocument.text-master", + "application/vnd.adobe.indesign-idml-package", + "application/x-wacz", "application/x-vnd.datapackage+zip" + }) { + zipSpecializations.add(MediaType.parse(mediaTypeString)); + } + return Collections.unmodifiableSet(zipSpecializations); + } + + public ZipParser() { + super(); + this.defaultConfig = new ZipParserConfig(); + } + + public ZipParser(ZipParserConfig config) { + super(); + this.defaultConfig = config; + } + + /** + * Constructor for JSON-based configuration. + */ + public ZipParser(JsonConfig jsonConfig) throws TikaConfigException { + this(ConfigDeserializer.buildConfig(jsonConfig, ZipParserConfig.class)); + } + + public ZipParser(EncodingDetector encodingDetector) { + super(encodingDetector); + this.defaultConfig = new ZipParserConfig(); + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + ZipParserConfig config = context.get(ZipParserConfig.class, defaultConfig); + + if (tis.getOpenContainer() instanceof ZipFile) { + // detectEntryName handles charset decoding from raw bytes, no need to reopen + parseWithZipFile((ZipFile) tis.getOpenContainer(), tis, handler, metadata, context, config); + return; + } + + // Check detector hints - if detector already tried ZipFile and failed, go straight to streaming + String detectorZipFileOpened = metadata.get(Zip.DETECTOR_ZIPFILE_OPENED); + if ("false".equals(detectorZipFileOpened)) { + // Detector already tried and failed - skip ZipFile, use streaming + // Enable rewind for DATA_DESCRIPTOR retry in parseWithStream + tis.enableRewind(); + String dataDescriptorRequired = metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED); + parseWithStream(tis, handler, metadata, context, config, + "true".equals(dataDescriptorRequired)); + return; + } + + // No detector hint - try to open ZipFile (with salvaging fallback) + // This likely means that the user didn't apply a detector first or the zip detector was not in the chain + ZipFile zipFile = ZipSalvager.tryToOpenZipFile(tis, metadata, config.getEntryEncoding()); + + if (zipFile != null) { + // ZipFile opened (directly or via salvaging) - use file-based parsing + parseWithZipFile(zipFile, tis, handler, metadata, context, config); + } else { + // ZipFile and salvaging both failed - use streaming + // Enable rewind for DATA_DESCRIPTOR retry in parseWithStream + // (may be redundant if tryToOpenZipFile already called it, but that's safe) + tis.enableRewind(); + parseWithStream(tis, handler, metadata, context, config, false); + } + } + + /** + * Parses using a pre-opened ZipFile passed from the detector. 
+ * + * @param zipFile the pre-opened ZipFile from detector + * @param tis the TikaInputStream (for integrity check rewind) + * @param handler the content handler + * @param metadata the metadata + * @param context the parse context + * @param config the parser configuration + */ + private void parseWithZipFile(ZipFile zipFile, TikaInputStream tis, ContentHandler handler, + Metadata metadata, ParseContext context, ZipParserConfig config) + throws IOException, SAXException, TikaException { + + // Collect entry names from central directory for integrity check + Set<String> centralDirectoryEntries = config.isIntegrityCheck() + ? new LinkedHashSet<>() : null; + + // Don't close the ZipFile - it was passed from the detector and will be closed + // when TikaInputStream is closed (it's set as the openContainer) + updateMediaType(zipFile, metadata); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + try { + Enumeration<ZipArchiveEntry> entries = zipFile.getEntries(); + while (entries.hasMoreElements()) { + ZipArchiveEntry entry = entries.nextElement(); + if (centralDirectoryEntries != null) { + centralDirectoryEntries.add(entry.getName()); + } + if (!entry.isDirectory()) { + parseZipFileEntry(zipFile, entry, extractor, metadata, xhtml, context, config); + } + } + } finally { + xhtml.endDocument(); + } + + // Perform integrity check if enabled + if (config.isIntegrityCheck()) { + tis.enableRewind(); + tis.rewind(); + performIntegrityCheck(tis, metadata, centralDirectoryEntries, config); + } + } + + /** + * Parses using streaming with optional initial data descriptor support. + * + * @param tis the TikaInputStream + * @param handler the content handler + * @param metadata the metadata + * @param context the parse context + * @param config the parser configuration + * @param startWithDataDescriptor whether to start with data descriptor support enabled + */ + private void parseWithStream(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context, ZipParserConfig config, + boolean startWithDataDescriptor) + throws IOException, SAXException, TikaException { + + // Track entry names for duplicate detection during streaming + Set<String> seenEntryNames = config.isIntegrityCheck() + ? new LinkedHashSet<>() : null; + List<String> duplicates = config.isIntegrityCheck() + ? new ArrayList<>() : null; + + String encoding = config.getEntryEncoding() != null + ? 
config.getEntryEncoding().name() + : null; + ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, startWithDataDescriptor); + + updateMediaType(metadata); + + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + + AtomicInteger entryCnt = new AtomicInteger(); + try { + parseStreamEntries(zis, metadata, extractor, xhtml, false, entryCnt, context, config, + seenEntryNames, duplicates); + } catch (UnsupportedZipFeatureException zfe) { + if (zfe.getFeature() == Feature.DATA_DESCRIPTOR && !startWithDataDescriptor) { + // Re-read with data descriptor support + zis.close(); + tis.rewind(); + zis = new ZipArchiveInputStream(tis, encoding, true, true); + parseStreamEntries(zis, metadata, extractor, xhtml, true, entryCnt, context, config, + seenEntryNames, duplicates); + } else { + throw zfe; + } + } finally { + zis.close(); + xhtml.endDocument(); + } + + // Record integrity check results (streaming only = can't compare to central directory) + if (config.isIntegrityCheck()) { + if (duplicates.isEmpty()) { + // No duplicates found, but we couldn't compare to central directory + metadata.set(Zip.INTEGRITY_CHECK_RESULT, "PARTIAL"); + } else { + metadata.set(Zip.INTEGRITY_CHECK_RESULT, "FAIL"); + for (String dup : duplicates) { + metadata.add(Zip.DUPLICATE_ENTRY_NAMES, dup); + } + } + } + } + + private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata, + EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml, + boolean shouldUseDataDescriptor, AtomicInteger entryCnt, + ParseContext context, ZipParserConfig config, + Set<String> seenEntryNames, List<String> duplicates) + throws TikaException, IOException, SAXException { + + try { + ArchiveEntry entry = zis.getNextEntry(); + while (entry != null) { + if (shouldUseDataDescriptor && entryCnt.get() > 0) { + // Skip already-processed entries on re-read + entryCnt.decrementAndGet(); + entry = zis.getNextEntry(); + continue; + } + + if (!entry.isDirectory() && entry instanceof ZipArchiveEntry) { + parseStreamEntry(zis, (ZipArchiveEntry) entry, extractor, metadata, + xhtml, context, config); + + // Track duplicates AFTER successful processing + // (if DATA_DESCRIPTOR exception occurs, we'll re-read this entry) + if (seenEntryNames != null && duplicates != null) { + String name = entry.getName(); + if (seenEntryNames.contains(name)) { + if (duplicates.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + duplicates.add(name); + } + } else { + seenEntryNames.add(name); + } + } + } + + // Increment AFTER successful processing + if (!shouldUseDataDescriptor) { + entryCnt.incrementAndGet(); + } + + entry = zis.getNextEntry(); + } + } catch (UnsupportedZipFeatureException zfe) { + if (zfe.getFeature() == Feature.ENCRYPTION) { + throw new EncryptedDocumentException(zfe); + } + if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) { + throw zfe; + } + throw new TikaException("UnsupportedZipFeature", zfe); + } + } + + private void updateMediaType(ZipFile zipFile, Metadata metadata) { + MediaType type = ZIP; + Enumeration<ZipArchiveEntry> entries = zipFile.getEntries(); + if (entries.hasMoreElements()) { + ZipArchiveEntry first = entries.nextElement(); + if ("META-INF/MANIFEST.MF".equals(first.getName())) { + type = JAR; + } + } + setMediaTypeIfNotSpecialization(metadata, type); + } + + private void updateMediaType(Metadata metadata) { + setMediaTypeIfNotSpecialization(metadata, ZIP); 
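The data-descriptor retry above is the one place the parser deliberately re-reads the stream. The pattern in isolation (a sketch; walkEntries() is a hypothetical stand-in for the entry loop, and a null encoding means the platform default):

    tis.enableRewind();
    ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, null, true, false);
    try {
        walkEntries(zis);
    } catch (UnsupportedZipFeatureException e) {
        if (e.getFeature() != UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
            throw e;
        }
        // STORED entries with data descriptors can't be read on the first pass;
        // rewind and re-stream with allowStoredEntriesWithDataDescriptor=true.
        zis.close();
        tis.rewind();
        zis = new ZipArchiveInputStream(tis, null, true, true);
        walkEntries(zis);
    } finally {
        zis.close();
    }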
+ } + + private void setMediaTypeIfNotSpecialization(Metadata metadata, MediaType type) { + String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE); + if (incomingContentTypeString == null) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + return; + } + + MediaType incomingMediaType = MediaType.parse(incomingContentTypeString); + if (incomingMediaType == null) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + return; + } + + if (!ZIP_SPECIALIZATIONS.contains(incomingMediaType)) { + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + } + } + + private void parseZipFileEntry(ZipFile zipFile, ZipArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context, + ZipParserConfig config) + throws SAXException, IOException, TikaException { + + String name = detectEntryName(entry, parentMetadata, context, config); + + if (entry.getGeneralPurposeBit().usesEncryption()) { + handleEncryptedEntry(name, parentMetadata, xhtml); + return; + } + + Metadata entryMetadata = buildEntryMetadata(entry, name, context); + + writeEntryXhtml(name, xhtml); + + if (extractor.shouldParseEmbedded(entryMetadata)) { + TemporaryResources tmp = new TemporaryResources(); + try (InputStream entryStream = zipFile.getInputStream(entry)) { + TikaInputStream tis = TikaInputStream.get(entryStream, tmp, entryMetadata); + extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + private void parseStreamEntry(ZipArchiveInputStream zis, ZipArchiveEntry entry, + EmbeddedDocumentExtractor extractor, Metadata parentMetadata, + XHTMLContentHandler xhtml, ParseContext context, + ZipParserConfig config) + throws SAXException, IOException, TikaException { + + String name = detectEntryName(entry, parentMetadata, context, config); + + if (!zis.canReadEntryData(entry)) { + if (entry.getGeneralPurposeBit().usesEncryption()) { + handleEncryptedEntry(name, parentMetadata, xhtml); + } else if (entry.getGeneralPurposeBit().usesDataDescriptor() + && entry.getMethod() == java.util.zip.ZipEntry.STORED) { + throw new UnsupportedZipFeatureException(Feature.DATA_DESCRIPTOR, entry); + } else { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new TikaException("Can't read archive stream (" + name + ")"), + parentMetadata); + if (name != null && !name.isEmpty()) { + xhtml.element("p", name); + } + } + return; + } + + Metadata entryMetadata = buildEntryMetadata(entry, name, context); + + writeEntryXhtml(name, xhtml); + + if (extractor.shouldParseEmbedded(entryMetadata)) { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tis = TikaInputStream.get(zis, tmp, entryMetadata); + extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } finally { + tmp.dispose(); + } + } + } + + private String detectEntryName(ZipArchiveEntry entry, Metadata parentMetadata, + ParseContext context, ZipParserConfig config) throws IOException { + // If user specified an encoding, decode raw bytes with that charset + // This avoids needing to reopen the ZipFile with a different charset + if (config.getEntryEncoding() != null) { + return new String(entry.getRawName(), config.getEntryEncoding()); + } + + // If charset detection is enabled, try to detect and decode + if (config.isDetectCharsetsInEntryNames()) { + byte[] entryName = entry.getRawName(); + byte[] extendedEntryName = entryName; + if (0 < entryName.length && entryName.length < 
MIN_BYTES_FOR_DETECTING_CHARSET) { + int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length); + extendedEntryName = new byte[len]; + for (int i = 0; i < len; i++) { + extendedEntryName[i] = entryName[i % entryName.length]; + } + } + + try (TikaInputStream detectStream = TikaInputStream.get(extendedEntryName)) { + Charset candidate = getEncodingDetector().detect(detectStream, parentMetadata, context); + if (candidate != null) { + return new String(entry.getRawName(), candidate); + } + } + } + + // Fall back to default decoding + return entry.getName(); + } + + private void handleEncryptedEntry(String name, Metadata parentMetadata, + XHTMLContentHandler xhtml) throws SAXException { + EmbeddedDocumentUtil.recordEmbeddedStreamException( + new EncryptedDocumentException("stream (" + name + ") is encrypted"), + parentMetadata); + if (name != null && !name.isEmpty()) { + xhtml.element("p", name); + } + } + + private Metadata buildEntryMetadata(ZipArchiveEntry entry, String name, ParseContext context) + throws IOException, TikaException, SAXException { + Metadata entryMetadata = Metadata.newInstance(context); + + if (name != null && name.length() > 0) { + name = name.replace("\\", "/"); + entryMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + entryMetadata.set(TikaCoreProperties.INTERNAL_PATH, name); + } + + FileTime creationTime = entry.getCreationTime(); + if (creationTime != null) { + entryMetadata.set(TikaCoreProperties.CREATED, creationTime.toInstant().toString()); + } + FileTime modifiedTime = entry.getLastModifiedTime(); + if (modifiedTime != null) { + entryMetadata.set(TikaCoreProperties.MODIFIED, modifiedTime.toInstant().toString()); + } + + long size = entry.getSize(); + if (size >= 0) { + entryMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); + entryMetadata.set(Zip.UNCOMPRESSED_SIZE, Long.toString(size)); + } + long compressedSize = entry.getCompressedSize(); + if (compressedSize >= 0) { + entryMetadata.set(Zip.COMPRESSED_SIZE, Long.toString(compressedSize)); + } + + entryMetadata.set(Zip.COMPRESSION_METHOD, entry.getMethod()); + + long crc = entry.getCrc(); + if (crc >= 0) { + entryMetadata.set(Zip.CRC32, Long.toString(crc)); + } + + int unixMode = entry.getUnixMode(); + if (unixMode != 0) { + entryMetadata.set(Zip.UNIX_MODE, unixMode); + } + + entryMetadata.set(Zip.PLATFORM, entry.getPlatform()); + entryMetadata.set(Zip.VERSION_MADE_BY, entry.getVersionMadeBy()); + + String entryComment = entry.getComment(); + if (entryComment != null && !entryComment.isEmpty()) { + entryMetadata.set(Zip.COMMENT, entryComment); + } + + return entryMetadata; + } + + private void writeEntryXhtml(String name, XHTMLContentHandler xhtml) throws SAXException { + if (name != null && name.length() > 0) { + org.xml.sax.helpers.AttributesImpl attributes = new org.xml.sax.helpers.AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", name); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + + /** + * Performs integrity check by streaming through the ZIP and comparing + * local file headers against the central directory entries. 
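One subtlety in detectEntryName above: charset detectors are unreliable on very short inputs, so the raw name bytes are tiled up toward MIN_BYTES_FOR_DETECTING_CHARSET before detection runs. Note the integer division, which means the extended buffer can still land short of the minimum (7 bytes tile to 14 when the minimum is 20). The tiling step in isolation, with the minimum assumed to be 20 (the real constant lives in ZipParser):

    private static byte[] extendForDetection(byte[] entryName) {
        final int MIN_BYTES_FOR_DETECTING_CHARSET = 20; // assumed value for illustration
        if (entryName.length == 0 || entryName.length >= MIN_BYTES_FOR_DETECTING_CHARSET) {
            return entryName;
        }
        int len = entryName.length * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length);
        byte[] extended = new byte[len];
        for (int i = 0; i < len; i++) {
            // repeat the name to give the detector more signal to work with
            extended[i] = entryName[i % entryName.length];
        }
        return extended;
    }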
+ * + * @param tis the TikaInputStream (must be rewound) + * @param metadata the parent metadata to record results + * @param centralDirectoryEntries entry names from the central directory + * @param config the parser configuration + */ + private void performIntegrityCheck(TikaInputStream tis, Metadata metadata, + Set<String> centralDirectoryEntries, + ZipParserConfig config) throws IOException { + + String encoding = config.getEntryEncoding() != null + ? config.getEntryEncoding().name() + : null; + + Set<String> seenInStream = new LinkedHashSet<>(); + List<String> duplicates = new ArrayList<>(); + List<String> localHeaderOnly = new ArrayList<>(); + + try (ZipArchiveInputStream zis = new ZipArchiveInputStream(tis, encoding, true, true)) { + ZipArchiveEntry entry; + while ((entry = zis.getNextZipEntry()) != null) { + String name = entry.getName(); + + // Check for duplicates + if (seenInStream.contains(name)) { + if (duplicates.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + duplicates.add(name); + } + } else { + seenInStream.add(name); + } + + // Check for entries not in central directory + if (!centralDirectoryEntries.contains(name)) { + if (localHeaderOnly.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + localHeaderOnly.add(name); + } + } + } + } catch (IOException e) { + // If streaming fails, we still record what we found + } + + // Find entries in central directory but not in local headers + List<String> centralOnly = new ArrayList<>(); + for (String cdEntry : centralDirectoryEntries) { + if (!seenInStream.contains(cdEntry)) { + if (centralOnly.size() < MAX_INTEGRITY_CHECK_ENTRIES) { + centralOnly.add(cdEntry); + } + } + } + + // Record results + boolean passed = duplicates.isEmpty() && localHeaderOnly.isEmpty() && centralOnly.isEmpty(); + metadata.set(Zip.INTEGRITY_CHECK_RESULT, passed ? "PASS" : "FAIL"); + + for (String dup : duplicates) { + metadata.add(Zip.DUPLICATE_ENTRY_NAMES, dup); + } + for (String local : localHeaderOnly) { + metadata.add(Zip.LOCAL_HEADER_ONLY_ENTRIES, local); + } + for (String cd : centralOnly) { + metadata.add(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES, cd); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java new file mode 100644 index 0000000000..9bc53aaca9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParserConfig.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import java.io.Serializable; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Configuration for {@link ZipParser}. + */ +public class ZipParserConfig implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * Whether to run charset detection on entry names to handle + * non-Unicode filenames. Default is true. + */ + private boolean detectCharsetsInEntryNames = true; + + /** + * The charset to use for reading entry names. If null, the parser + * will use the platform default or auto-detect based on + * {@link #detectCharsetsInEntryNames}. + */ + private Charset entryEncoding = null; + + /** + * Whether to perform integrity checking by comparing the central directory + * (read via file-based access) against local file headers (read via streaming). + * This can detect: + * <ul> + * <li>Duplicate entry names (potential attack vector)</li> + * <li>Entries in central directory but not in local headers</li> + * <li>Entries in local headers but not in central directory</li> + * </ul> + * Default is true. When enabled, the ZIP is parsed twice if file-based access + * succeeds. If only streaming is possible, duplicate detection is still performed + * but central directory comparison is skipped (result will be "PARTIAL" if no + * duplicates are found). + */ + private boolean integrityCheck = true; + + public ZipParserConfig() { + } + + public boolean isDetectCharsetsInEntryNames() { + return detectCharsetsInEntryNames; + } + + public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) { + this.detectCharsetsInEntryNames = detectCharsetsInEntryNames; + } + + public Charset getEntryEncoding() { + return entryEncoding; + } + + public void setEntryEncoding(Charset entryEncoding) { + this.entryEncoding = entryEncoding; + } + + /** + * Set the entry encoding from a string (for JSON deserialization). 
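The setter whose javadoc begins here (body continues below) is the JSON-facing way to carry a charset as a plain string. Its validation behavior, sketched: Charset.forName either resolves the name or throws UnsupportedCharsetException, which the setter wraps in a TikaConfigException. Note that an IllegalCharsetNameException (a syntactically invalid name) would propagate uncaught as written.

    ZipParserConfig config = new ZipParserConfig();
    config.setEntryEncodingName("SJIS");             // resolves via Charset.forName("SJIS")
    try {
        config.setEntryEncodingName("no-such-charset");
    } catch (TikaConfigException expected) {
        // "Unsupported charset: no-such-charset"
    }
    config.setEntryEncodingName("");                 // empty or null resets entryEncoding to null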
+ * + * @param charsetName the charset name + * @throws TikaConfigException if the charset is not supported + */ + public void setEntryEncodingName(String charsetName) throws TikaConfigException { + if (charsetName == null || charsetName.isEmpty()) { + this.entryEncoding = null; + return; + } + try { + this.entryEncoding = Charset.forName(charsetName); + } catch (UnsupportedCharsetException e) { + throw new TikaConfigException("Unsupported charset: " + charsetName, e); + } + } + + public boolean isIntegrityCheck() { + return integrityCheck; + } + + public void setIntegrityCheck(boolean integrityCheck) { + this.integrityCheck = integrityCheck; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java index bac9a1bb75..f209ec0189 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java @@ -38,45 +38,52 @@ public class PackageParserTest extends TikaTest { @Test public void testCoverage() throws Exception { - //test that the package parser covers all inputstreams handled - //by ArchiveStreamFactory. When we update commons-compress, and they add - //a new stream type, we want to make sure that we're handling it. + // Test that the archive parsers collectively cover all input streams handled + // by ArchiveStreamFactory. When we update commons-compress, and they add + // a new stream type, we want to make sure that we're handling it. ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory(StandardCharsets.UTF_8.name()); + PackageParser packageParser = new PackageParser(); + ZipParser zipParser = new ZipParser(); + SevenZParser sevenZParser = new SevenZParser(); ParseContext parseContext = new ParseContext(); + + // Combine supported types from all archive parsers + Set<MediaType> allSupportedTypes = new HashSet<>(); + allSupportedTypes.addAll(packageParser.getSupportedTypes(parseContext)); + allSupportedTypes.addAll(zipParser.getSupportedTypes(parseContext)); + allSupportedTypes.addAll(sevenZParser.getSupportedTypes(parseContext)); + for (String name : archiveStreamFactory.getInputStreamArchiveNames()) { MediaType mt = PackageConstants.getMediaType(name); - //use this instead of assertNotEquals so that we report the - //name of the missing stream + // Use this instead of assertNotEquals so that we report the + // name of the missing stream if (mt.equals(MediaType.OCTET_STREAM)) { fail("getting octet-stream for: " + name); } - if (!packageParser.getSupportedTypes(parseContext).contains(mt)) { - fail("PackageParser should support: " + mt.toString()); + if (!allSupportedTypes.contains(mt)) { + fail("Archive parsers should support: " + mt.toString()); } } } @Test - public void testSpecializations() throws Exception { - //Test that our manually constructed list of children of zip and tar - //in PackageParser is current with TikaLoader's media type registry. + public void testZipSpecializations() throws Exception { + // Test that our manually constructed list of ZIP specializations + // in ZipParser is current with TikaLoader's media type registry. 
MediaTypeRegistry mediaTypeRegistry = TikaLoader.getMediaTypeRegistry(); - Set<MediaType> currentSpecializations = new HashSet<>(); - MediaType tar = MediaType.parse("application/x-tar"); + Set<MediaType> currentZipSpecializations = new HashSet<>(); for (MediaType type : mediaTypeRegistry.getTypes()) { - if (mediaTypeRegistry.isSpecializationOf(type, MediaType.APPLICATION_ZIP) || - mediaTypeRegistry.isSpecializationOf(type, tar)) { - currentSpecializations.add(type); -// System.out.println("\""+type.toString()+"\","); + if (mediaTypeRegistry.isSpecializationOf(type, MediaType.APPLICATION_ZIP)) { + currentZipSpecializations.add(type); } } - for (MediaType mediaType : currentSpecializations) { - assertTrue(PackageParser.PACKAGE_SPECIALIZATIONS.contains(mediaType), + for (MediaType mediaType : currentZipSpecializations) { + assertTrue(ZipParser.ZIP_SPECIALIZATIONS.contains(mediaType), "missing: " + mediaType); } - assertEquals(currentSpecializations.size(), PackageParser.PACKAGE_SPECIALIZATIONS.size()); + assertEquals(currentZipSpecializations.size(), ZipParser.ZIP_SPECIALIZATIONS.size()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java index 47292d27e2..9652f05643 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java @@ -17,23 +17,29 @@ package org.apache.tika.parser.pkg; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import java.io.ByteArrayOutputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + import org.apache.commons.codec.binary.Base64; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; +import org.junit.jupiter.api.io.TempDir; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.Zip; +import org.apache.tika.parser.ParseContext; /** * Test case for parsing zip files. 
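For readers unfamiliar with the registry calls used in testZipSpecializations above, the check reduces to this loop (the same API calls as the test; printing instead of asserting, which is handy for regenerating the hard-coded list in ZipParser):

    MediaTypeRegistry registry = TikaLoader.getMediaTypeRegistry();
    for (MediaType type : registry.getTypes()) {
        if (registry.isSpecializationOf(type, MediaType.APPLICATION_ZIP)) {
            System.out.println("\"" + type.toString() + "\",");
        }
    }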
@@ -46,40 +52,32 @@ public class ZipParserTest extends AbstractPkgTest { */ @Test public void testEmbedded() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip"); - try (TikaInputStream tis = getResourceAsStream("/test-documents/test-documents.zip")) { - AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext); - } + // First metadata is the container, rest are embedded documents + // With recursive parsing, we get more than 10 entries due to nested documents + // (e.g., ODT, PPT, DOC contain embedded resources) + assertTrue(metadataList.size() >= 10, "Expected at least 10 metadata entries"); - // Should have found all 9 documents - assertEquals(9, tracker.filenames.size()); - assertEquals(9, tracker.mediatypes.size()); - assertEquals(9, tracker.modifiedAts.size()); - - // Should have names and modified dates, but not content types, - // as zip doesn't store the content types - assertEquals("testEXCEL.xls", tracker.filenames.get(0)); - assertEquals("testHTML.html", tracker.filenames.get(1)); - assertEquals("testOpenOffice2.odt", tracker.filenames.get(2)); - assertEquals("testPDF.pdf", tracker.filenames.get(3)); - assertEquals("testPPT.ppt", tracker.filenames.get(4)); - assertEquals("testRTF.rtf", tracker.filenames.get(5)); - assertEquals("testTXT.txt", tracker.filenames.get(6)); - assertEquals("testWORD.doc", tracker.filenames.get(7)); - assertEquals("testXML.xml", tracker.filenames.get(8)); - - for (String type : tracker.mediatypes) { - assertNull(type); - } - for (String crt : tracker.createdAts) { - assertNull(crt); - } - for (String mod : tracker.modifiedAts) { - assertNotNull(mod); - assertTrue(mod.startsWith("20"), "Modified at " + mod); + // Collect all resource names for verification + List<String> resourceNames = new java.util.ArrayList<>(); + for (Metadata m : metadataList) { + String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (name != null) { + resourceNames.add(name); + } } + + // Should contain all 9 direct embedded files from the ZIP + assertContains("testEXCEL.xls", resourceNames); + assertContains("testHTML.html", resourceNames); + assertContains("testOpenOffice2.odt", resourceNames); + assertContains("testPDF.pdf", resourceNames); + assertContains("testPPT.ppt", resourceNames); + assertContains("testRTF.rtf", resourceNames); + assertContains("testTXT.txt", resourceNames); + assertContains("testWORD.doc", resourceNames); + assertContains("testXML.xml", resourceNames); } /** @@ -98,19 +96,50 @@ public class ZipParserTest extends AbstractPkgTest { @Test // TIKA-936 public void testCustomEncoding() throws Exception { - ArchiveStreamFactory factory = new ArchiveStreamFactory("SJIS"); - trackingContext.set(ArchiveStreamFactory.class, factory); + ZipParserConfig config = new ZipParserConfig(); + config.setEntryEncoding(Charset.forName("SJIS")); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList; + try (TikaInputStream tis = TikaInputStream.get(Base64.decodeBase64( + "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { + metadataList = getRecursiveMetadata(tis, new Metadata(), context, false); + } + // Container + 1 embedded document + assertEquals(2, 
metadataList.size()); + assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", + metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + } + + @Test + public void testCharsetAutoDetectionDisabled() throws Exception { + // Test that disabling charset detection leaves non-UTF8 names as-is (garbled) + ZipParserConfig config = new ZipParserConfig(); + config.setDetectCharsetsInEntryNames(false); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList; try (TikaInputStream tis = TikaInputStream.get(Base64.decodeBase64( "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { - AUTO_DETECT_PARSER.parse(tis, new DefaultHandler(), new Metadata(), trackingContext); + metadataList = getRecursiveMetadata(tis, new Metadata(), context, false); } - assertEquals(1, tracker.filenames.size()); - assertEquals("\u65E5\u672C\u8A9E\u30E1\u30E2.txt", tracker.filenames.get(0)); + // Container + 1 embedded document + assertEquals(2, metadataList.size()); + String name = metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY); + // With detection disabled, the SJIS bytes are interpreted as default charset (garbled) + // The correct Japanese name is 日本語メモ.txt - verify we DON'T get that + assertTrue(!"\u65E5\u672C\u8A9E\u30E1\u30E2.txt".equals(name), + "With detection disabled, SJIS name should NOT be correctly decoded"); } @Test @@ -138,23 +167,334 @@ public class ZipParserTest extends AbstractPkgTest { @Test public void testZipUsingStoredWithDataDescriptor() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - try (TikaInputStream tis = getResourceAsStream( - "/test-documents/testZip_with_DataDescriptor.zip")) { - AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext); - - assertEquals(5, tracker.filenames.size()); - assertEquals("en0", tracker.filenames.get(0)); - assertEquals("en1", tracker.filenames.get(1)); - assertEquals("en2", tracker.filenames.get(2)); - assertEquals("en3", tracker.filenames.get(3)); - assertEquals("en4", tracker.filenames.get(4)); - assertEquals(1, tracker.lastSeenStart[0]); - assertEquals(2, tracker.lastSeenStart[1]); - assertEquals(3, tracker.lastSeenStart[2]); - assertEquals(4, tracker.lastSeenStart[3]); + List<Metadata> metadataList = getRecursiveMetadata("testZip_with_DataDescriptor.zip"); + + // Container + 5 embedded documents + assertEquals(6, metadataList.size()); + assertEquals("en0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en1", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en2", metadataList.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en3", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("en4", metadataList.get(5).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + + // This ZIP with DATA_DESCRIPTOR is salvaged and parsed with file-based access + // Integrity check can compare central directory vs local headers + Metadata containerMetadata = metadataList.get(0); + assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + } + + @Test + public void testIntegrityCheckPass() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip"); + + // Normal ZIP with file-based access should 
pass integrity check + Metadata containerMetadata = metadataList.get(0); + assertEquals("PASS", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + assertNull(containerMetadata.get(Zip.DUPLICATE_ENTRY_NAMES)); + assertNull(containerMetadata.get(Zip.CENTRAL_DIRECTORY_ONLY_ENTRIES)); + assertNull(containerMetadata.get(Zip.LOCAL_HEADER_ONLY_ENTRIES)); + } + + @Test + public void testIntegrityCheckDisabled() throws Exception { + ZipParserConfig config = new ZipParserConfig(); + config.setIntegrityCheck(false); + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + List<Metadata> metadataList = getRecursiveMetadata("test-documents.zip", context); + + // Integrity check disabled - no result should be set + Metadata containerMetadata = metadataList.get(0); + assertNull(containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + } + + @Test + public void testIntegrityCheckHiddenEntry(@TempDir Path tempDir) throws Exception { + // Create a ZIP with a hidden entry (in local headers but not central directory) + Path zipPath = tempDir.resolve("hidden-entry.zip"); + byte[] zipBytes = createZipWithHiddenEntry(); + Files.write(zipPath, zipBytes); + + List<Metadata> metadataList = getRecursiveMetadata(zipPath, false); + + Metadata containerMetadata = metadataList.get(0); + assertEquals("FAIL", containerMetadata.get(Zip.INTEGRITY_CHECK_RESULT)); + String[] localOnly = containerMetadata.getValues(Zip.LOCAL_HEADER_ONLY_ENTRIES); + assertEquals(1, localOnly.length); + assertEquals("hidden.txt", localOnly[0]); + } + + /** + * Creates a ZIP file with an entry that exists in local headers but not in the + * central directory. This simulates a hidden/smuggled entry attack. + */ + private byte[] createZipWithHiddenEntry() throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + // Entry 1: visible.txt (will be in both local header and central directory) + byte[] visible = "visible content".getBytes(StandardCharsets.UTF_8); + // Entry 2: hidden.txt (will be in local header ONLY - not in central directory) + byte[] hidden = "hidden content".getBytes(StandardCharsets.UTF_8); + + // Local file header for visible.txt + int visibleLocalOffset = baos.size(); + writeLocalFileHeader(baos, "visible.txt", visible); + + // Local file header for hidden.txt (this won't have a central directory entry) + writeLocalFileHeader(baos, "hidden.txt", hidden); + + // Central directory - only includes visible.txt + int centralDirOffset = baos.size(); + writeCentralDirectoryEntry(baos, "visible.txt", visible, visibleLocalOffset); + + // End of central directory + int centralDirSize = baos.size() - centralDirOffset; + writeEndOfCentralDirectory(baos, 1, centralDirSize, centralDirOffset); + + return baos.toByteArray(); + } + + private void writeLocalFileHeader(ByteArrayOutputStream baos, String name, byte[] content) + throws Exception { + byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + + // Local file header signature + writeInt(baos, 0x04034b50); + // Version needed + writeShort(baos, 10); + // General purpose bit flag + writeShort(baos, 0); + // Compression method (0 = stored) + writeShort(baos, 0); + // Last mod time/date + writeShort(baos, 0); + writeShort(baos, 0); + // CRC-32 + writeInt(baos, (int) computeCrc32(content)); + // Compressed size + writeInt(baos, content.length); + // Uncompressed size + writeInt(baos, content.length); + // File name length + writeShort(baos, nameBytes.length); + // Extra field length + writeShort(baos, 0); + // File name + 
baos.write(nameBytes); + // File data + baos.write(content); + } + + private void writeCentralDirectoryEntry(ByteArrayOutputStream baos, String name, + byte[] content, int localHeaderOffset) throws Exception { + byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); + + // Central directory file header signature + writeInt(baos, 0x02014b50); + // Version made by + writeShort(baos, 20); + // Version needed + writeShort(baos, 10); + // General purpose bit flag + writeShort(baos, 0); + // Compression method + writeShort(baos, 0); + // Last mod time/date + writeShort(baos, 0); + writeShort(baos, 0); + // CRC-32 + writeInt(baos, (int) computeCrc32(content)); + // Compressed size + writeInt(baos, content.length); + // Uncompressed size + writeInt(baos, content.length); + // File name length + writeShort(baos, nameBytes.length); + // Extra field length + writeShort(baos, 0); + // File comment length + writeShort(baos, 0); + // Disk number start + writeShort(baos, 0); + // Internal file attributes + writeShort(baos, 0); + // External file attributes + writeInt(baos, 0); + // Relative offset of local header + writeInt(baos, localHeaderOffset); + // File name + baos.write(nameBytes); + } + + private void writeEndOfCentralDirectory(ByteArrayOutputStream baos, int numEntries, + int centralDirSize, int centralDirOffset) { + // End of central directory signature + writeInt(baos, 0x06054b50); + // Disk number + writeShort(baos, 0); + // Disk number with central directory + writeShort(baos, 0); + // Number of entries on this disk + writeShort(baos, numEntries); + // Total number of entries + writeShort(baos, numEntries); + // Size of central directory + writeInt(baos, centralDirSize); + // Offset of central directory + writeInt(baos, centralDirOffset); + // Comment length + writeShort(baos, 0); + } + + private void writeInt(ByteArrayOutputStream baos, int value) { + baos.write(value & 0xff); + baos.write((value >> 8) & 0xff); + baos.write((value >> 16) & 0xff); + baos.write((value >> 24) & 0xff); + } + + private void writeShort(ByteArrayOutputStream baos, int value) { + baos.write(value & 0xff); + baos.write((value >> 8) & 0xff); + } + + private long computeCrc32(byte[] data) { + java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(data); + return crc.getValue(); + } + + /** + * Microbenchmark to measure the performance impact of integrity checking. + * This test is disabled by default - remove the assumeTrue to run it. + * + * WARNING: The large ZIP test creates a multi-GB file and takes significant time. 
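The little-endian helpers above are enough to assemble a minimal well-formed archive. A sketch that round-trips a single entry through java.util.zip to sanity-check the record layout (uses the write* helpers defined above):

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] content = "hello".getBytes(StandardCharsets.UTF_8);
    writeLocalFileHeader(baos, "hello.txt", content);              // local header at offset 0
    int cdOffset = baos.size();
    writeCentralDirectoryEntry(baos, "hello.txt", content, 0);
    writeEndOfCentralDirectory(baos, 1, baos.size() - cdOffset, cdOffset);

    Path zip = Files.createTempFile("roundtrip", ".zip");
    Files.write(zip, baos.toByteArray());
    try (java.util.zip.ZipFile zf = new java.util.zip.ZipFile(zip.toFile())) {
        assertEquals("hello.txt", zf.entries().nextElement().getName());
    }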
+ */ + @Test + public void benchmarkIntegrityCheck(@TempDir Path tempDir) throws Exception { + // Skip by default - set this to true to run the benchmark + assumeTrue(false, "Benchmark disabled by default - set to true to run"); + + int iterations = 20; + int warmupIterations = 3; + + // Create small ZIP (10 entries, ~1KB each) - ~10KB total + Path smallZip = tempDir.resolve("small.zip"); + System.out.println("Creating small ZIP (10 entries, ~10KB)..."); + createBenchmarkZip(smallZip, 10, 1024); + System.out.println(" Created: " + Files.size(smallZip) / 1024 + " KB"); + + // Create medium ZIP (1000 entries, ~100KB each) - ~100MB total + Path mediumZip = tempDir.resolve("medium.zip"); + System.out.println("Creating medium ZIP (1000 entries, ~100MB)..."); + createBenchmarkZip(mediumZip, 1000, 100 * 1024); + System.out.println(" Created: " + Files.size(mediumZip) / (1024 * 1024) + " MB"); + + // Create large ZIP (5000 entries, ~500KB each) - ~2.5GB total + Path largeZip = tempDir.resolve("large.zip"); + System.out.println("Creating large ZIP (5000 entries, ~2.5GB)..."); + createBenchmarkZip(largeZip, 5000, 500 * 1024); + System.out.println(" Created: " + Files.size(largeZip) / (1024 * 1024) + " MB"); + + System.out.println(); + System.out.println("=== Integrity Check Benchmark ==="); + System.out.println("Iterations: " + iterations + " (warmup: " + warmupIterations + ")"); + System.out.println(); + + // Benchmark small ZIP + System.out.println("Small ZIP (10 entries, ~10KB):"); + runBenchmark(smallZip, iterations, warmupIterations); + + System.out.println(); + + // Benchmark medium ZIP + System.out.println("Medium ZIP (1000 entries, ~100MB):"); + runBenchmark(mediumZip, 10, 2); + + System.out.println(); + + // Benchmark large ZIP + System.out.println("Large ZIP (5000 entries, ~2.5GB):"); + runBenchmark(largeZip, 5, 1); + } + + private void createBenchmarkZip(Path zipPath, int numEntries, int entrySize) throws Exception { + try (java.util.zip.ZipOutputStream zos = + new java.util.zip.ZipOutputStream(Files.newOutputStream(zipPath))) { + // Use STORED to avoid compression - we want actual file size + zos.setMethod(java.util.zip.ZipOutputStream.STORED); + + // Use random data to prevent any accidental compression + java.util.Random random = new java.util.Random(42); + byte[] content = new byte[entrySize]; + random.nextBytes(content); + + for (int i = 0; i < numEntries; i++) { + java.util.zip.ZipEntry entry = new java.util.zip.ZipEntry("entry" + i + ".txt"); + entry.setMethod(java.util.zip.ZipEntry.STORED); + entry.setSize(content.length); + entry.setCompressedSize(content.length); + entry.setCrc(computeCrc32(content)); + zos.putNextEntry(entry); + zos.write(content); + zos.closeEntry(); + } + } + } + + private void runBenchmark(Path zipPath, int iterations, int warmupIterations) throws Exception { + ZipParser parser = new ZipParser(); + + // Config with integrity check enabled + ZipParserConfig configWithCheck = new ZipParserConfig(); + configWithCheck.setIntegrityCheck(true); + + // Config with integrity check disabled + ZipParserConfig configWithoutCheck = new ZipParserConfig(); + configWithoutCheck.setIntegrityCheck(false); + + // Warmup - with integrity check + for (int i = 0; i < warmupIterations; i++) { + parseZip(parser, zipPath, configWithCheck); + } + + // Warmup - without integrity check + for (int i = 0; i < warmupIterations; i++) { + parseZip(parser, zipPath, configWithoutCheck); + } + + // Benchmark with integrity check + long startWithCheck = System.nanoTime(); + for (int i = 0; i < 
iterations; i++) { + parseZip(parser, zipPath, configWithCheck); + } + long durationWithCheck = System.nanoTime() - startWithCheck; + + // Benchmark without integrity check + long startWithoutCheck = System.nanoTime(); + for (int i = 0; i < iterations; i++) { + parseZip(parser, zipPath, configWithoutCheck); + } + long durationWithoutCheck = System.nanoTime() - startWithoutCheck; + + double avgWithCheck = durationWithCheck / (double) iterations / 1_000_000.0; + double avgWithoutCheck = durationWithoutCheck / (double) iterations / 1_000_000.0; + double overhead = avgWithCheck - avgWithoutCheck; + double overheadPercent = (overhead / avgWithoutCheck) * 100; + + System.out.printf(" Without integrity check: %.3f ms/parse%n", avgWithoutCheck); + System.out.printf(" With integrity check: %.3f ms/parse%n", avgWithCheck); + System.out.printf(" Overhead: %.3f ms (%.1f%%)%n", overhead, overheadPercent); + } + + private void parseZip(ZipParser parser, Path zipPath, ZipParserConfig config) throws Exception { + ParseContext context = new ParseContext(); + context.set(ZipParserConfig.class, config); + + try (TikaInputStream tis = TikaInputStream.get(zipPath)) { + Metadata metadata = new Metadata(); + parser.parse(tis, new org.xml.sax.helpers.DefaultHandler(), metadata, context); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 23464be288..f23063409f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -42,8 +42,10 @@ import org.apache.tika.detect.DetectHelper; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Zip; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; +import org.apache.tika.zip.utils.ZipSalvager; /** * This class is designed to detect subtypes of zip-based file formats. @@ -198,48 +200,52 @@ public class DefaultZipContainerDetector implements Detector { * This will call TikaInputStream's getFile(). If there are no exceptions, * it will place the ZipFile in TikaInputStream's openContainer and leave it * open. 
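The hand-off described in this javadoc is exactly what the new detector-hint properties formalize. A downstream parser consumes them like so (a sketch mirroring the dispatch in ZipParser.parse; ZipFile here is the commons-compress class):

    if (tis.getOpenContainer() instanceof ZipFile) {
        ZipFile zip = (ZipFile) tis.getOpenContainer();  // reuse, never reopen
        // ... file-based parsing ...
    } else if ("false".equals(metadata.get(Zip.DETECTOR_ZIPFILE_OPENED))) {
        boolean needDataDescriptor =
                "true".equals(metadata.get(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED));
        // ... go straight to streaming, with data-descriptor support if hinted ...
    }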
+ * <p> + * Sets detector hints in metadata for the parser: + * <ul> + * <li>{@link Zip#DETECTOR_ZIPFILE_OPENED} - true if ZipFile opened successfully</li> + * <li>{@link Zip#DETECTOR_DATA_DESCRIPTOR_REQUIRED} - true if streaming needed data descriptor support</li> + * </ul> * - * @param tis - * @return + * @param tis the TikaInputStream + * @param metadata the metadata (will be updated with detector hints) + * @param parseContext the parse context + * @return the detected media type */ private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata, ParseContext parseContext) { - ZipFile zip = null; - try { - zip = ZipFile.builder().setFile(tis.getFile()).get(); - - for (ZipContainerDetector zipDetector : getDetectors()) { - MediaType type = zipDetector.detect(zip, tis); - if (type != null) { - if (LOG.isDebugEnabled()) { - LOG.debug("{} detected {}", zipDetector.getClass(), - type.toString()); - } - //e.g. if OPCPackage has already been set - //don't overwrite it with the zip - if (tis.getOpenContainer() == null) { - tis.setOpenContainer(zip); + // Try to open ZipFile (with salvaging fallback) + ZipFile zip = ZipSalvager.tryToOpenZipFile(tis, metadata); + + if (zip != null) { + // ZipFile opened (directly or via salvaging) - run file-based detection + try { + for (ZipContainerDetector zipDetector : getDetectors()) { + MediaType type = zipDetector.detect(zip, tis); + if (type != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("{} detected {}", zipDetector.getClass(), type.toString()); + } + return type; } else { - tis.addCloseableResource(zip); - } - return type; - } else { - if (LOG.isDebugEnabled()) { - LOG.debug("{} detected null", zipDetector.getClass()); + if (LOG.isDebugEnabled()) { + LOG.debug("{} detected null", zipDetector.getClass()); + } } } + } catch (IOException e) { + // Detection failed - fall through to return plain ZIP + if (LOG.isDebugEnabled()) { + LOG.debug("Detection failed on opened ZipFile", e); + } } - } catch (IOException e) { - //do nothing - } - // Fallback: it's still a zip file, we just don't know what kind of one - if (zip != null) { - IOUtils.closeQuietly(zip); + // No specific type detected - it's a plain ZIP return MediaType.APPLICATION_ZIP; } + + // ZipFile failed to open even after salvaging - fall back to streaming detection if (LOG.isDebugEnabled()) { - LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise"); + LOG.debug("ZipFile and salvaging both failed; falling back to streaming detection"); } - //problem opening zip file (truncated?) 
try { return detectStreamingFromPath(tis.getPath(), metadata, false); } catch (IOException e) { @@ -265,6 +271,8 @@ public class DefaultZipContainerDetector implements Detector { } catch (UnsupportedZipFeatureException zfe) { if (allowStoredEntries == false && zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + // Set hint for parser that DATA_DESCRIPTOR support is required + metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true); input.reset(); return detectStreaming(input, metadata, true); } @@ -295,6 +303,8 @@ public class DefaultZipContainerDetector implements Detector { } catch (UnsupportedZipFeatureException zfe) { if (allowStoredEntries == false && zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + // Set hint for parser that DATA_DESCRIPTOR support is required + metadata.set(Zip.DETECTOR_DATA_DESCRIPTOR_REQUIRED, true); return detectStreamingFromPath(p, metadata, true); } } catch (SecurityException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java index 52391bbf8c..1a5542895a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java @@ -17,81 +17,195 @@ package org.apache.tika.zip.utils; import java.io.EOFException; -import java.io.File; import java.io.IOException; -import java.io.InputStream; +import java.nio.charset.Charset; import java.nio.file.Files; +import java.nio.file.Path; import java.util.zip.ZipException; import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Zip; public class ZipSalvager { private static final Logger LOG = LoggerFactory.getLogger(ZipSalvager.class); /** - * This streams the broken zip and rebuilds a new zip that - * is at least a valid zip file. The contents of the final stream - * may be truncated, but the result should be a valid zip file. + * Tries to open a ZipFile from the TikaInputStream. If direct opening fails, + * attempts to salvage the ZIP and open the salvaged version. * <p> - * This does nothing fancy to fix the underlying broken zip. 
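The salvager's contract is deliberately modest, as the javadoc says: it rebuilds a structurally valid zip, possibly with truncated contents. In use it is a one-call repair, shown here with the Path-based convenience overload added at the end of this patch (file names illustrative):

    Path broken = Paths.get("truncated.zip");
    Path repaired = Files.createTempFile("salvaged", ".zip");
    ZipSalvager.salvageCopy(broken, repaired);  // streams local headers, writes a valid zip
    try (ZipFile zf = ZipFile.builder().setPath(repaired).get()) {
        zf.getEntries().asIterator().forEachRemaining(e -> System.out.println(e.getName()));
    }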
- * <p> - * This will close the inputstream + * On success: + * <ul> + * <li>Sets {@link Zip#DETECTOR_ZIPFILE_OPENED} to true in metadata</li> + * <li>Stores the ZipFile in tis.openContainer (if not already set)</li> + * <li>Returns the opened ZipFile</li> + * </ul> + * On failure: + * <ul> + * <li>Sets {@link Zip#DETECTOR_ZIPFILE_OPENED} to false in metadata</li> + * <li>Returns null</li> + * </ul> * - * @param brokenZip - * @param salvagedZip - * @param allowStoredEntries - * @throws java.io.IOException + * @param tis the TikaInputStream (must be file-backed) + * @param metadata the metadata to update with hints + * @param charset optional charset for entry names (may be null) + * @return the opened ZipFile, or null if opening and salvaging both failed */ - public static void salvageCopy(InputStream brokenZip, File salvagedZip, - boolean allowStoredEntries) throws IOException { + public static ZipFile tryToOpenZipFile(TikaInputStream tis, Metadata metadata, Charset charset) { + // First, try direct open + try { + ZipFile.Builder builder = new ZipFile.Builder().setFile(tis.getFile()); + if (charset != null) { + builder.setCharset(charset); + } + ZipFile zipFile = builder.get(); + + // Direct open succeeded + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, true); + if (tis.getOpenContainer() == null) { + tis.setOpenContainer(zipFile); + } else { + tis.addCloseableResource(zipFile); + } + return zipFile; + } catch (IOException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("ZipFile failed to open directly; attempting to salvage", e); + } + } - TikaInputStream tis = TikaInputStream.get(brokenZip); - // Enable rewind capability for retry on DATA_DESCRIPTOR feature - tis.enableRewind(); + // Direct open failed - try salvaging try { - try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip); - ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream( - CloseShieldInputStream.wrap(tis), "UTF8", false, - allowStoredEntries)) { - ZipArchiveEntry zae = zipArchiveInputStream.getNextEntry(); + final Path salvagedPath = Files.createTempFile("tika-salvaged-", ".zip"); + tis.enableRewind(); + salvageCopy(tis, salvagedPath, false); + tis.rewind(); + + ZipFile.Builder builder = new ZipFile.Builder().setPath(salvagedPath); + if (charset != null) { + builder.setCharset(charset); + } + ZipFile salvagedZip = builder.get(); + + // Salvaging succeeded + if (LOG.isDebugEnabled()) { + LOG.debug("Successfully salvaged ZIP to {}", salvagedPath); + } + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, true); + metadata.set(Zip.SALVAGED, true); + + // Add file deletion FIRST so it runs AFTER ZipFile is closed + // (TemporaryResources uses LIFO order) + tis.addCloseableResource(() -> { try { - processZAE(zae, zipArchiveInputStream, outputStream); - } catch (UnsupportedZipFeatureException uzfe) { - if (uzfe.getFeature() == - UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { - //percolate up to allow for retry - throw uzfe; - } - //else swallow - } catch (ZipException | EOFException e) { - //swallow + Files.deleteIfExists(salvagedPath); + } catch (IOException e) { + LOG.warn("Failed to delete salvaged temp file: {}", salvagedPath, e); + salvagedPath.toFile().deleteOnExit(); } - outputStream.flush(); - outputStream.finish(); - } catch (UnsupportedZipFeatureException e) { - //now retry - if (allowStoredEntries == false && - e.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { - tis.rewind(); - salvageCopy(tis, salvagedZip, true); - } else { - throw e; + }); + + // 
Then add ZipFile (will be closed before file deletion runs) + if (tis.getOpenContainer() == null) { + tis.setOpenContainer(salvagedZip); + } else { + tis.addCloseableResource(salvagedZip); + } + return salvagedZip; + } catch (IOException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("Salvaging failed", e); + } + } + + // Both direct open and salvaging failed + metadata.set(Zip.DETECTOR_ZIPFILE_OPENED, false); + return null; + } + + /** + * Tries to open a ZipFile from the TikaInputStream using default charset. + * + * @see #tryToOpenZipFile(TikaInputStream, Metadata, Charset) + */ + public static ZipFile tryToOpenZipFile(TikaInputStream tis, Metadata metadata) { + return tryToOpenZipFile(tis, metadata, null); + } + + /** + * Streams the broken zip and rebuilds a new zip that is at least a valid zip file. + * The contents of the final stream may be truncated, but the result should be a valid zip file. + * <p> + * This does nothing fancy to fix the underlying broken zip. + * <p> + * This method does NOT close the TikaInputStream - the caller owns it. + * The caller should call {@code tis.enableRewind()} before calling this method + * if retry on DATA_DESCRIPTOR is needed. + * + * @param tis the TikaInputStream to read from (not closed by this method) + * @param salvagedZip the output path for the salvaged ZIP + * @param allowStoredEntries whether to allow stored entries with data descriptors + * @throws IOException if salvaging fails + */ + public static void salvageCopy(TikaInputStream tis, Path salvagedZip, + boolean allowStoredEntries) throws IOException { + try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(Files.newOutputStream(salvagedZip)); + ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream( + CloseShieldInputStream.wrap(tis), "UTF8", false, + allowStoredEntries)) { + ZipArchiveEntry zae = zipArchiveInputStream.getNextEntry(); + try { + processZAE(zae, zipArchiveInputStream, outputStream); + } catch (UnsupportedZipFeatureException uzfe) { + if (uzfe.getFeature() == + UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + //percolate up to allow for retry + throw uzfe; } - } catch (IOException e) { - LOG.warn("problem fixing zip", e); + //else swallow + } catch (ZipException | EOFException e) { + //swallow } - } finally { - tis.close(); + outputStream.flush(); + outputStream.finish(); + } catch (UnsupportedZipFeatureException e) { + //now retry with data descriptor support + if (!allowStoredEntries && + e.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + tis.rewind(); + salvageCopy(tis, salvagedZip, true); + } else { + throw e; + } + } catch (IOException e) { + LOG.warn("problem fixing zip", e); + } + } + + /** + * Streams a broken zip from a Path and rebuilds a valid zip file. + * <p> + * This is a convenience method that creates a TikaInputStream internally. 
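One detail worth underlining from tryToOpenZipFile above: temp-file cleanup depends on TikaInputStream releasing its closeable resources in LIFO order, so the registration order is load-bearing (sketch of the addCloseableResource branch):

    // Registered first => runs last: delete the salvaged temp file.
    tis.addCloseableResource(() -> Files.deleteIfExists(salvagedPath));
    // Registered second => runs first: close the ZipFile, releasing its
    // handle on salvagedPath so the deletion that follows can succeed.
    tis.addCloseableResource(salvagedZip);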
+ * + * @param brokenZip the path to the broken ZIP file + * @param salvagedZip the path for the salvaged ZIP output + * @throws IOException if salvaging fails + */ + public static void salvageCopy(Path brokenZip, Path salvagedZip) throws IOException { + try (TikaInputStream tis = TikaInputStream.get(brokenZip)) { + tis.enableRewind(); + salvageCopy(tis, salvagedZip, false); } } @@ -122,10 +236,4 @@ public class ZipSalvager { zae = zipArchiveInputStream.getNextEntry(); } } - - public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException { - try (InputStream is = Files.newInputStream(brokenZip.toPath())) { - salvageCopy(is, salvagedZip, false); - } - } }
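Putting the patch together end to end, for an archive that may be corrupt (file name illustrative):

    try (TikaInputStream tis = TikaInputStream.get(Paths.get("maybe-corrupt.zip"))) {
        Metadata metadata = new Metadata();
        // With no prior detection, ZipParser.parse() calls
        // ZipSalvager.tryToOpenZipFile() itself: direct open, then salvage,
        // then streaming as the last resort.
        new ZipParser().parse(tis, new org.xml.sax.helpers.DefaultHandler(),
                metadata, new ParseContext());
        System.out.println("salvaged=" + metadata.get(Zip.SALVAGED)
                + " integrity=" + metadata.get(Zip.INTEGRITY_CHECK_RESULT));
    }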