[tika] branch main updated: Fix TIKA-3196 (#364)

tallison Fri, 25 Sep 2020 07:05:42 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new aba3e43  Fix TIKA-3196 (#364)
aba3e43 is described below

commit aba3e433510f02300ff627df74d09fdfb372cf38
Author: Lee <[email protected]>
AuthorDate: Fri Sep 25 22:04:54 2020 +0800

    Fix TIKA-3196 (#364)
    
    When reading a zip archive entry with STORED and Data Descriptor, an
    UnsupportedZipFeatureException is thrown. We can save the number of
    entries we have already read, reset the stream, and open the
    ZipArchiveInputStream again with Data Descriptor allowed. Then we can
    finish reading the rest of the entries.
---
 .../org/apache/tika/parser/pkg/PackageParser.java  |  87 +++++++++++++++++++--
 .../org/apache/tika/parser/pkg/ZipParserTest.java  |  37 +++++++--
 .../test-documents/testZip_with_DataDescriptor.zip | Bin 0 -> 484 bytes
 3 files changed, 110 insertions(+), 14 deletions(-)

diff --git 
a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 
b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b1b86a2..b099aff 100644
--- 
a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ 
b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.zip.ZipEntry;
 
 import org.apache.commons.compress.PasswordRequiredException;
 import org.apache.commons.compress.archivers.ArchiveEntry;
@@ -101,6 +102,12 @@ public class PackageParser extends AbstractParser {
     static final Set<MediaType> PACKAGE_SPECIALIZATIONS =
             loadPackageSpecializations();
 
+    // the mark limit used for stream
+    private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
+
+    // count of the entries in the archive, this is used for zip requires Data 
Descriptor
+    private int entryCnt = 0;
+
     static final Set<MediaType> loadPackageSpecializations() {
         Set<MediaType> zipSpecializations = new HashSet<>();
         for (String mediaTypeString : new String[]{
@@ -211,8 +218,10 @@ public class PackageParser extends AbstractParser {
         
         TemporaryResources tmp = new TemporaryResources();
         ArchiveInputStream ais = null;
+        String encoding = null;
         try {
             ArchiveStreamFactory factory = 
context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+            encoding = factory.getEntryEncoding();
             // At the end we want to close the archive stream to release
             // any associated resources, but the underlying document stream
             // should not be closed
@@ -262,29 +271,87 @@ public class PackageParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
+        // mark before we start parsing entries for potential reset
+        stream.mark(MARK_LIMIT);
+        entryCnt = 0;
+        try {
+            parseEntries(false, ais, metadata, extractor, xhtml);
+        } catch (UnsupportedZipFeatureException zfe) {
+            // If this is a zip archive which requires a data descriptor, 
parse it again
+            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
+                // Close archive input stream and create a new one that could 
handle data descriptor
+                ais.close();
+                // An exception would be thrown if MARK_LIMIT is not big enough
+                stream.reset();
+                ais = new ZipArchiveInputStream(new 
CloseShieldInputStream(stream), encoding, true, true);
+                parseEntries(true, ais, metadata, extractor, xhtml);
+            }
+        } finally {
+            ais.close();
+            tmp.close();
+            // reset the entryCnt
+            entryCnt = 0;
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Parse the entries of the zip archive
+     *
+     * @param shouldUseDataDescriptor indicates if a data descriptor is 
required or not
+     * @param ais archive input stream
+     * @param metadata document metadata (input and output)
+     * @param extractor the delegate parser
+     * @param xhtml the xhtml handler
+     * @throws TikaException if the document could not be parsed
+     * @throws IOException if a UnsupportedZipFeatureException is met
+     * @throws SAXException if the SAX events could not be processed
+     */
+    private void parseEntries(boolean shouldUseDataDescriptor, 
ArchiveInputStream ais, Metadata metadata,
+                              EmbeddedDocumentExtractor extractor, 
XHTMLContentHandler xhtml)
+            throws TikaException, IOException, SAXException {
         try {
             ArchiveEntry entry = ais.getNextEntry();
             while (entry != null) {
+                if (shouldUseDataDescriptor && entryCnt > 0) {
+                    // With shouldUseDataDescriptor being true, we are reading
+                    // the zip once again. The number of entryCnt entries have
+                    // already been parsed in the last time, so we can just
+                    // skip these entries.
+                    entryCnt--;
+                    entry = ais.getNextEntry();
+                    continue;
+                }
+
                 if (!entry.isDirectory()) {
                     parseEntry(ais, entry, extractor, metadata, xhtml);
                 }
+
+                if (!shouldUseDataDescriptor) {
+                    // Record the number of entries we have read, this is used
+                    // for zip archives using Data Descriptor. It's used for
+                    // skipping the entries we have already read
+                    entryCnt++;
+                }
+
                 entry = ais.getNextEntry();
             }
         } catch (UnsupportedZipFeatureException zfe) {
+
             // If it's an encrypted document of unknown password, report as 
such
             if (zfe.getFeature() == Feature.ENCRYPTION) {
                 throw new EncryptedDocumentException(zfe);
             }
+
+            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
+                throw zfe;
+            }
             // Otherwise throw the exception
             throw new TikaException("UnsupportedZipFeature", zfe);
         } catch (PasswordRequiredException pre) {
             throw new EncryptedDocumentException(pre);
-        } finally {
-            ais.close();
-            tmp.close();
         }
-
-        xhtml.endDocument();
     }
 
     private void updateMediaType(ArchiveInputStream ais, Metadata metadata) {
@@ -337,11 +404,19 @@ public class PackageParser extends AbstractParser {
         } else {
             name = (name == null) ? "" : name;
             if (entry instanceof ZipArchiveEntry) {
-                boolean usesEncryption = ((ZipArchiveEntry) 
entry).getGeneralPurposeBit().usesEncryption();
+                ZipArchiveEntry zipArchiveEntry = (ZipArchiveEntry) entry;
+                boolean usesEncryption = 
zipArchiveEntry.getGeneralPurposeBit().usesEncryption();
                 if (usesEncryption) {
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(
                             new EncryptedDocumentException("stream ("+name+") 
is encrypted"), parentMetadata);
                 }
+
+                // do not write to the handler if 
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR
+                // is met, we will catch this exception and read the zip 
archive once again
+                boolean usesDataDescriptor = 
zipArchiveEntry.getGeneralPurposeBit().usesDataDescriptor();
+                if (usesDataDescriptor && zipArchiveEntry.getMethod() == 
ZipEntry.STORED) {
+                    throw new 
UnsupportedZipFeatureException(UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR,
 zipArchiveEntry);
+                }
             } else {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(
                         new TikaException("Can't read archive stream 
("+name+")"), parentMetadata);
diff --git 
a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 
b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 32ac389..29ff990 100644
--- 
a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ 
b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -21,22 +21,22 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.Arrays;
+import java.util.zip.ZipEntry;
 
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
@@ -143,4 +143,25 @@ public class ZipParserTest extends AbstractPkgTest {
         getXML("droste.zip");
     }
 
+    @Test
+    public void testZipUsingStoredWithDataDescriptor() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+                "/test-documents/testZip_with_DataDescriptor.zip")) {
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
trackingContext);
+
+            assertEquals(5, tracker.filenames.size());
+            assertEquals("en0", tracker.filenames.get(0));
+            assertEquals("en1", tracker.filenames.get(1));
+            assertEquals("en2", tracker.filenames.get(2));
+            assertEquals("en3", tracker.filenames.get(3));
+            assertEquals("en4", tracker.filenames.get(4));
+            assertEquals(1, tracker.lastSeenStart[0]);
+            assertEquals(2, tracker.lastSeenStart[1]);
+            assertEquals(3, tracker.lastSeenStart[2]);
+            assertEquals(4, tracker.lastSeenStart[3]);
+        }
+    }
 }
diff --git 
a/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor.zip
 
b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor.zip
new file mode 100644
index 0000000..1f06197
Binary files /dev/null and 
b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor.zip
 differ

[tika] branch main updated: Fix TIKA-3196 (#364)

Reply via email to