This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new dd222a392 TIKA-4395 -- improve handling and logging in container detection (#2185)
dd222a392 is described below

commit dd222a39244f223e21ce8133601c8f54b86f2b65
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 10 08:42:16 2025 -0400

    TIKA-4395 -- improve handling and logging in container detection (#2185)
    
    TIKA-4395 -- improve handling and logging in container detection
    
    (cherry picked from commit 845493973c6ebefed8481369bc4719b19c519e53)
---
 .../java/org/apache/tika/io/TikaInputStream.java   |  75 ++++++------
 .../org/apache/tika/MultiThreadedTikaTest.java     |   1 -
 .../detect/microsoft/POIFSContainerDetector.java   |  89 +++++++++-----
 .../microsoft/POIFSContainerDetectorTest.java      |  90 ++++++++++++++
 .../src/test/resources/log4j2.xml                  |   3 +
 .../detect/zip/DefaultZipContainerDetector.java    |  47 +++++---
 .../apache/tika/detect/zip/ZipDetectionTest.java   | 129 +++++++++++++++++++++
 .../org/apache/tika/detect/zip/ZipParserTest.java  |  47 --------
 .../tika/config/TikaConfigSerializerTest.java      |   2 +-
 .../tika/detect/TestContainerAwareDetector.java    |   4 +
 10 files changed, 357 insertions(+), 130 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 77e09226a..1afa2d14e 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -692,52 +692,55 @@ public class TikaInputStream extends TaggedInputStream {
      * @throws IOException
      */
     public Path getPath(int maxBytes) throws IOException {
-        if (path == null) {
-            if (position > 0) {
-                throw new IOException("Stream is already being read");
-            } else {
-                Path tmpFile = tmp.createTempFile(suffix);
-                if (maxBytes > -1) {
-                    this.mark(maxBytes);
-                    try (BoundedInputStream boundedInputStream =
-                                 new BoundedInputStream(maxBytes, this)) {
+        if (path != null) {
+            return path;
+        }
+        if (position > 0) {
+            throw new IOException("Stream is already being read");
+        } else {
+            Path tmpFile = tmp.createTempFile(suffix);
+            if (maxBytes > -1) {
+                try (BoundedInputStream boundedInputStream = new 
BoundedInputStream(maxBytes, this)) {
+                    boundedInputStream.mark(maxBytes);
+                    try {
                         Files.copy(boundedInputStream, tmpFile, 
REPLACE_EXISTING);
                         if (boundedInputStream.hasHitBound()) {
                             //tmpFile will be cleaned up when this 
TikaInputStream is closed
                             return null;
                         }
                     } finally {
-                        this.reset();
+                        boundedInputStream.reset();
                     }
-                } else {
-                    // Spool the entire stream into a temporary file
-                    Files.copy(this, tmpFile, REPLACE_EXISTING);
                 }
-                //successful so far, set tis' path to tmpFile
-                path = tmpFile;
-
-                // Create a new input stream and make sure it'll get closed
-                InputStream newStream = Files.newInputStream(path);
-                tmp.addResource(newStream);
-
-                // Replace the spooled stream with the new stream in a way
-                // that still ends up closing the old stream if or when the
-                // close() method is called. The closing of the new stream
-                // is already being handled as noted above.
-                final InputStream oldStream = in;
-                in = new BufferedInputStream(newStream) {
-                    @Override
-                    public void close() throws IOException {
-                        oldStream.close();
-                    }
-                };
-
-                // Update length to file size. Update position, mark
-                length = Files.size(path);
-                position = 0;
-                mark = -1;
+            } else {
+                // Spool the entire stream into a temporary file
+                Files.copy(this, tmpFile, REPLACE_EXISTING);
             }
+            //successful so far, set tis' path to tmpFile
+            path = tmpFile;
+
+            // Create a new input stream and make sure it'll get closed
+            InputStream newStream = Files.newInputStream(path);
+            tmp.addResource(newStream);
+
+            // Replace the spooled stream with the new stream in a way
+            // that still ends up closing the old stream if or when the
+            // close() method is called. The closing of the new stream
+            // is already being handled as noted above.
+            final InputStream oldStream = in;
+            in = new BufferedInputStream(newStream) {
+                @Override
+                public void close() throws IOException {
+                    oldStream.close();
+                }
+            };
+
+            // Update length to file size. Update position, mark
+            length = Files.size(path);
+            position = 0;
+            mark = -1;
         }
+
         return path;
     }
 
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index fd3f381d4..ee87f9bf7 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -109,7 +109,6 @@ public class MultiThreadedTikaTest extends TikaTest {
                 baseline.put(f, new Extract(metadataList));
 
             } catch (Exception e) {
-                e.printStackTrace();
                 //swallow
             }
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 69be0361f..f0605a78d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.detect.microsoft;
 
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
 import static org.apache.tika.mime.MediaType.application;
 import static org.apache.tika.mime.MediaType.image;
 
@@ -38,6 +39,8 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.DocumentNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
@@ -247,8 +250,11 @@ public class POIFSContainerDetector implements Detector {
      */
     private static final Pattern mppDataMatch = 
Pattern.compile("\\s\\s\\s\\d+");
 
+    private static final Logger LOG = 
LoggerFactory.getLogger(POIFSContainerDetector.class);
+
+
     @Field
-    private int markLimit = 128 * 1024 * 1024;
+    private int markLimit = -1;
 
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
@@ -267,7 +273,7 @@ public class POIFSContainerDetector implements Detector {
      * @return
      */
     public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry 
root) {
-        if (anyCaseNames == null || anyCaseNames.size() == 0) {
+        if (anyCaseNames == null || anyCaseNames.isEmpty()) {
             return OLE;
         }
 
@@ -567,6 +573,8 @@ public class POIFSContainerDetector implements Detector {
 
         //if the stream was longer than markLimit, don't detect
         if (file == null) {
+            LOG.warn("File length exceeds marklimit. Skipping detection on 
this file. " +
+                    "If you need precise detection, consider increasing the 
marklimit or setting it to -1");
             return Collections.emptySet();
         }
 
@@ -581,6 +589,8 @@ public class POIFSContainerDetector implements Detector {
         } catch (IOException e) {
             // Parse error in POI, so we don't know the file type
             return Collections.emptySet();
+        } catch (SecurityException e) {
+            throw e;
         } catch (RuntimeException e) {
             // Another problem in POI
             return Collections.emptySet();
@@ -593,48 +603,67 @@ public class POIFSContainerDetector implements Detector {
             return MediaType.OCTET_STREAM;
         }
 
-        // If this is a TikaInputStream wrapping an already
-        // parsed NPOIFileSystem/DirectoryNode, just get the
-        // names from the root:
         TikaInputStream tis = TikaInputStream.cast(input);
-        Set<String> names = null;
         if (tis != null) {
-            Object container = tis.getOpenContainer();
-            if (container instanceof POIFSFileSystem) {
-                names = getTopLevelNames(((POIFSFileSystem) 
container).getRoot());
-            } else if (container instanceof DirectoryNode) {
-                names = getTopLevelNames((DirectoryNode) container);
-            }
+            return handleTikaStream(tis, metadata);
+        }
+        if (isOleHeader(input)) {
+            return OLE;
         }
+        return MediaType.OCTET_STREAM;
+    }
 
-        if (names == null) {
-            // Check if the document starts with the OLE header
-            input.mark(8);
-            try {
-                if (input.read() != 0xd0 || input.read() != 0xcf || 
input.read() != 0x11 ||
-                        input.read() != 0xe0 || input.read() != 0xa1 || 
input.read() != 0xb1 ||
-                        input.read() != 0x1a || input.read() != 0xe1) {
-                    return MediaType.OCTET_STREAM;
-                }
-            } catch (IOException e) {
-                return MediaType.OCTET_STREAM;
-            } finally {
-                input.reset();
-            }
+    private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) 
throws IOException {
+        //try for an open container
+        Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
+
+        //if that didn't work, confirm the bytes are OLE
+        if (names == null && ! isOleHeader(tis)) {
+            return OCTET_STREAM;
         }
 
-        // We can only detect the exact type when given a TikaInputStream
-        if (names == null && tis != null) {
-            // Look for known top level entry names to detect the document type
+        // If OLE, spool to disk
+        if (names == null) {
+            // spool to disk and try detection
             names = getTopLevelNames(tis);
         }
 
         // Detect based on the names (as available)
-        if (tis != null && tis.getOpenContainer() != null &&
+        if (tis.getOpenContainer() != null &&
                 tis.getOpenContainer() instanceof POIFSFileSystem) {
             return detect(names, ((POIFSFileSystem) 
tis.getOpenContainer()).getRoot());
         } else {
             return detect(names, null);
         }
     }
+
+    private boolean isOleHeader(InputStream input) throws IOException {
+        input.mark(8);
+        try {
+            return (input.read() == 0xd0 && input.read() == 0xcf && 
input.read() == 0x11 && input.read() == 0xe0 && input.read() == 0xa1 && 
input.read() == 0xb1 &&
+                    input.read() == 0x1a && input.read() == 0xe1);
+        } finally {
+            input.reset();
+        }
+    }
+
+
+    public static Set<String> tryOpenContainerOnTikaInputStream(InputStream 
input, Metadata metadata) {
+        // If this is a TikaInputStream wrapping an already
+        // parsed NPOIFileSystem/DirectoryNode, just get the
+        // names from the root:
+        TikaInputStream tis = TikaInputStream.cast(input);
+        Set<String> names = null;
+        if (tis != null) {
+            Object container = tis.getOpenContainer();
+            if (container instanceof POIFSFileSystem) {
+                return getTopLevelNames(((POIFSFileSystem) 
container).getRoot());
+            } else if (container instanceof DirectoryNode) {
+                return getTopLevelNames((DirectoryNode) container);
+            }
+        }
+        return null;
+    }
+
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
new file mode 100644
index 000000000..bb2785a0d
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.microsoft;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+public class POIFSContainerDetectorTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        String[] files =
+                new String[]{"testEXCEL.xls", "testWORD.doc", "testPPT.ppt", 
"testVISIO.vsd",
+                        "test-outlook.msg"};
+        String[] expected =
+                new String[]{
+                    "application/vnd.ms-excel", "application/msword", 
"application/vnd.ms-powerpoint",
+                        "application/vnd.visio", "application/vnd.ms-outlook"
+                };
+        for (String fileName : files) {
+            testStream(fileName, "application/x-tika-msoffice", -1);
+            testStream(fileName, "application/x-tika-msoffice", 0);
+            testStream(fileName, "application/x-tika-msoffice", 100);
+            testTikaInputStream(fileName, "application/x-tika-msoffice", 10);
+        }
+        for (int i = 0; i < files.length; i++) {
+            testTikaInputStream(files[i], expected[i], -1);
+        }
+    }
+
+    private void testStream(String fileName, String expectedMime, int 
markLimit) throws IOException {
+        String expectedDigest = digest(getStream(fileName));
+        POIFSContainerDetector detector = new POIFSContainerDetector();
+        detector.setMarkLimit(markLimit);
+        try (InputStream is = getStream(fileName)) {
+            assertExpected(detector, is, expectedMime, expectedDigest);
+        }
+    }
+
+    private void testTikaInputStream(String fileName, String expectedMime, int 
markLimit) throws IOException {
+        String expectedDigest = digest(getStream(fileName));
+        POIFSContainerDetector detector = new POIFSContainerDetector();
+        detector.setMarkLimit(markLimit);
+        try (InputStream is = TikaInputStream.get(getStream(fileName))) {
+            assertExpected(detector, is, expectedMime, expectedDigest);
+        }
+    }
+
+    private InputStream getStream(String fileName) {
+        return 
POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" + 
fileName);
+    }
+
+    private void assertExpected(Detector detector, InputStream is, String 
expectedMime, String expectedDigest) throws IOException {
+        MediaType mt = detector.detect(is, new Metadata());
+        assertEquals(expectedMime, mt.toString());
+        assertEquals(expectedDigest, digest(is));
+    }
+
+    private String digest(String fileName) throws IOException {
+        return 
digest(POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" 
+ fileName));
+    }
+
+    private String digest(InputStream is) throws IOException {
+        return DigestUtils.sha256Hex(is);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
index 1e9327e01..d609d7631 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
@@ -36,5 +36,8 @@
     <Logger name="org.apache.poi" level="ERROR" additivity="false">
       <AppenderRef ref="Console"/>
     </Logger>
+    <Logger name="org.apache.tika.detect.microsoft.POIFSContainerDetector" 
level="ERROR" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
   </Loggers>
 </Configuration>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 9adfe9ba0..2c2669b85 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -71,7 +71,7 @@ public class DefaultZipContainerDetector implements Detector {
     //this has to be > 100,000 to handle some of the iworks files
     //in our unit tests
     @Field
-    int markLimit = 16 * 1024 * 1024;
+    int markLimit = -1;//16 * 1024 * 1024;
 
     private transient ServiceLoader loader;
 
@@ -181,14 +181,16 @@ public class DefaultZipContainerDetector implements Detector {
 
             if (TikaInputStream.isTikaInputStream(input)) {
                 TikaInputStream tis = TikaInputStream.cast(input);
-                if (markLimit < 0) {
-                    tis.getFile();
-                }
-                if (tis.hasFile()) {
+                if (markLimit < 1 || tis.hasFile()) {
                     return detectZipFormatOnFile(tis, metadata);
+                } else {
+                    return tryStreaming(tis, metadata);
                 }
+            } else {
+                LOG.warn("Applying streaming detection in 
DefaultZipContainerDetector. " +
+                            "This can lead to imprecise detection. Please 
consider using a TikaInputStream");
+                return detectStreaming(input, metadata);
             }
-            return detectStreaming(input, metadata);
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
         } else {
@@ -196,6 +198,23 @@ public class DefaultZipContainerDetector implements Detector {
         }
     }
 
+    private MediaType tryStreaming(TikaInputStream tis, Metadata metadata) 
throws IOException {
+        BoundedInputStream boundedInputStream = new 
BoundedInputStream(markLimit, tis);
+        boundedInputStream.mark(markLimit);
+        MediaType mt = null;
+        //try streaming detect
+        try {
+            mt = detectStreaming(boundedInputStream, metadata, false);
+            if (! boundedInputStream.hasHitBound()) {
+                return mt;
+            }
+        } finally {
+            boundedInputStream.reset();
+        }
+        //spool to disk
+        return detectZipFormatOnFile(tis, metadata);
+    }
+
     /**
      * This will call TikaInputStream's getFile(). If there are no exceptions,
      * it will place the ZipFile in TikaInputStream's openContainer and leave 
it
@@ -207,7 +226,7 @@ public class DefaultZipContainerDetector implements Detector {
     private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata 
metadata) {
         ZipFile zip = null;
         try {
-            zip = ZipFile.builder().setFile(tis.getFile()).get(); // TODO: 
hasFile()?
+            zip = ZipFile.builder().setFile(tis.getFile()).get();
 
             for (ZipContainerDetector zipDetector : getDetectors()) {
                 MediaType type = zipDetector.detect(zip, tis);
@@ -239,15 +258,13 @@ public class DefaultZipContainerDetector implements Detector {
             return MediaType.APPLICATION_ZIP;
         }
         if (LOG.isDebugEnabled()) {
-            LOG.debug("zip file failed to open; attempting streaming detect");
+            LOG.debug("zip file failed to open; attempting streaming detect. 
Results may be imprecise");
         }
-        if (zip == null) {
-            //problem opening zip file (truncated?)
-            try (InputStream is = new 
BufferedInputStream(Files.newInputStream(tis.getPath()))) {
-                return detectStreaming(is, metadata);
-            } catch (IOException e) {
-                //swallow
-            }
+        //problem opening zip file (truncated?)
+        try (InputStream is = new 
BufferedInputStream(Files.newInputStream(tis.getPath()))) {
+            return detectStreaming(is, metadata, false);
+        } catch (IOException e) {
+            //swallow
         }
         return MediaType.APPLICATION_ZIP;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
new file mode 100644
index 000000000..a00bd6420
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.zip;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for detecting zip-based files.
+ */
+public class ZipDetectionTest extends TikaTest {
+
+
+    @Test
+    public void testKMZDetection() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
+        assertEquals("application/vnd.google-earth.kmz",
+                metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testJARDetection() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
+        assertEquals("application/java-archive", 
metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testStreaming() throws Exception {
+        String expectedDigest = digest("testJAR.jar");
+        DefaultZipContainerDetector detector = new 
DefaultZipContainerDetector();
+        try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+            assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+        }
+
+        for (int markLimit : new int[]{-1,0,10,100,1000}) {
+            detector = new DefaultZipContainerDetector();
+            //mark limit is ignored for a TikaInputStream
+            try (InputStream is = 
TikaInputStream.get(getStream("testJAR.jar"))) {
+                detector.setMarkLimit(markLimit);
+                assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+            }
+        }
+
+        detector = new DefaultZipContainerDetector();
+        //mark limit is ignored for a TikaInputStream
+        try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+            detector.setMarkLimit(-1);
+            assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        //try on a file that isn't a TikaInputStream
+        try (InputStream is = new 
BufferedInputStream(Files.newInputStream(TikaInputStream.get(getStream("testJAR.jar")).getPath())))
 {
+            assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        try (InputStream is = 
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(100);
+        try (InputStream is = 
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/zip", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(0);
+        try (InputStream is = 
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/zip", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(100000);
+        try (InputStream is = 
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/java-archive", 
expectedDigest);
+        }
+    }
+
+    private InputStream getStream(String fileName) {
+        return ZipDetectionTest.class.getResourceAsStream("/test-documents/" + 
fileName);
+    }
+
+    private void assertExpected(Detector detector, InputStream is, String 
expectedMime, String expectedDigest) throws IOException {
+        MediaType mt = detector.detect(is, new Metadata());
+        assertEquals(expectedMime, mt.toString());
+        assertEquals(expectedDigest, digest(is));
+
+    }
+
+    private String digest(String fileName) throws IOException {
+        return 
digest(ZipDetectionTest.class.getResourceAsStream("/test-documents/" + 
fileName));
+    }
+
+    private String digest(InputStream is) throws IOException {
+        return DigestUtils.sha256Hex(is);
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
deleted file mode 100644
index 2ed4c3572..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect.zip;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.HttpHeaders;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends TikaTest {
-
-
-    @Test
-    public void testKMZDetection() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
-        assertEquals("application/vnd.google-earth.kmz",
-                metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testJARDetection() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
-        assertEquals("application/java-archive", 
metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
-    }
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index d7313db6f..9ba4b4ab6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -46,7 +46,7 @@ public class TikaConfigSerializerTest {
         assertContains(encodingNeedle, xml);
 
         String detectorNeedle = "<detector 
class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" +
-                " <params> <param name=\"markLimit\" 
type=\"int\">16777216</param> </params>";
+                " <params> <param name=\"markLimit\" type=\"int\">-1</param> 
</params>";
         assertContains(detectorNeedle, xml);
 
         String parserNeedle = "<parser 
class=\"org.apache.tika.parser.pdf.PDFParser\">" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index d35df67bf..cb71b925e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -65,6 +65,10 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     private final StreamingZipContainerDetector streamingZipDetector =
             new StreamingZipContainerDetector();
 
+    TestContainerAwareDetector() {
+        streamingZipDetector.setMarkLimit(128 * 1024 * 1024);
+    }
+
     @AfterEach
     public void tearDown() throws TikaException {
         //make sure to reset pool size because it is being randomly resized 
during the tests

Reply via email to