This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new b3eb67fe6 TIKA-4424 -- fix regression in DefaultZipContainerDetector (#2249)
b3eb67fe6 is described below

commit b3eb67fe6679598e34f479377efc99f2cd260a80
Author: Tim Allison <[email protected]>
AuthorDate: Mon Jun 9 10:11:55 2025 -0400

    TIKA-4424 -- fix regression in DefaultZipContainerDetector (#2249)
    
    (cherry picked from commit 00f865fbce64b173111b85d42d0044a3184d285b)
---
 .../detect/zip/DefaultZipContainerDetector.java    |  18 ++-
 .../org/apache/tika/detect/zip/KMZDetector.java    |  30 +++--
 .../tika/config/TikaConfigSerializerTest.java      |   2 +-
 .../org/apache/tika/detect/TestZipDetector.java    | 138 +++++++++++++++++++++
 .../test/resources/configs/tika-4424-config.xml    |  10 ++
 .../test/resources/test-documents/testTika4424.zip | Bin 0 -> 33043 bytes
 6 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 6b8513c12..11ca45a49 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -71,7 +71,7 @@ public class DefaultZipContainerDetector implements Detector {
     //this has to be > 100,000 to handle some of the iworks files
     //in our unit tests
     @Field
-    int markLimit = -1;//16 * 1024 * 1024;
+    int markLimit = 16 * 1024 * 1024;
 
     private transient ServiceLoader loader;
 
@@ -141,10 +141,16 @@ public class DefaultZipContainerDetector implements Detector {
     }
 
     /**
-     * If this is less than 0, the file will be spooled to disk,
+     * If this is less than 0 and a TikaInputStream is used, the file will be spooled to disk,
      * and detection will run on the full file.
-     * If this is greater than 0, the {@link DeprecatedStreamingZipContainerDetector}
-     * will be called only up to the markLimit.
+     * <p>
+     * If this is greater than 0 and a TikaInputStream is used, this will try detection
+     * on the stream up to the markLimit, and if that is greater than the length of the file,
+     * the streaming result will be returned. If the BoundedInputStream hits its bound during detection,
+     * the file will be spooled to disk, and detection will be run on the full file.
+     * <p>
+     * If a non-TikaInputStream is used, detection will only work up to the <code>markLimit</code>,
+     * potentially leading to lack of precision in zip-based file detection.
      *
      * @param markLimit mark limit for streaming detection
      */
@@ -184,7 +190,7 @@ public class DefaultZipContainerDetector implements Detector {
                 if (markLimit < 1 || tis.hasFile()) {
                     return detectZipFormatOnFile(tis, metadata);
                 } else {
-                    return tryStreaming(tis, metadata);
+                    return tryStreamingOnTikaInputStream(tis, metadata);
                 }
             } else {
                 LOG.warn("Applying streaming detection in DefaultZipContainerDetector. " +
@@ -198,7 +204,7 @@ public class DefaultZipContainerDetector implements Detector {
         }
     }
 
-    private MediaType tryStreaming(TikaInputStream tis, Metadata metadata) throws IOException {
+    private MediaType tryStreamingOnTikaInputStream(TikaInputStream tis, Metadata metadata) throws IOException {
         BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, tis);
         boundedInputStream.mark(markLimit);
         MediaType mt = null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
index 82ac23b37..df6e96b87 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
@@ -21,6 +21,7 @@ import static org.apache.tika.detect.zip.PackageConstants.KMZ;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Enumeration;
+import java.util.Locale;
 
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -28,29 +29,36 @@ import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MediaType;
 
+/**
+ * This looks for a single file with a name ending in ".kml" at the root level of the zip file.
+ * <p>
+ * As of Tika 3.2.1, we allow other files at the root level.
+ * <p>
+ * We could make this more robust by requiring xml root detection on the *.kml file.
+ */
 public class KMZDetector implements ZipContainerDetector {
+
+
     @Override
     public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
-        boolean kmlFound = false;
-
         Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+        int kmlCount = 0;
         while (entries.hasMoreElements()) {
             ZipArchiveEntry entry = entries.nextElement();
             String name = entry.getName();
             if (!entry.isDirectory() && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
-                if (name.endsWith(".kml") && !kmlFound) {
-                    kmlFound = true;
-                } else {
+                if (name.toLowerCase(Locale.ROOT).endsWith(".kml")) {
+                    kmlCount++;
+                }
+                if (kmlCount > 1) {
                     return null;
                 }
             }
         }
-
-        if (kmlFound) {
-            return MediaType.application("vnd.google-earth.kmz");
-        } else {
-            return null;
+        if (kmlCount == 1) {
+            return KMZ;
         }
+        return null;
     }
 
     @Override
@@ -61,7 +69,7 @@ public class KMZDetector implements ZipContainerDetector {
         if (name.indexOf('/') != -1 || name.indexOf('\\') != -1) {
             return null;
         }
-        if (name.endsWith(".kml")) {
+        if (name.toLowerCase(Locale.ROOT).endsWith(".kml")) {
             KMLCounter counter = detectContext.get(KMLCounter.class);
             if (counter == null) {
                 counter = new KMLCounter();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index 9ba4b4ab6..d7313db6f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -46,7 +46,7 @@ public class TikaConfigSerializerTest {
         assertContains(encodingNeedle, xml);
 
         String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" +
-                " <params> <param name=\"markLimit\" type=\"int\">-1</param> </params>";
+                " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>";
         assertContains(detectorNeedle, xml);
 
         String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
new file mode 100644
index 000000000..7f6c00bd8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+public class TestZipDetector extends TikaTest {
+
+    private static final String ZIP_FILE = "testTika4424.zip";
+    private static final String SKIP_ZIP_CONTAINER_CONFIG = "tika-4424-config.xml";
+
+    private static final Detector DETECTOR = TikaConfig
+            .getDefaultConfig()
+            .getDetector();
+
+    private static Path DOCX;
+    @BeforeAll
+    public static void setUp() throws Exception {
+        DOCX = Files.createTempFile("test-zip-", ".docx");
+        Files.copy(TestZipDetector.class.getResourceAsStream("/test-documents/testWORD.docx"),
+                DOCX, StandardCopyOption.REPLACE_EXISTING);
+    }
+
+    @AfterAll
+    public static void tearDown() throws Exception {
+        Files.delete(DOCX);
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        String expectedMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+        Path p = DOCX;
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            assertEquals(expectedMime, DETECTOR
+                    .detect(tis, metadata)
+                    .toString());
+        }
+
+        byte[] bytes = Files.readAllBytes(p);
+        metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            assertEquals(expectedMime, DETECTOR
+                    .detect(tis, metadata)
+                    .toString());
+        }
+
+        metadata = new Metadata();
+        try (InputStream is = new BufferedInputStream(Files.newInputStream(p))) {
+            assertEquals(expectedMime, DETECTOR
+                    .detect(is, metadata)
+                    .toString());
+        }
+
+        metadata = new Metadata();
+        try (InputStream is = new ByteArrayInputStream(bytes)) {
+            assertEquals(expectedMime, DETECTOR
+                    .detect(is, metadata)
+                    .toString());
+        }
+    }
+
+    @Test
+    public void detectKmzUsingPlainInputStream() throws Exception {
+        try (InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE)) {
+
+            assertNotNull(inputStream);
+
+            Tika tika = new Tika();
+
+            String result = tika.detect(inputStream, ZIP_FILE);
+            assertEquals("application/vnd.google-earth.kmz", result);
+        }
+    }
+
+    @Test
+    public void detectKmzUsingTikaInputStream() throws Exception {
+        try (InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE);
+                TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) {
+
+            assertNotNull(tikaInputStream);
+
+            Tika tika = new Tika();
+
+            String result = tika.detect(tikaInputStream, ZIP_FILE);
+            assertEquals("application/vnd.google-earth.kmz", result);
+        }
+    }
+
+    @Test
+    public void detectPlainZipUsingPlainInputStream() throws Exception {
+        try (InputStream tikaConfigInputStream = TestZipDetector.class.getResourceAsStream("/configs/" + SKIP_ZIP_CONTAINER_CONFIG);
+                InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE)) {
+
+            assertNotNull(tikaConfigInputStream);
+            assertNotNull(inputStream);
+
+            Tika tika = new Tika(new TikaConfig(tikaConfigInputStream));
+
+            String result = tika.detect(inputStream, ZIP_FILE);
+            assertEquals("application/zip", result);
+        }
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
new file mode 100644
index 000000000..aa3facf1a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+    <detectors>
+        <!-- All detectors except built-in container ones -->
+        <detector class="org.apache.tika.detect.DefaultDetector">
+            <!-- DefaultZipContainerDetector will identify *.zip files with KML content as "kmz" files, this is correct behaviour -->
+            <detector-exclude class="org.apache.tika.detect.zip.DefaultZipContainerDetector"/>
+        </detector>
+    </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip
new file mode 100644
index 000000000..364b17455
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip differ

Reply via email to