This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 00f865fbc TIKA-4424 -- fix regression in DefaultZipContainerDetector (#2249)
00f865fbc is described below
commit 00f865fbce64b173111b85d42d0044a3184d285b
Author: Tim Allison <[email protected]>
AuthorDate: Mon Jun 9 10:11:55 2025 -0400
TIKA-4424 -- fix regression in DefaultZipContainerDetector (#2249)
---
.../detect/zip/DefaultZipContainerDetector.java | 18 ++-
.../org/apache/tika/detect/zip/KMZDetector.java | 30 +++--
.../tika/config/TikaConfigSerializerTest.java | 2 +-
.../org/apache/tika/detect/TestZipDetector.java | 138 +++++++++++++++++++++
.../test/resources/configs/tika-4424-config.xml | 10 ++
.../test/resources/test-documents/testTika4424.zip | Bin 0 -> 33043 bytes
6 files changed, 180 insertions(+), 18 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 6b8513c12..11ca45a49 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -71,7 +71,7 @@ public class DefaultZipContainerDetector implements Detector {
//this has to be > 100,000 to handle some of the iworks files
//in our unit tests
@Field
- int markLimit = -1;//16 * 1024 * 1024;
+ int markLimit = 16 * 1024 * 1024;
private transient ServiceLoader loader;
@@ -141,10 +141,16 @@ public class DefaultZipContainerDetector implements Detector {
}
/**
- * If this is less than 0, the file will be spooled to disk,
+ * If this is less than 0 and a TikaInputStream is used, the file will be spooled to disk,
* and detection will run on the full file.
- * If this is greater than 0, the {@link DeprecatedStreamingZipContainerDetector}
- * will be called only up to the markLimit.
+ * <p>
+ * If this is greater than 0 and a TikaInputStream is used, this will try detection
+ * on the stream up to the markLimit, and if that is greater than the length of the file,
+ * the streaming result will be returned. If the BoundedInputStream hits its bound during detection,
+ * the file will be spooled to disk, and detection will be run on the full file.
+ * <p>
+ * If a non-TikaInputStream is used, detection will only work up to the <code>markLimit</code>,
+ * potentially leading to lack of precision in zip-based file detection.
*
* @param markLimit mark limit for streaming detection
*/
@@ -184,7 +190,7 @@ public class DefaultZipContainerDetector implements Detector {
if (markLimit < 1 || tis.hasFile()) {
return detectZipFormatOnFile(tis, metadata);
} else {
- return tryStreaming(tis, metadata);
+ return tryStreamingOnTikaInputStream(tis, metadata);
}
} else {
LOG.warn("Applying streaming detection in DefaultZipContainerDetector. " +
@@ -198,7 +204,7 @@ public class DefaultZipContainerDetector implements Detector {
}
}
- private MediaType tryStreaming(TikaInputStream tis, Metadata metadata) throws IOException {
+ private MediaType tryStreamingOnTikaInputStream(TikaInputStream tis, Metadata metadata) throws IOException {
BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, tis);
boundedInputStream.mark(markLimit);
MediaType mt = null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
index 82ac23b37..df6e96b87 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
@@ -21,6 +21,7 @@ import static org.apache.tika.detect.zip.PackageConstants.KMZ;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
+import java.util.Locale;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -28,29 +29,36 @@ import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
+/**
+ * This looks for a single file with a name ending in ".kml" at the root level of the zip file.
+ * <p>
+ * As of Tika 3.2.1, we allow other files at the root level.
+ * <p>
+ * We could make this more robust by requiring xml root detection on the *.kml file.
+ */
public class KMZDetector implements ZipContainerDetector {
+
+
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
- boolean kmlFound = false;
-
Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+ int kmlCount = 0;
while (entries.hasMoreElements()) {
ZipArchiveEntry entry = entries.nextElement();
String name = entry.getName();
if (!entry.isDirectory() && name.indexOf('/') == -1 && name.indexOf('\\') == -1) {
- if (name.endsWith(".kml") && !kmlFound) {
- kmlFound = true;
- } else {
+ if (name.toLowerCase(Locale.ROOT).endsWith(".kml")) {
+ kmlCount++;
+ }
+ if (kmlCount > 1) {
return null;
}
}
}
-
- if (kmlFound) {
- return MediaType.application("vnd.google-earth.kmz");
- } else {
- return null;
+ if (kmlCount == 1) {
+ return KMZ;
}
+ return null;
}
@Override
@@ -61,7 +69,7 @@ public class KMZDetector implements ZipContainerDetector {
if (name.indexOf('/') != -1 || name.indexOf('\\') != -1) {
return null;
}
- if (name.endsWith(".kml")) {
+ if (name.toLowerCase(Locale.ROOT).endsWith(".kml")) {
KMLCounter counter = detectContext.get(KMLCounter.class);
if (counter == null) {
counter = new KMLCounter();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index 9ba4b4ab6..d7313db6f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -46,7 +46,7 @@ public class TikaConfigSerializerTest {
assertContains(encodingNeedle, xml);
String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" +
- " <params> <param name=\"markLimit\" type=\"int\">-1</param> </params>";
+ " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>";
assertContains(detectorNeedle, xml);
String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
new file mode 100644
index 000000000..7f6c00bd8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+public class TestZipDetector extends TikaTest {
+
+ private static final String ZIP_FILE = "testTika4424.zip";
+ private static final String SKIP_ZIP_CONTAINER_CONFIG = "tika-4424-config.xml";
+
+ private static final Detector DETECTOR = TikaConfig
+ .getDefaultConfig()
+ .getDetector();
+
+ private static Path DOCX;
+ @BeforeAll
+ public static void setUp() throws Exception {
+ DOCX = Files.createTempFile("test-zip-", ".docx");
+ Files.copy(TestZipDetector.class.getResourceAsStream("/test-documents/testWORD.docx"),
+ DOCX, StandardCopyOption.REPLACE_EXISTING);
+ }
+
+ @AfterAll
+ public static void tearDown() throws Exception {
+ Files.delete(DOCX);
+ }
+
+ @Test
+ public void testBasic() throws Exception {
+ String expectedMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+ Path p = DOCX;
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+ assertEquals(expectedMime, DETECTOR
+ .detect(tis, metadata)
+ .toString());
+ }
+
+ byte[] bytes = Files.readAllBytes(p);
+ metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ assertEquals(expectedMime, DETECTOR
+ .detect(tis, metadata)
+ .toString());
+ }
+
+ metadata = new Metadata();
+ try (InputStream is = new BufferedInputStream(Files.newInputStream(p))) {
+ assertEquals(expectedMime, DETECTOR
+ .detect(is, metadata)
+ .toString());
+ }
+
+ metadata = new Metadata();
+ try (InputStream is = new ByteArrayInputStream(bytes)) {
+ assertEquals(expectedMime, DETECTOR
+ .detect(is, metadata)
+ .toString());
+ }
+ }
+
+ @Test
+ public void detectKmzUsingPlainInputStream() throws Exception {
+ try (InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE)) {
+
+ assertNotNull(inputStream);
+
+ Tika tika = new Tika();
+
+ String result = tika.detect(inputStream, ZIP_FILE);
+ assertEquals("application/vnd.google-earth.kmz", result);
+ }
+ }
+
+ @Test
+ public void detectKmzUsingTikaInputStream() throws Exception {
+ try (InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE);
+ TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) {
+
+ assertNotNull(tikaInputStream);
+
+ Tika tika = new Tika();
+
+ String result = tika.detect(tikaInputStream, ZIP_FILE);
+ assertEquals("application/vnd.google-earth.kmz", result);
+ }
+ }
+
+ @Test
+ public void detectPlainZipUsingPlainInputStream() throws Exception {
+ try (InputStream tikaConfigInputStream = TestZipDetector.class.getResourceAsStream("/configs/" + SKIP_ZIP_CONTAINER_CONFIG);
+ InputStream inputStream = TestZipDetector.class.getResourceAsStream("/test-documents/" + ZIP_FILE)) {
+
+ assertNotNull(tikaConfigInputStream);
+ assertNotNull(inputStream);
+
+ Tika tika = new Tika(new TikaConfig(tikaConfigInputStream));
+
+ String result = tika.detect(inputStream, ZIP_FILE);
+ assertEquals("application/zip", result);
+ }
+ }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
new file mode 100644
index 000000000..aa3facf1a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+ <detectors>
+ <!-- All detectors except built-in container ones -->
+ <detector class="org.apache.tika.detect.DefaultDetector">
+ <!-- DefaultZipContainerDetector will identify *.zip files with
KML content as "kmz" files, this is correct behaviour -->
+ <detector-exclude
class="org.apache.tika.detect.zip.DefaultZipContainerDetector"/>
+ </detector>
+ </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip
new file mode 100644
index 000000000..364b17455
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testTika4424.zip differ