This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new dd222a392 TIKA-4395 -- improve handling and logging in container detection (#2185)
dd222a392 is described below
commit dd222a39244f223e21ce8133601c8f54b86f2b65
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 10 08:42:16 2025 -0400
TIKA-4395 -- improve handling and logging in container detection (#2185)
TIKA-4395 -- improve handling and logging in container detection
(cherry picked from commit 845493973c6ebefed8481369bc4719b19c519e53)
---
.../java/org/apache/tika/io/TikaInputStream.java | 75 ++++++------
.../org/apache/tika/MultiThreadedTikaTest.java | 1 -
.../detect/microsoft/POIFSContainerDetector.java | 89 +++++++++-----
.../microsoft/POIFSContainerDetectorTest.java | 90 ++++++++++++++
.../src/test/resources/log4j2.xml | 3 +
.../detect/zip/DefaultZipContainerDetector.java | 47 +++++---
.../apache/tika/detect/zip/ZipDetectionTest.java | 129 +++++++++++++++++++++
.../org/apache/tika/detect/zip/ZipParserTest.java | 47 --------
.../tika/config/TikaConfigSerializerTest.java | 2 +-
.../tika/detect/TestContainerAwareDetector.java | 4 +
10 files changed, 357 insertions(+), 130 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 77e09226a..1afa2d14e 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -692,52 +692,55 @@ public class TikaInputStream extends TaggedInputStream {
* @throws IOException
*/
public Path getPath(int maxBytes) throws IOException {
- if (path == null) {
- if (position > 0) {
- throw new IOException("Stream is already being read");
- } else {
- Path tmpFile = tmp.createTempFile(suffix);
- if (maxBytes > -1) {
- this.mark(maxBytes);
- try (BoundedInputStream boundedInputStream =
- new BoundedInputStream(maxBytes, this)) {
+ if (path != null) {
+ return path;
+ }
+ if (position > 0) {
+ throw new IOException("Stream is already being read");
+ } else {
+ Path tmpFile = tmp.createTempFile(suffix);
+ if (maxBytes > -1) {
+ try (BoundedInputStream boundedInputStream = new BoundedInputStream(maxBytes, this)) {
+ boundedInputStream.mark(maxBytes);
+ try {
Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
if (boundedInputStream.hasHitBound()) {
//tmpFile will be cleaned up when this TikaInputStream is closed
return null;
}
} finally {
- this.reset();
+ boundedInputStream.reset();
}
- } else {
- // Spool the entire stream into a temporary file
- Files.copy(this, tmpFile, REPLACE_EXISTING);
}
- //successful so far, set tis' path to tmpFile
- path = tmpFile;
-
- // Create a new input stream and make sure it'll get closed
- InputStream newStream = Files.newInputStream(path);
- tmp.addResource(newStream);
-
- // Replace the spooled stream with the new stream in a way
- // that still ends up closing the old stream if or when the
- // close() method is called. The closing of the new stream
- // is already being handled as noted above.
- final InputStream oldStream = in;
- in = new BufferedInputStream(newStream) {
- @Override
- public void close() throws IOException {
- oldStream.close();
- }
- };
-
- // Update length to file size. Update position, mark
- length = Files.size(path);
- position = 0;
- mark = -1;
+ } else {
+ // Spool the entire stream into a temporary file
+ Files.copy(this, tmpFile, REPLACE_EXISTING);
}
+ //successful so far, set tis' path to tmpFile
+ path = tmpFile;
+
+ // Create a new input stream and make sure it'll get closed
+ InputStream newStream = Files.newInputStream(path);
+ tmp.addResource(newStream);
+
+ // Replace the spooled stream with the new stream in a way
+ // that still ends up closing the old stream if or when the
+ // close() method is called. The closing of the new stream
+ // is already being handled as noted above.
+ final InputStream oldStream = in;
+ in = new BufferedInputStream(newStream) {
+ @Override
+ public void close() throws IOException {
+ oldStream.close();
+ }
+ };
+
+ // Update length to file size. Update position, mark
+ length = Files.size(path);
+ position = 0;
+ mark = -1;
}
+
return path;
}
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index fd3f381d4..ee87f9bf7 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -109,7 +109,6 @@ public class MultiThreadedTikaTest extends TikaTest {
baseline.put(f, new Extract(metadataList));
} catch (Exception e) {
- e.printStackTrace();
//swallow
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 69be0361f..f0605a78d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.detect.microsoft;
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.application;
import static org.apache.tika.mime.MediaType.image;
@@ -38,6 +39,8 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
@@ -247,8 +250,11 @@ public class POIFSContainerDetector implements Detector {
*/
private static final Pattern mppDataMatch =
Pattern.compile("\\s\\s\\s\\d+");
+ private static final Logger LOG = LoggerFactory.getLogger(POIFSContainerDetector.class);
+
+
@Field
- private int markLimit = 128 * 1024 * 1024;
+ private int markLimit = -1;
/**
* Internal detection of the specific kind of OLE2 document, based on the
@@ -267,7 +273,7 @@ public class POIFSContainerDetector implements Detector {
* @return
*/
public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry
root) {
- if (anyCaseNames == null || anyCaseNames.size() == 0) {
+ if (anyCaseNames == null || anyCaseNames.isEmpty()) {
return OLE;
}
@@ -567,6 +573,8 @@ public class POIFSContainerDetector implements Detector {
//if the stream was longer than markLimit, don't detect
if (file == null) {
+ LOG.warn("File length exceeds marklimit. Skipping detection on this file. " +
+ "If you need precise detection, consider increasing the marklimit or setting it to -1");
return Collections.emptySet();
}
@@ -581,6 +589,8 @@ public class POIFSContainerDetector implements Detector {
} catch (IOException e) {
// Parse error in POI, so we don't know the file type
return Collections.emptySet();
+ } catch (SecurityException e) {
+ throw e;
} catch (RuntimeException e) {
// Another problem in POI
return Collections.emptySet();
@@ -593,48 +603,67 @@ public class POIFSContainerDetector implements Detector {
return MediaType.OCTET_STREAM;
}
- // If this is a TikaInputStream wrapping an already
- // parsed NPOIFileSystem/DirectoryNode, just get the
- // names from the root:
TikaInputStream tis = TikaInputStream.cast(input);
- Set<String> names = null;
if (tis != null) {
- Object container = tis.getOpenContainer();
- if (container instanceof POIFSFileSystem) {
- names = getTopLevelNames(((POIFSFileSystem)
container).getRoot());
- } else if (container instanceof DirectoryNode) {
- names = getTopLevelNames((DirectoryNode) container);
- }
+ return handleTikaStream(tis, metadata);
+ }
+ if (isOleHeader(input)) {
+ return OLE;
}
+ return MediaType.OCTET_STREAM;
+ }
- if (names == null) {
- // Check if the document starts with the OLE header
- input.mark(8);
- try {
- if (input.read() != 0xd0 || input.read() != 0xcf ||
input.read() != 0x11 ||
- input.read() != 0xe0 || input.read() != 0xa1 ||
input.read() != 0xb1 ||
- input.read() != 0x1a || input.read() != 0xe1) {
- return MediaType.OCTET_STREAM;
- }
- } catch (IOException e) {
- return MediaType.OCTET_STREAM;
- } finally {
- input.reset();
- }
+ private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata)
throws IOException {
+ //try for an open container
+ Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
+
+ //if that didn't work, confirm the bytes are OLE
+ if (names == null && ! isOleHeader(tis)) {
+ return OCTET_STREAM;
}
- // We can only detect the exact type when given a TikaInputStream
- if (names == null && tis != null) {
- // Look for known top level entry names to detect the document type
+ // If OLE, spool to disk
+ if (names == null) {
+ // spool to disk and try detection
names = getTopLevelNames(tis);
}
// Detect based on the names (as available)
- if (tis != null && tis.getOpenContainer() != null &&
+ if (tis.getOpenContainer() != null &&
tis.getOpenContainer() instanceof POIFSFileSystem) {
return detect(names, ((POIFSFileSystem)
tis.getOpenContainer()).getRoot());
} else {
return detect(names, null);
}
}
+
+ private boolean isOleHeader(InputStream input) throws IOException {
+ input.mark(8);
+ try {
+ return (input.read() == 0xd0 && input.read() == 0xcf && input.read() == 0x11 && input.read() == 0xe0 && input.read() == 0xa1 && input.read() == 0xb1 &&
+ input.read() == 0x1a && input.read() == 0xe1);
+ } finally {
+ input.reset();
+ }
+ }
+
+
+ public static Set<String> tryOpenContainerOnTikaInputStream(InputStream
input, Metadata metadata) {
+ // If this is a TikaInputStream wrapping an already
+ // parsed NPOIFileSystem/DirectoryNode, just get the
+ // names from the root:
+ TikaInputStream tis = TikaInputStream.cast(input);
+ Set<String> names = null;
+ if (tis != null) {
+ Object container = tis.getOpenContainer();
+ if (container instanceof POIFSFileSystem) {
+ return getTopLevelNames(((POIFSFileSystem)
container).getRoot());
+ } else if (container instanceof DirectoryNode) {
+ return getTopLevelNames((DirectoryNode) container);
+ }
+ }
+ return null;
+ }
+
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
new file mode 100644
index 000000000..bb2785a0d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.microsoft;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+public class POIFSContainerDetectorTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ String[] files =
+ new String[]{"testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
"testVISIO.vsd",
+ "test-outlook.msg"};
+ String[] expected =
+ new String[]{
+ "application/vnd.ms-excel", "application/msword",
"application/vnd.ms-powerpoint",
+ "application/vnd.visio", "application/vnd.ms-outlook"
+ };
+ for (String fileName : files) {
+ testStream(fileName, "application/x-tika-msoffice", -1);
+ testStream(fileName, "application/x-tika-msoffice", 0);
+ testStream(fileName, "application/x-tika-msoffice", 100);
+ testTikaInputStream(fileName, "application/x-tika-msoffice", 10);
+ }
+ for (int i = 0; i < files.length; i++) {
+ testTikaInputStream(files[i], expected[i], -1);
+ }
+ }
+
+ private void testStream(String fileName, String expectedMime, int
markLimit) throws IOException {
+ String expectedDigest = digest(getStream(fileName));
+ POIFSContainerDetector detector = new POIFSContainerDetector();
+ detector.setMarkLimit(markLimit);
+ try (InputStream is = getStream(fileName)) {
+ assertExpected(detector, is, expectedMime, expectedDigest);
+ }
+ }
+
+ private void testTikaInputStream(String fileName, String expectedMime, int
markLimit) throws IOException {
+ String expectedDigest = digest(getStream(fileName));
+ POIFSContainerDetector detector = new POIFSContainerDetector();
+ detector.setMarkLimit(markLimit);
+ try (InputStream is = TikaInputStream.get(getStream(fileName))) {
+ assertExpected(detector, is, expectedMime, expectedDigest);
+ }
+ }
+
+ private InputStream getStream(String fileName) {
+ return
POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" +
fileName);
+ }
+
+ private void assertExpected(Detector detector, InputStream is, String
expectedMime, String expectedDigest) throws IOException {
+ MediaType mt = detector.detect(is, new Metadata());
+ assertEquals(expectedMime, mt.toString());
+ assertEquals(expectedDigest, digest(is));
+ }
+
+ private String digest(String fileName) throws IOException {
+ return
digest(POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/"
+ fileName));
+ }
+
+ private String digest(InputStream is) throws IOException {
+ return DigestUtils.sha256Hex(is);
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
index 1e9327e01..d609d7631 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
@@ -36,5 +36,8 @@
<Logger name="org.apache.poi" level="ERROR" additivity="false">
<AppenderRef ref="Console"/>
</Logger>
+ <Logger name="org.apache.tika.detect.microsoft.POIFSContainerDetector"
level="ERROR" additivity="false">
+ <AppenderRef ref="Console"/>
+ </Logger>
</Loggers>
</Configuration>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 9adfe9ba0..2c2669b85 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -71,7 +71,7 @@ public class DefaultZipContainerDetector implements Detector {
//this has to be > 100,000 to handle some of the iworks files
//in our unit tests
@Field
- int markLimit = 16 * 1024 * 1024;
+ int markLimit = -1;//16 * 1024 * 1024;
private transient ServiceLoader loader;
@@ -181,14 +181,16 @@ public class DefaultZipContainerDetector implements
Detector {
if (TikaInputStream.isTikaInputStream(input)) {
TikaInputStream tis = TikaInputStream.cast(input);
- if (markLimit < 0) {
- tis.getFile();
- }
- if (tis.hasFile()) {
+ if (markLimit < 1 || tis.hasFile()) {
return detectZipFormatOnFile(tis, metadata);
+ } else {
+ return tryStreaming(tis, metadata);
}
+ } else {
+ LOG.warn("Applying streaming detection in DefaultZipContainerDetector. " +
+ "This can lead to imprecise detection. Please consider using a TikaInputStream");
+ return detectStreaming(input, metadata);
}
- return detectStreaming(input, metadata);
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
} else {
@@ -196,6 +198,23 @@ public class DefaultZipContainerDetector implements
Detector {
}
}
+ private MediaType tryStreaming(TikaInputStream tis, Metadata metadata)
throws IOException {
+ BoundedInputStream boundedInputStream = new
BoundedInputStream(markLimit, tis);
+ boundedInputStream.mark(markLimit);
+ MediaType mt = null;
+ //try streaming detect
+ try {
+ mt = detectStreaming(boundedInputStream, metadata, false);
+ if (! boundedInputStream.hasHitBound()) {
+ return mt;
+ }
+ } finally {
+ boundedInputStream.reset();
+ }
+ //spool to disk
+ return detectZipFormatOnFile(tis, metadata);
+ }
+
/**
* This will call TikaInputStream's getFile(). If there are no exceptions,
* it will place the ZipFile in TikaInputStream's openContainer and leave it
@@ -207,7 +226,7 @@ public class DefaultZipContainerDetector implements
Detector {
private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata
metadata) {
ZipFile zip = null;
try {
- zip = ZipFile.builder().setFile(tis.getFile()).get(); // TODO:
hasFile()?
+ zip = ZipFile.builder().setFile(tis.getFile()).get();
for (ZipContainerDetector zipDetector : getDetectors()) {
MediaType type = zipDetector.detect(zip, tis);
@@ -239,15 +258,13 @@ public class DefaultZipContainerDetector implements
Detector {
return MediaType.APPLICATION_ZIP;
}
if (LOG.isDebugEnabled()) {
- LOG.debug("zip file failed to open; attempting streaming detect");
+ LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise");
}
- if (zip == null) {
- //problem opening zip file (truncated?)
- try (InputStream is = new
BufferedInputStream(Files.newInputStream(tis.getPath()))) {
- return detectStreaming(is, metadata);
- } catch (IOException e) {
- //swallow
- }
+ //problem opening zip file (truncated?)
+ try (InputStream is = new
BufferedInputStream(Files.newInputStream(tis.getPath()))) {
+ return detectStreaming(is, metadata, false);
+ } catch (IOException e) {
+ //swallow
}
return MediaType.APPLICATION_ZIP;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
new file mode 100644
index 000000000..a00bd6420
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.zip;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for detecting zip-based files.
+ */
+public class ZipDetectionTest extends TikaTest {
+
+
+ @Test
+ public void testKMZDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
+ assertEquals("application/vnd.google-earth.kmz",
+ metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testJARDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
+ assertEquals("application/java-archive",
metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testStreaming() throws Exception {
+ String expectedDigest = digest("testJAR.jar");
+ DefaultZipContainerDetector detector = new
DefaultZipContainerDetector();
+ try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+
+ for (int markLimit : new int[]{-1,0,10,100,1000}) {
+ detector = new DefaultZipContainerDetector();
+ //mark limit is ignored for a TikaInputStream
+ try (InputStream is =
TikaInputStream.get(getStream("testJAR.jar"))) {
+ detector.setMarkLimit(markLimit);
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+ }
+
+ detector = new DefaultZipContainerDetector();
+ //mark limit is ignored for a TikaInputStream
+ try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+ detector.setMarkLimit(-1);
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+
+ detector = new DefaultZipContainerDetector();
+ //try on a file that isn't a TikaInputStream
+ try (InputStream is = new
BufferedInputStream(Files.newInputStream(TikaInputStream.get(getStream("testJAR.jar")).getPath())))
{
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+
+ detector = new DefaultZipContainerDetector();
+ try (InputStream is =
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+
+ detector = new DefaultZipContainerDetector();
+ detector.setMarkLimit(100);
+ try (InputStream is =
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+ assertExpected(detector, is, "application/zip", expectedDigest);
+ }
+
+ detector = new DefaultZipContainerDetector();
+ detector.setMarkLimit(0);
+ try (InputStream is =
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+ assertExpected(detector, is, "application/zip", expectedDigest);
+ }
+
+ detector = new DefaultZipContainerDetector();
+ detector.setMarkLimit(100000);
+ try (InputStream is =
ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+ assertExpected(detector, is, "application/java-archive",
expectedDigest);
+ }
+ }
+
+ private InputStream getStream(String fileName) {
+ return ZipDetectionTest.class.getResourceAsStream("/test-documents/" +
fileName);
+ }
+
+ private void assertExpected(Detector detector, InputStream is, String
expectedMime, String expectedDigest) throws IOException {
+ MediaType mt = detector.detect(is, new Metadata());
+ assertEquals(expectedMime, mt.toString());
+ assertEquals(expectedDigest, digest(is));
+
+ }
+
+ private String digest(String fileName) throws IOException {
+ return
digest(ZipDetectionTest.class.getResourceAsStream("/test-documents/" +
fileName));
+ }
+
+ private String digest(InputStream is) throws IOException {
+ return DigestUtils.sha256Hex(is);
+ }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
deleted file mode 100644
index 2ed4c3572..000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect.zip;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.HttpHeaders;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends TikaTest {
-
-
- @Test
- public void testKMZDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
- assertEquals("application/vnd.google-earth.kmz",
- metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-
- @Test
- public void testJARDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
- assertEquals("application/java-archive",
metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index d7313db6f..9ba4b4ab6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -46,7 +46,7 @@ public class TikaConfigSerializerTest {
assertContains(encodingNeedle, xml);
String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" +
- " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>";
+ " <params> <param name=\"markLimit\" type=\"int\">-1</param> </params>";
assertContains(detectorNeedle, xml);
String parserNeedle = "<parser
class=\"org.apache.tika.parser.pdf.PDFParser\">" +
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index d35df67bf..cb71b925e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -65,6 +65,10 @@ public class TestContainerAwareDetector extends
MultiThreadedTikaTest {
private final StreamingZipContainerDetector streamingZipDetector =
new StreamingZipContainerDetector();
+ TestContainerAwareDetector() {
+ streamingZipDetector.setMarkLimit(128 * 1024 * 1024);
+ }
+
@AfterEach
public void tearDown() throws TikaException {
//make sure to reset pool size because it is being randomly resized during the tests