This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5f70b36e4 improve handling of the data descriptor retry in the zip detector (#2199)
5f70b36e4 is described below
commit 5f70b36e418e2685a21693237cf5f049e91d1234
Author: Tim Allison <[email protected]>
AuthorDate: Mon May 19 10:33:21 2025 -0400
improve handling of the data descriptor retry in the zip detector (#2199)
---
.../detect/zip/DefaultZipContainerDetector.java | 34 ++++++++++++++++++++--
1 file changed, 31 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 2c2669b85..6b8513c12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -16,11 +16,11 @@
*/
package org.apache.tika.detect.zip;
-import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
@@ -261,8 +261,8 @@ public class DefaultZipContainerDetector implements Detector {
LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise");
}
//problem opening zip file (truncated?)
- try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
- return detectStreaming(is, metadata, false);
+ try {
+ return detectStreamingFromPath(tis.getPath(), metadata, false);
} catch (IOException e) {
//swallow
}
@@ -310,6 +310,34 @@ public class DefaultZipContainerDetector implements Detector {
return finalDetect(detectContext);
}
+ MediaType detectStreamingFromPath(Path p, Metadata metadata, boolean allowStoredEntries)
+ throws IOException {
+ StreamingDetectContext detectContext = new StreamingDetectContext();
+ try (ZipArchiveInputStream zis = new ZipArchiveInputStream(
+ Files.newInputStream(p), "UTF8", false, allowStoredEntries)) {
+ ZipArchiveEntry zae = zis.getNextEntry();
+ while (zae != null) {
+ MediaType mt = detect(zae, zis, detectContext);
+ if (mt != null) {
+ return mt;
+ }
+ zae = zis.getNextEntry();
+ }
+ } catch (UnsupportedZipFeatureException zfe) {
+ if (allowStoredEntries == false &&
+ zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ return detectStreamingFromPath(p, metadata, true);
+ }
+ } catch (SecurityException e) {
+ throw e;
+ } catch (EOFException e) {
+ //truncated zip -- swallow
+ } catch (IOException e) {
+ //another option for a truncated zip
+ }
+
+ return finalDetect(detectContext);
+ }
private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis,
StreamingDetectContext detectContext) throws IOException {