(tika) 02/04: TIKA-4395 -- more work

tallison Thu, 10 Apr 2025 05:20:15 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4395
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 0839824736aad31f4f1656983dc77bca3959896d
Author: tallison <[email protected]>
AuthorDate: Wed Apr 9 16:06:14 2025 -0400

    TIKA-4395 -- more work
---
 .../java/org/apache/tika/io/TikaInputStream.java   |  75 ++++++------
 .../detect/microsoft/POIFSContainerDetector.java   |  82 ++++++++-----
 .../microsoft/POIFSContainerDetectorTest.java      |  90 ++++++++++++++
 .../detect/zip/DefaultZipContainerDetector.java    |  47 +++++---
 .../apache/tika/detect/zip/ZipDetectionTest.java   | 129 +++++++++++++++++++++
 .../org/apache/tika/detect/zip/ZipParserTest.java  |  65 -----------
 6 files changed, 339 insertions(+), 149 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index ea48487a0..0bebd1886 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -723,52 +723,55 @@ public class TikaInputStream extends TaggedInputStream {
      * @throws IOException
      */
     public Path getPath(int maxBytes) throws IOException {
-        if (path == null) {
-            if (position > 0) {
-                throw new IOException("Stream is already being read");
-            } else {
-                Path tmpFile = tmp.createTempFile(suffix);
-                if (maxBytes > -1) {
-                    this.mark(maxBytes);
-                    try (BoundedInputStream boundedInputStream =
-                                 new BoundedInputStream(maxBytes, this)) {
+        if (path != null) {
+            return path;
+        }
+        if (position > 0) {
+            throw new IOException("Stream is already being read");
+        } else {
+            Path tmpFile = tmp.createTempFile(suffix);
+            if (maxBytes > -1) {
+                try (BoundedInputStream boundedInputStream = new BoundedInputStream(maxBytes, this)) {
+                    boundedInputStream.mark(maxBytes);
+                    try {
                         Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
                         if (boundedInputStream.hasHitBound()) {
                             //tmpFile will be cleaned up when this TikaInputStream is closed
                             return null;
                         }
                     } finally {
-                        this.reset();
+                        boundedInputStream.reset();
                     }
-                } else {
-                    // Spool the entire stream into a temporary file
-                    Files.copy(this, tmpFile, REPLACE_EXISTING);
                 }
-                //successful so far, set tis' path to tmpFile
-                path = tmpFile;
-
-                // Create a new input stream and make sure it'll get closed
-                InputStream newStream = Files.newInputStream(path);
-                tmp.addResource(newStream);
-
-                // Replace the spooled stream with the new stream in a way
-                // that still ends up closing the old stream if or when the
-                // close() method is called. The closing of the new stream
-                // is already being handled as noted above.
-                final InputStream oldStream = in;
-                in = new BufferedInputStream(newStream) {
-                    @Override
-                    public void close() throws IOException {
-                        oldStream.close();
-                    }
-                };
-
-                // Update length to file size. Update position, mark
-                length = Files.size(path);
-                position = 0;
-                mark = -1;
+            } else {
+                // Spool the entire stream into a temporary file
+                Files.copy(this, tmpFile, REPLACE_EXISTING);
             }
+            //successful so far, set tis' path to tmpFile
+            path = tmpFile;
+
+            // Create a new input stream and make sure it'll get closed
+            InputStream newStream = Files.newInputStream(path);
+            tmp.addResource(newStream);
+
+            // Replace the spooled stream with the new stream in a way
+            // that still ends up closing the old stream if or when the
+            // close() method is called. The closing of the new stream
+            // is already being handled as noted above.
+            final InputStream oldStream = in;
+            in = new BufferedInputStream(newStream) {
+                @Override
+                public void close() throws IOException {
+                    oldStream.close();
+                }
+            };
+
+            // Update length to file size. Update position, mark
+            length = Files.size(path);
+            position = 0;
+            mark = -1;
         }
+
         return path;
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 321a01f9a..2285630fb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -38,6 +38,8 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.DocumentNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
@@ -247,6 +249,9 @@ public class POIFSContainerDetector implements Detector {
      */
     private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
 
+    private static final Logger LOG = LoggerFactory.getLogger(POIFSContainerDetector.class);
+
+
     @Field
     private int markLimit = -1;
 
@@ -267,7 +272,7 @@ public class POIFSContainerDetector implements Detector {
      * @return
      */
     public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry root) {
-        if (anyCaseNames == null || anyCaseNames.size() == 0) {
+        if (anyCaseNames == null || anyCaseNames.isEmpty()) {
             return OLE;
         }
 
@@ -567,6 +572,8 @@ public class POIFSContainerDetector implements Detector {
 
         //if the stream was longer than markLimit, don't detect
         if (file == null) {
+            LOG.warn("File length exceeds marklimit. Skipping detection on this file. " +
+                    "If you need precise detection, consider increasing the marklimit or setting it to -1");
             return Collections.emptySet();
         }
 
@@ -581,6 +588,8 @@ public class POIFSContainerDetector implements Detector {
         } catch (IOException e) {
             // Parse error in POI, so we don't know the file type
             return Collections.emptySet();
+        } catch (SecurityException e) {
+            throw e;
         } catch (RuntimeException e) {
             // Another problem in POI
             return Collections.emptySet();
@@ -593,48 +602,61 @@ public class POIFSContainerDetector implements Detector {
             return MediaType.OCTET_STREAM;
         }
 
-        // If this is a TikaInputStream wrapping an already
-        // parsed NPOIFileSystem/DirectoryNode, just get the
-        // names from the root:
-        TikaInputStream tis = TikaInputStream.cast(input);
-        Set<String> names = null;
-        if (tis != null) {
-            Object container = tis.getOpenContainer();
-            if (container instanceof POIFSFileSystem) {
-                names = getTopLevelNames(((POIFSFileSystem) container).getRoot());
-            } else if (container instanceof DirectoryNode) {
-                names = getTopLevelNames((DirectoryNode) container);
-            }
+        if (! isOleHeader(input)) {
+            return MediaType.OCTET_STREAM;
         }
 
-        if (names == null) {
-            // Check if the document starts with the OLE header
-            input.mark(8);
-            try {
-                if (input.read() != 0xd0 || input.read() != 0xcf || input.read() != 0x11 ||
-                        input.read() != 0xe0 || input.read() != 0xa1 || input.read() != 0xb1 ||
-                        input.read() != 0x1a || input.read() != 0xe1) {
-                    return MediaType.OCTET_STREAM;
-                }
-            } catch (IOException e) {
-                return MediaType.OCTET_STREAM;
-            } finally {
-                input.reset();
-            }
+        TikaInputStream tis = TikaInputStream.cast(input);
+        if (tis == null) {
+            LOG.warn("POIFSContainerDetector requires a TikaInputStream for precise detection.");
+            return OLE;
         }
 
+        Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
+
         // We can only detect the exact type when given a TikaInputStream
-        if (names == null && tis != null) {
-            // Look for known top level entry names to detect the document type
+        if (names == null) {
+            // spool to disk and try detection
             names = getTopLevelNames(tis);
         }
 
         // Detect based on the names (as available)
-        if (tis != null && tis.getOpenContainer() != null &&
+        if (tis.getOpenContainer() != null &&
                 tis.getOpenContainer() instanceof POIFSFileSystem) {
             return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
         } else {
+            //can we actually get here?
             return detect(names, null);
         }
     }
+
+    private boolean isOleHeader(InputStream input) throws IOException {
+        input.mark(8);
+        try {
+            return (input.read() == 0xd0 && input.read() == 0xcf && input.read() == 0x11 && input.read() == 0xe0 && input.read() == 0xa1 && input.read() == 0xb1 &&
+                    input.read() == 0x1a && input.read() == 0xe1);
+        } finally {
+            input.reset();
+        }
+    }
+
+
+    public static Set<String> tryOpenContainerOnTikaInputStream(InputStream input, Metadata metadata) {
+        // If this is a TikaInputStream wrapping an already
+        // parsed NPOIFileSystem/DirectoryNode, just get the
+        // names from the root:
+        TikaInputStream tis = TikaInputStream.cast(input);
+        Set<String> names = null;
+        if (tis != null) {
+            Object container = tis.getOpenContainer();
+            if (container instanceof POIFSFileSystem) {
+                return getTopLevelNames(((POIFSFileSystem) container).getRoot());
+            } else if (container instanceof DirectoryNode) {
+                return getTopLevelNames((DirectoryNode) container);
+            }
+        }
+        return null;
+    }
+
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
new file mode 100644
index 000000000..bb2785a0d
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.microsoft;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+public class POIFSContainerDetectorTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        String[] files =
+                new String[]{"testEXCEL.xls", "testWORD.doc", "testPPT.ppt", "testVISIO.vsd",
+                        "test-outlook.msg"};
+        String[] expected =
+                new String[]{
+                    "application/vnd.ms-excel", "application/msword", "application/vnd.ms-powerpoint",
+                        "application/vnd.visio", "application/vnd.ms-outlook"
+                };
+        for (String fileName : files) {
+            testStream(fileName, "application/x-tika-msoffice", -1);
+            testStream(fileName, "application/x-tika-msoffice", 0);
+            testStream(fileName, "application/x-tika-msoffice", 100);
+            testTikaInputStream(fileName, "application/x-tika-msoffice", 10);
+        }
+        for (int i = 0; i < files.length; i++) {
+            testTikaInputStream(files[i], expected[i], -1);
+        }
+    }
+
+    private void testStream(String fileName, String expectedMime, int markLimit) throws IOException {
+        String expectedDigest = digest(getStream(fileName));
+        POIFSContainerDetector detector = new POIFSContainerDetector();
+        detector.setMarkLimit(markLimit);
+        try (InputStream is = getStream(fileName)) {
+            assertExpected(detector, is, expectedMime, expectedDigest);
+        }
+    }
+
+    private void testTikaInputStream(String fileName, String expectedMime, int markLimit) throws IOException {
+        String expectedDigest = digest(getStream(fileName));
+        POIFSContainerDetector detector = new POIFSContainerDetector();
+        detector.setMarkLimit(markLimit);
+        try (InputStream is = TikaInputStream.get(getStream(fileName))) {
+            assertExpected(detector, is, expectedMime, expectedDigest);
+        }
+    }
+
+    private InputStream getStream(String fileName) {
+        return POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" + fileName);
+    }
+
+    private void assertExpected(Detector detector, InputStream is, String expectedMime, String expectedDigest) throws IOException {
+        MediaType mt = detector.detect(is, new Metadata());
+        assertEquals(expectedMime, mt.toString());
+        assertEquals(expectedDigest, digest(is));
+    }
+
+    private String digest(String fileName) throws IOException {
+        return digest(POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" + fileName));
+    }
+
+    private String digest(InputStream is) throws IOException {
+        return DigestUtils.sha256Hex(is);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index b30a73e14..5b6567308 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -21,7 +21,6 @@ import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
-import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -44,7 +43,6 @@ import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.BoundedInputStream;
-import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -183,17 +181,15 @@ public class DefaultZipContainerDetector implements Detector {
 
             if (TikaInputStream.isTikaInputStream(input)) {
                 TikaInputStream tis = TikaInputStream.cast(input);
-                return detectZipFormatOnFile(tis, metadata);
-            } else {
-                if (markLimit >= 0) {
-                    return detectStreaming(input, metadata);
+                if (markLimit < 1 || tis.hasFile()) {
+                    return detectZipFormatOnFile(tis, metadata);
                 } else {
-                    try (TemporaryResources tmp = new TemporaryResources()) {
-                        try (TikaInputStream tis = TikaInputStream.get(input, tmp, new Metadata())) {
-                            return detectZipFormatOnFile(tis, metadata);
-                        }
-                    }
+                    return tryStreaming(tis, metadata);
                 }
+            } else {
+                LOG.warn("Applying streaming detection in DefaultZipContainerDetector. " +
+                            "This can lead to imprecise detection. Please consider using a TikaInputStream");
+                return detectStreaming(input, metadata);
             }
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
@@ -202,6 +198,23 @@ public class DefaultZipContainerDetector implements Detector {
         }
     }
 
+    private MediaType tryStreaming(TikaInputStream tis, Metadata metadata) throws IOException {
+        BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, tis);
+        boundedInputStream.mark(markLimit);
+        MediaType mt = null;
+        //try streaming detect
+        try {
+            mt = detectStreaming(boundedInputStream, metadata, false);
+            if (! boundedInputStream.hasHitBound()) {
+                return mt;
+            }
+        } finally {
+            boundedInputStream.reset();
+        }
+        //spool to disk
+        return detectZipFormatOnFile(tis, metadata);
+    }
+
     /**
      * This will call TikaInputStream's getFile(). If there are no exceptions,
      * it will place the ZipFile in TikaInputStream's openContainer and leave it
@@ -245,15 +258,13 @@ public class DefaultZipContainerDetector implements Detector {
             return MediaType.APPLICATION_ZIP;
         }
         if (LOG.isDebugEnabled()) {
-            LOG.debug("zip file failed to open; attempting streaming detect");
+            LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise");
         }
-        if (zip == null) {
-            //problem opening zip file (truncated?)
-            try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
-                return detectStreaming(is, metadata);
-            } catch (IOException e) {
+        //problem opening zip file (truncated?)
+        try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
+            return detectStreaming(is, metadata);
+        } catch (IOException e) {
                 //swallow
-            }
         }
         return MediaType.APPLICATION_ZIP;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
new file mode 100644
index 000000000..a00bd6420
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.zip;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for detecting zip-based files.
+ */
+public class ZipDetectionTest extends TikaTest {
+
+
+    @Test
+    public void testKMZDetection() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
+        assertEquals("application/vnd.google-earth.kmz",
+                metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testJARDetection() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
+        assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testStreaming() throws Exception {
+        String expectedDigest = digest("testJAR.jar");
+        DefaultZipContainerDetector detector = new DefaultZipContainerDetector();
+        try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+            assertExpected(detector, is, "application/java-archive", expectedDigest);
+        }
+
+        for (int markLimit : new int[]{-1,0,10,100,1000}) {
+            detector = new DefaultZipContainerDetector();
+            //mark limit is ignored for a TikaInputStream
+            try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+                detector.setMarkLimit(markLimit);
+                assertExpected(detector, is, "application/java-archive", expectedDigest);
+            }
+        }
+
+        detector = new DefaultZipContainerDetector();
+        //mark limit is ignored for a TikaInputStream
+        try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) {
+            detector.setMarkLimit(-1);
+            assertExpected(detector, is, "application/java-archive", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        //try on a file that isn't a TikaInputStream
+        try (InputStream is = new BufferedInputStream(Files.newInputStream(TikaInputStream.get(getStream("testJAR.jar")).getPath()))) {
+            assertExpected(detector, is, "application/java-archive", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/java-archive", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(100);
+        try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/zip", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(0);
+        try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/zip", expectedDigest);
+        }
+
+        detector = new DefaultZipContainerDetector();
+        detector.setMarkLimit(100000);
+        try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            assertExpected(detector, is, "application/java-archive", expectedDigest);
+        }
+    }
+
+    private InputStream getStream(String fileName) {
+        return ZipDetectionTest.class.getResourceAsStream("/test-documents/" + fileName);
+    }
+
+    private void assertExpected(Detector detector, InputStream is, String expectedMime, String expectedDigest) throws IOException {
+        MediaType mt = detector.detect(is, new Metadata());
+        assertEquals(expectedMime, mt.toString());
+        assertEquals(expectedDigest, digest(is));
+
+    }
+
+    private String digest(String fileName) throws IOException {
+        return digest(ZipDetectionTest.class.getResourceAsStream("/test-documents/" + fileName));
+    }
+
+    private String digest(InputStream is) throws IOException {
+        return DigestUtils.sha256Hex(is);
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
deleted file mode 100644
index 14c0bb5a4..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect.zip;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-
-import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.HttpHeaders;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Test case for parsing zip files.
- */
-public class ZipParserTest extends TikaTest {
-
-
-    @Test
-    public void testKMZDetection() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
-        assertEquals("application/vnd.google-earth.kmz",
-                metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testJARDetection() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
-        assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
-    }
-
-    @Test
-    public void testStreaming() throws Exception {
-        long len = getLength("testJAR.jar");
-        System.out.println(len);
-        DefaultZipContainerDetector detector = new DefaultZipContainerDetector();
-        //detector.setMarkLimit(100);
-        try (InputStream is = ZipParserTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
-            System.out.println(detector.detect(is, new Metadata()));
-        }
-    }
-
-    private long getLength(String fileName) throws IOException {
-        return IOUtils.toByteArray(ZipParserTest.class.getResourceAsStream("/test-documents/" + fileName)).length;
-    }
-}

(tika) 02/04: TIKA-4395 -- more work

Reply via email to