This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4395 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0839824736aad31f4f1656983dc77bca3959896d Author: tallison <[email protected]> AuthorDate: Wed Apr 9 16:06:14 2025 -0400 TIKA-4395 -- more work --- .../java/org/apache/tika/io/TikaInputStream.java | 75 ++++++------ .../detect/microsoft/POIFSContainerDetector.java | 82 ++++++++----- .../microsoft/POIFSContainerDetectorTest.java | 90 ++++++++++++++ .../detect/zip/DefaultZipContainerDetector.java | 47 +++++--- .../apache/tika/detect/zip/ZipDetectionTest.java | 129 +++++++++++++++++++++ .../org/apache/tika/detect/zip/ZipParserTest.java | 65 ----------- 6 files changed, 339 insertions(+), 149 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index ea48487a0..0bebd1886 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -723,52 +723,55 @@ public class TikaInputStream extends TaggedInputStream { * @throws IOException */ public Path getPath(int maxBytes) throws IOException { - if (path == null) { - if (position > 0) { - throw new IOException("Stream is already being read"); - } else { - Path tmpFile = tmp.createTempFile(suffix); - if (maxBytes > -1) { - this.mark(maxBytes); - try (BoundedInputStream boundedInputStream = - new BoundedInputStream(maxBytes, this)) { + if (path != null) { + return path; + } + if (position > 0) { + throw new IOException("Stream is already being read"); + } else { + Path tmpFile = tmp.createTempFile(suffix); + if (maxBytes > -1) { + try (BoundedInputStream boundedInputStream = new BoundedInputStream(maxBytes, this)) { + boundedInputStream.mark(maxBytes); + try { Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING); if (boundedInputStream.hasHitBound()) { //tmpFile will be cleaned up when this TikaInputStream is closed return null; } } finally { - this.reset(); + boundedInputStream.reset(); } - } else { - // Spool the entire stream into a temporary file - Files.copy(this, tmpFile, REPLACE_EXISTING); } - //successful so far, set tis' path to tmpFile - path = tmpFile; - - // Create a new input stream and make sure it'll get closed - InputStream newStream = Files.newInputStream(path); - tmp.addResource(newStream); - - // Replace the spooled stream with the new stream in a way - // that still ends up closing the old stream if or when the - // close() method is called. The closing of the new stream - // is already being handled as noted above. - final InputStream oldStream = in; - in = new BufferedInputStream(newStream) { - @Override - public void close() throws IOException { - oldStream.close(); - } - }; - - // Update length to file size. Update position, mark - length = Files.size(path); - position = 0; - mark = -1; + } else { + // Spool the entire stream into a temporary file + Files.copy(this, tmpFile, REPLACE_EXISTING); } + //successful so far, set tis' path to tmpFile + path = tmpFile; + + // Create a new input stream and make sure it'll get closed + InputStream newStream = Files.newInputStream(path); + tmp.addResource(newStream); + + // Replace the spooled stream with the new stream in a way + // that still ends up closing the old stream if or when the + // close() method is called. The closing of the new stream + // is already being handled as noted above. + final InputStream oldStream = in; + in = new BufferedInputStream(newStream) { + @Override + public void close() throws IOException { + oldStream.close(); + } + }; + + // Update length to file size. Update position, mark + length = Files.size(path); + position = 0; + mark = -1; } + return path; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index 321a01f9a..2285630fb 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -38,6 +38,8 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.DocumentNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; @@ -247,6 +249,9 @@ public class POIFSContainerDetector implements Detector { */ private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); + private static final Logger LOG = LoggerFactory.getLogger(POIFSContainerDetector.class); + + @Field private int markLimit = -1; @@ -267,7 +272,7 @@ public class POIFSContainerDetector implements Detector { * @return */ public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry root) { - if (anyCaseNames == null || anyCaseNames.size() == 0) { + if (anyCaseNames == null || anyCaseNames.isEmpty()) { return OLE; } @@ -567,6 +572,8 @@ public class POIFSContainerDetector implements Detector { //if the stream was longer than markLimit, don't detect if (file == null) { + LOG.warn("File length exceeds marklimit. Skipping detection on this file. " + + "If you need precise detection, consider increasing the marklimit or setting it to -1"); return Collections.emptySet(); } @@ -581,6 +588,8 @@ public class POIFSContainerDetector implements Detector { } catch (IOException e) { // Parse error in POI, so we don't know the file type return Collections.emptySet(); + } catch (SecurityException e) { + throw e; } catch (RuntimeException e) { // Another problem in POI return Collections.emptySet(); @@ -593,48 +602,61 @@ public class POIFSContainerDetector implements Detector { return MediaType.OCTET_STREAM; } - // If this is a TikaInputStream wrapping an already - // parsed NPOIFileSystem/DirectoryNode, just get the - // names from the root: - TikaInputStream tis = TikaInputStream.cast(input); - Set<String> names = null; - if (tis != null) { - Object container = tis.getOpenContainer(); - if (container instanceof POIFSFileSystem) { - names = getTopLevelNames(((POIFSFileSystem) container).getRoot()); - } else if (container instanceof DirectoryNode) { - names = getTopLevelNames((DirectoryNode) container); - } + if (! isOleHeader(input)) { + return MediaType.OCTET_STREAM; } - if (names == null) { - // Check if the document starts with the OLE header - input.mark(8); - try { - if (input.read() != 0xd0 || input.read() != 0xcf || input.read() != 0x11 || - input.read() != 0xe0 || input.read() != 0xa1 || input.read() != 0xb1 || - input.read() != 0x1a || input.read() != 0xe1) { - return MediaType.OCTET_STREAM; - } - } catch (IOException e) { - return MediaType.OCTET_STREAM; - } finally { - input.reset(); - } + TikaInputStream tis = TikaInputStream.cast(input); + if (tis == null) { + LOG.warn("POIFSContainerDetector requires a TikaInputStream for precise detection."); + return OLE; } + Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata); + // We can only detect the exact type when given a TikaInputStream - if (names == null && tis != null) { - // Look for known top level entry names to detect the document type + if (names == null) { + // spool to disk and try detection names = getTopLevelNames(tis); } // Detect based on the names (as available) - if (tis != null && tis.getOpenContainer() != null && + if (tis.getOpenContainer() != null && tis.getOpenContainer() instanceof POIFSFileSystem) { return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { + //can we actually get here? return detect(names, null); } } + + private boolean isOleHeader(InputStream input) throws IOException { + input.mark(8); + try { + return (input.read() == 0xd0 && input.read() == 0xcf && input.read() == 0x11 && input.read() == 0xe0 && input.read() == 0xa1 && input.read() == 0xb1 && + input.read() == 0x1a && input.read() == 0xe1); + } finally { + input.reset(); + } + } + + + public static Set<String> tryOpenContainerOnTikaInputStream(InputStream input, Metadata metadata) { + // If this is a TikaInputStream wrapping an already + // parsed NPOIFileSystem/DirectoryNode, just get the + // names from the root: + TikaInputStream tis = TikaInputStream.cast(input); + Set<String> names = null; + if (tis != null) { + Object container = tis.getOpenContainer(); + if (container instanceof POIFSFileSystem) { + return getTopLevelNames(((POIFSFileSystem) container).getRoot()); + } else if (container instanceof DirectoryNode) { + return getTopLevelNames((DirectoryNode) container); + } + } + return null; + } + + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java new file mode 100644 index 000000000..bb2785a0d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/detect/microsoft/POIFSContainerDetectorTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.microsoft; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.codec.digest.DigestUtils; +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; + +public class POIFSContainerDetectorTest { + + @Test + public void testBasic() throws Exception { + String[] files = + new String[]{"testEXCEL.xls", "testWORD.doc", "testPPT.ppt", "testVISIO.vsd", + "test-outlook.msg"}; + String[] expected = + new String[]{ + "application/vnd.ms-excel", "application/msword", "application/vnd.ms-powerpoint", + "application/vnd.visio", "application/vnd.ms-outlook" + }; + for (String fileName : files) { + testStream(fileName, "application/x-tika-msoffice", -1); + testStream(fileName, "application/x-tika-msoffice", 0); + testStream(fileName, "application/x-tika-msoffice", 100); + testTikaInputStream(fileName, "application/x-tika-msoffice", 10); + } + for (int i = 0; i < files.length; i++) { + testTikaInputStream(files[i], expected[i], -1); + } + } + + private void testStream(String fileName, String expectedMime, int markLimit) throws IOException { + String expectedDigest = digest(getStream(fileName)); + POIFSContainerDetector detector = new POIFSContainerDetector(); + detector.setMarkLimit(markLimit); + try (InputStream is = getStream(fileName)) { + assertExpected(detector, is, expectedMime, expectedDigest); + } + } + + private void testTikaInputStream(String fileName, String expectedMime, int markLimit) throws IOException { + String expectedDigest = digest(getStream(fileName)); + POIFSContainerDetector detector = new POIFSContainerDetector(); + detector.setMarkLimit(markLimit); + try (InputStream is = TikaInputStream.get(getStream(fileName))) { + assertExpected(detector, is, expectedMime, expectedDigest); + } + } + + private InputStream getStream(String fileName) { + return POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" + fileName); + } + + private void assertExpected(Detector detector, InputStream is, String expectedMime, String expectedDigest) throws IOException { + MediaType mt = detector.detect(is, new Metadata()); + assertEquals(expectedMime, mt.toString()); + assertEquals(expectedDigest, digest(is)); + } + + private String digest(String fileName) throws IOException { + return digest(POIFSContainerDetectorTest.class.getResourceAsStream("/test-documents/" + fileName)); + } + + private String digest(InputStream is) throws IOException { + return DigestUtils.sha256Hex(is); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index b30a73e14..5b6567308 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -21,7 +21,6 @@ import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -44,7 +43,6 @@ import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.Detector; import org.apache.tika.io.BoundedInputStream; -import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -183,17 +181,15 @@ public class DefaultZipContainerDetector implements Detector { if (TikaInputStream.isTikaInputStream(input)) { TikaInputStream tis = TikaInputStream.cast(input); - return detectZipFormatOnFile(tis, metadata); - } else { - if (markLimit >= 0) { - return detectStreaming(input, metadata); + if (markLimit < 1 || tis.hasFile()) { + return detectZipFormatOnFile(tis, metadata); } else { - try (TemporaryResources tmp = new TemporaryResources()) { - try (TikaInputStream tis = TikaInputStream.get(input, tmp, new Metadata())) { - return detectZipFormatOnFile(tis, metadata); - } - } + return tryStreaming(tis, metadata); } + } else { + LOG.warn("Applying streaming detection in DefaultZipContainerDetector. " + + "This can lead to imprecise detection. Please consider using a TikaInputStream"); + return detectStreaming(input, metadata); } } else if (!type.equals(MediaType.OCTET_STREAM)) { return type; @@ -202,6 +198,23 @@ public class DefaultZipContainerDetector implements Detector { } } + private MediaType tryStreaming(TikaInputStream tis, Metadata metadata) throws IOException { + BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, tis); + boundedInputStream.mark(markLimit); + MediaType mt = null; + //try streaming detect + try { + mt = detectStreaming(boundedInputStream, metadata, false); + if (! boundedInputStream.hasHitBound()) { + return mt; + } + } finally { + boundedInputStream.reset(); + } + //spool to disk + return detectZipFormatOnFile(tis, metadata); + } + /** * This will call TikaInputStream's getFile(). If there are no exceptions, * it will place the ZipFile in TikaInputStream's openContainer and leave it @@ -245,15 +258,13 @@ public class DefaultZipContainerDetector implements Detector { return MediaType.APPLICATION_ZIP; } if (LOG.isDebugEnabled()) { - LOG.debug("zip file failed to open; attempting streaming detect"); + LOG.debug("zip file failed to open; attempting streaming detect. Results may be imprecise"); } - if (zip == null) { - //problem opening zip file (truncated?) - try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) { - return detectStreaming(is, metadata); - } catch (IOException e) { + //problem opening zip file (truncated?) + try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) { + return detectStreaming(is, metadata); + } catch (IOException e) { //swallow - } } return MediaType.APPLICATION_ZIP; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java new file mode 100644 index 000000000..a00bd6420 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipDetectionTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.zip; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.util.List; + +import org.apache.commons.codec.digest.DigestUtils; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; + +/** + * Test cases for detecting zip-based files. + */ +public class ZipDetectionTest extends TikaTest { + + + @Test + public void testKMZDetection() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz"); + assertEquals("application/vnd.google-earth.kmz", + metadataList.get(0).get(HttpHeaders.CONTENT_TYPE)); + } + + @Test + public void testJARDetection() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar"); + assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE)); + } + + @Test + public void testStreaming() throws Exception { + String expectedDigest = digest("testJAR.jar"); + DefaultZipContainerDetector detector = new DefaultZipContainerDetector(); + try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) { + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + + for (int markLimit : new int[]{-1,0,10,100,1000}) { + detector = new DefaultZipContainerDetector(); + //mark limit is ignored for a TikaInputStream + try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) { + detector.setMarkLimit(markLimit); + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + } + + detector = new DefaultZipContainerDetector(); + //mark limit is ignored for a TikaInputStream + try (InputStream is = TikaInputStream.get(getStream("testJAR.jar"))) { + detector.setMarkLimit(-1); + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + + detector = new DefaultZipContainerDetector(); + //try on a file that isn't a TikaInputStream + try (InputStream is = new BufferedInputStream(Files.newInputStream(TikaInputStream.get(getStream("testJAR.jar")).getPath()))) { + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + + detector = new DefaultZipContainerDetector(); + try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + + detector = new DefaultZipContainerDetector(); + detector.setMarkLimit(100); + try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { + assertExpected(detector, is, "application/zip", expectedDigest); + } + + detector = new DefaultZipContainerDetector(); + detector.setMarkLimit(0); + try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { + assertExpected(detector, is, "application/zip", expectedDigest); + } + + detector = new DefaultZipContainerDetector(); + detector.setMarkLimit(100000); + try (InputStream is = ZipDetectionTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { + assertExpected(detector, is, "application/java-archive", expectedDigest); + } + } + + private InputStream getStream(String fileName) { + return ZipDetectionTest.class.getResourceAsStream("/test-documents/" + fileName); + } + + private void assertExpected(Detector detector, InputStream is, String expectedMime, String expectedDigest) throws IOException { + MediaType mt = detector.detect(is, new Metadata()); + assertEquals(expectedMime, mt.toString()); + assertEquals(expectedDigest, digest(is)); + + } + + private String digest(String fileName) throws IOException { + return digest(ZipDetectionTest.class.getResourceAsStream("/test-documents/" + fileName)); + } + + private String digest(InputStream is) throws IOException { + return DigestUtils.sha256Hex(is); + } + +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java deleted file mode 100644 index 14c0bb5a4..000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.detect.zip; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - -import org.apache.tika.TikaTest; -import org.apache.tika.metadata.HttpHeaders; -import org.apache.tika.metadata.Metadata; - -/** - * Test case for parsing zip files. - */ -public class ZipParserTest extends TikaTest { - - - @Test - public void testKMZDetection() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz"); - assertEquals("application/vnd.google-earth.kmz", - metadataList.get(0).get(HttpHeaders.CONTENT_TYPE)); - } - - @Test - public void testJARDetection() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar"); - assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE)); - } - - @Test - public void testStreaming() throws Exception { - long len = getLength("testJAR.jar"); - System.out.println(len); - DefaultZipContainerDetector detector = new DefaultZipContainerDetector(); - //detector.setMarkLimit(100); - try (InputStream is = ZipParserTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { - System.out.println(detector.detect(is, new Metadata())); - } - } - - private long getLength(String fileName) throws IOException { - return IOUtils.toByteArray(ZipParserTest.class.getResourceAsStream("/test-documents/" + fileName)).length; - } -}
