This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push: new 462ee47 TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt 462ee47 is described below commit 462ee4744fd426cfdb12539435627b25e789c912 Author: tballison <talli...@mitre.org> AuthorDate: Wed Mar 7 14:55:06 2018 -0500 TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt --- CHANGES.txt | 4 ++ .../tika/parser/pkg/ZipContainerDetector.java | 42 ++++++++++++++++- .../tika/parser/pkg/ZipContainerDetectorTest.java | 55 ++++++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 3f0f31a..71cb60b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,10 @@ Release 2.0.0 - ??? Other changes + * Add workaround to identify TIFFs that might confuse + commons-compress's tar detection via Daniel Schmidt + (TIKA-2591) + * Ignore non-IANA supported charsets in HTML meta-headers during charset detection in HTMLEncodingDetector via Andreas Meier (TIKA-2592) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index 65e2e1d..c453617 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.Enumeration; import java.util.HashSet; import java.util.Iterator; @@ -56,6 +57,19 @@ import org.apache.tika.parser.iwork.iwana.IWork13PackageParser; * formats to figure out exactly what the file is. */ public class ZipContainerDetector implements Detector { + + //Regrettably, some tiff files can be incorrectly identified + //as tar files. We need this ugly workaround to rule out TIFF. + //If commons-compress ever chooses to take over TIFF detection + //we can remove all of this. See TIKA-2591. + private final static MediaType TIFF = MediaType.image("tiff"); + private final static byte[][] TIFF_SIGNATURES = new byte[3][]; + static { + TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a}; + TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00}; + TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b}; + } + private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE); // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes @@ -86,8 +100,11 @@ public class ZipContainerDetector implements Detector { int length = tis.peek(prefix); MediaType type = detectArchiveFormat(prefix, length); - if (PackageParser.isZipArchive(type) - && TikaInputStream.isTikaInputStream(input)) { + + if (type == TIFF) { + return TIFF; + } else if (PackageParser.isZipArchive(type) + && TikaInputStream.isTikaInputStream(input)) { return detectZipFormat(tis); } else if (!type.equals(MediaType.OCTET_STREAM)) { return type; @@ -112,7 +129,28 @@ public class ZipContainerDetector implements Detector { } } + private static boolean isTiff(byte[] prefix) { + for (byte[] sig : TIFF_SIGNATURES) { + if(arrayStartWith(sig, prefix)) { + return true; + } + } + return false; + } + + private static boolean arrayStartWith(byte[] needle, byte[] haystack) { + for (int i = 0; i < needle.length; i++) { + if (haystack[i] != needle[i]) { + return false; + } + } + return true; + } + private static MediaType detectArchiveFormat(byte[] prefix, int length) { + if (isTiff(prefix)) { + return TIFF; + } try { String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length)); return PackageParser.getMediaType(name); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java new file mode 100644 index 0000000..2865442 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.pkg; + + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.tika.TikaTest; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class ZipContainerDetectorTest extends TikaTest { + + @Test + public void testTiffWorkaround() throws Exception { + //TIKA-2591 + ZipContainerDetector zipContainerDetector = new ZipContainerDetector(); + Metadata metadata = new Metadata(); + try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) { + MediaType mt = zipContainerDetector.detect(is, metadata); + assertEquals(MediaType.image("tiff"), mt); + } + metadata = new Metadata(); + try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) { + MediaType mt = zipContainerDetector.detect(is, metadata); + assertEquals(MediaType.image("tiff"), mt); + } + + } +} \ No newline at end of file -- To stop receiving notification emails like this one, please contact talli...@apache.org.