This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b4047eb2d92ee4ae8d8e02d12079232419775a73
Author: tballison <talli...@mitre.org>
AuthorDate: Wed Mar 7 14:55:06 2018 -0500

    TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt
---
 CHANGES.txt                                        |  4 ++
 .../tika/parser/pkg/ZipContainerDetector.java      | 42 ++++++++++++++++-
 .../tika/parser/pkg/ZipContainerDetectorTest.java  | 55 ++++++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 73d3d68..9b05d80 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 Release 1.18 - ???
 
+   * Add workaround to identify TIFFs that might confuse
+     commons-compress's tar detection via Daniel Schmidt
+     (TIKA-2591)
+
    * Ignore non-IANA supported charsets in HTML meta-headers
      during charset detection in HTMLEncodingDetector
      via Andreas Meier (TIKA-2592)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 65e2e1d..c453617 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -56,6 +57,19 @@ import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
  * formats to figure out exactly what the file is.
  */
 public class ZipContainerDetector implements Detector {
+
+    //Regrettably, some tiff files can be incorrectly identified
+    //as tar files.  We need this ugly workaround to rule out TIFF.
+    //If commons-compress ever chooses to take over TIFF detection
+    //we can remove all of this. See TIKA-2591.
+    private final static MediaType TIFF = MediaType.image("tiff");
+    private final static byte[][] TIFF_SIGNATURES = new byte[3][];
+    static {
+        TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a};
+        TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00};
+        TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b};
+    }
+
     private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
 
     // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
@@ -86,8 +100,11 @@ public class ZipContainerDetector implements Detector {
             int length = tis.peek(prefix);
 
             MediaType type = detectArchiveFormat(prefix, length);
-            if (PackageParser.isZipArchive(type)
-                    && TikaInputStream.isTikaInputStream(input)) {
+
+            if (type == TIFF) {
+                return TIFF;
+            } else if (PackageParser.isZipArchive(type)
+                        && TikaInputStream.isTikaInputStream(input)) {
                 return detectZipFormat(tis);
             } else if (!type.equals(MediaType.OCTET_STREAM)) {
                 return type;
@@ -112,7 +129,28 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
+    private static boolean isTiff(byte[] prefix) {
+        for (byte[] sig : TIFF_SIGNATURES) {
+            if(arrayStartWith(sig, prefix)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
+        for (int i = 0; i < needle.length; i++) {
+            if (haystack[i] != needle[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+        if (isTiff(prefix)) {
+            return TIFF;
+        }
         try {
             String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
             return PackageParser.getMediaType(name);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
new file mode 100644
index 0000000..2865442
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class ZipContainerDetectorTest extends TikaTest {
+
+    @Test
+    public void testTiffWorkaround() throws Exception {
+        //TIKA-2591
+        ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
+        Metadata metadata = new Metadata();
+        try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
+            MediaType mt = zipContainerDetector.detect(is, metadata);
+            assertEquals(MediaType.image("tiff"), mt);
+        }
+        metadata = new Metadata();
+        try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) {
+            MediaType mt = zipContainerDetector.detect(is, metadata);
+            assertEquals(MediaType.image("tiff"), mt);
+        }
+
+    }
+}
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
talli...@apache.org.

Reply via email to