Author: jukka
Date: Wed Aug  4 09:23:20 2010
New Revision: 982175

URL: http://svn.apache.org/viewvc?rev=982175&view=rev
Log:
TIKA-447: Container aware mimetype detection

Use TikaInputStream in the container-aware Detectors to avoid changing the 
visible state of the given stream

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Aug  4 09:23:20 2010
@@ -38,6 +38,19 @@ import org.apache.tika.metadata.Metadata
 public class TikaInputStream extends ProxyInputStream {
 
     /**
+     * Checks whether the given stream is a TikaInputStream instance.
+     * The given stream can be <code>null</code>, in which case the return
+     * value is <code>false</code>.
+     * 
+     * @param stream input stream, possibly <code>null</code>
+     * @return <code>true</code> if the stream is a TikaInputStream instance,
+     *         <code>false</code> otherwise
+     */
+    public static boolean isTikaInputStream(InputStream stream) {
+        return stream instanceof TikaInputStream;
+    }
+
+    /**
      * Casts or wraps the given stream to a TikaInputStream instance.
      * This method can be used to access the functionality of this class
      * even when given just a normal input stream instance.

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java Wed Aug  4 09:23:20 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.detect;
 
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
@@ -23,7 +24,6 @@ import org.apache.poi.poifs.filesystem.P
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 
 
@@ -32,20 +32,21 @@ import org.apache.tika.parser.microsoft.
  *  to figure out exactly what the file is
  */
 public class POIFSContainerDetector implements Detector {
+
     public MediaType detect(InputStream input, Metadata metadata)
              throws IOException {
-        POIFSFileSystem fs = new POIFSFileSystem(input);
+        if (TikaInputStream.isTikaInputStream(input)) {
+            TikaInputStream stream = TikaInputStream.get(input);
+
+            // NOTE: POIFSFileSystem will close the FileInputStream
+            POIFSFileSystem fs =
+                new POIFSFileSystem(new FileInputStream(stream.getFile()));
+            stream.setOpenContainer(fs);
 
-        POIFSDocumentType type =
-            OfficeParser.POIFSDocumentType.detectType(fs);
-        
-        if(input instanceof TikaInputStream) {
-            ((TikaInputStream)input).setOpenContainer(fs);
+            return POIFSDocumentType.detectType(fs).getType();
         } else {
-            fs = null;
+            return MediaType.application("x-tika-msoffice");
         }
-        
-        return type.getType();
     }
-}
 
+}

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java Wed Aug  4 09:23:20 2010
@@ -18,8 +18,9 @@ package org.apache.tika.detect;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collections;
 import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
+import java.util.zip.ZipFile;
 
 import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -37,29 +38,32 @@ import org.apache.tika.mime.MediaType;
  *  to figure out exactly what the file is
  */
 public class ZipContainerDetector implements Detector {
+
     public MediaType detect(InputStream input, Metadata metadata)
              throws IOException {
-       if(input instanceof TikaInputStream) {
-           return detect((TikaInputStream)input, metadata);
-       }
-       return detect( TikaInputStream.get(input), metadata );
+        if (TikaInputStream.isTikaInputStream(input)) {
+            return detect(TikaInputStream.get(input));
+        } else {
+            return MediaType.APPLICATION_ZIP;
+        }
     }
-    public MediaType detect(TikaInputStream input, Metadata metadata)
-             throws IOException {
-        ZipInputStream zip = new ZipInputStream(input);
-        ZipEntry entry = zip.getNextEntry();
-        while (entry != null) {
+
+    private MediaType detect(TikaInputStream input) throws IOException {
+        ZipFile zip = new ZipFile(input.getFile());
+        for (ZipEntry entry : Collections.list(zip.entries())) {
             // Is it an Open Document file?
             if (entry.getName().equals("mimetype")) {
-                String type = IOUtils.toString(zip, "UTF-8");
-                return fromString(type);
+                InputStream stream = zip.getInputStream(entry);
+                try {
+                    return fromString(IOUtils.toString(stream, "UTF-8"));
+                } finally {
+                    stream.close();
+                }
             } else if (entry.getName().equals("_rels/.rels") || 
                    entry.getName().equals("[Content_Types].xml")) {
                 // Office Open XML File
                // Ask POI to open and investigate it for us
                try {
-                   input.reset();
-                   
                    OPCPackage pkg = OPCPackage.open(input);
                    input.setOpenContainer(pkg);
                    
@@ -85,8 +89,6 @@ public class ZipContainerDetector implem
                // Java Jar
                return MediaType.application("java-archive");
             }
-            
-            entry = zip.getNextEntry();
         }
         
         return MediaType.APPLICATION_ZIP;


Reply via email to