Author: jukka
Date: Sun Aug 21 14:12:10 2011
New Revision: 1159985

URL: http://svn.apache.org/viewvc?rev=1159985&view=rev
Log:
TIKA-447: Container aware mimetype detection

Move the container detectors to the matching org.apache.tika.parser 
subpackages to avoid complicating the OSGi bundle classpath.

The ContainerAwareDetector class is no longer needed as the DefaultDetector 
will automatically load any available container detectors. Instead of directly 
removing the class, I moved it to tika-core and marked it as deprecated to 
prevent backwards compatibility problems with Tika 0.9 clients.

Added:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
   (contents, props changed)
      - copied, changed from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
      - copied, changed from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
      - copied, changed from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Removed:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Modified:
    
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Copied: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
 (from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
 Sun Aug 21 14:12:10 2011
@@ -18,12 +18,7 @@ package org.apache.tika.detect;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.zip.ZipException;
 
-import org.apache.poi.poifs.common.POIFSConstants;
-import org.apache.poi.poifs.storage.HeaderBlockConstants;
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -38,14 +33,14 @@ import org.apache.tika.mime.MimeTypes;
  *  to handle detection for non container formats. 
  * Should normally be used with a {@link TikaInputStream} to minimise 
  *  the memory usage.
+ *
+ * @deprecated Use the {@link DefaultDetector} class instead
  */
 public class ContainerAwareDetector implements Detector {
 
     private Detector fallbackDetector;
 
-    private Detector zipDetector;
-
-    private Detector poifsDetector;
+    private Detector defaultDetector;
 
     /**
      * Creates a new container detector, which will use the
@@ -54,16 +49,12 @@ public class ContainerAwareDetector impl
      */
     public ContainerAwareDetector(Detector fallbackDetector) {
         this.fallbackDetector = fallbackDetector;
-        poifsDetector = new POIFSContainerDetector();
-        zipDetector = new ZipContainerDetector();
+        this.defaultDetector = new DefaultDetector();
     }
 
     public MediaType detect(InputStream input, Metadata metadata)
             throws IOException {
-        MediaType type = zipDetector.detect(input, metadata);
-        if (MediaType.OCTET_STREAM.equals(type)) {
-            type = poifsDetector.detect(input, metadata);
-        }
+        MediaType type = defaultDetector.detect(input, metadata);
         if (MediaType.OCTET_STREAM.equals(type)) {
             return fallbackDetector.detect(input, metadata);
         }

Propchange: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
(original)
+++ 
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
Sun Aug 21 14:12:10 2011
@@ -1317,7 +1317,9 @@
     <glob pattern="*.wks"/>
     <glob pattern="*.wcm"/>
     <glob pattern="*.wdb"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
+
   <mime-type type="application/vnd.ms-wpl">
     <glob pattern="*.wpl"/>
   </mime-type>
@@ -1430,6 +1432,7 @@
       </match>
     </magic>
     <glob pattern="*.odf"/>
+    <sub-class-of type="application/zip"/>
   </mime-type>
 
   <mime-type type="application/vnd.oasis.opendocument.formula-template">

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
 Sun Aug 21 14:12:10 2011
@@ -31,7 +31,6 @@ import org.apache.poi.poifs.filesystem.O
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
-import org.apache.tika.detect.ZipContainerDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
@@ -40,6 +39,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 

Copied: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 (from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 Sun Aug 21 14:12:10 2011
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.detect;
+package org.apache.tika.parser.microsoft;
 
 import static org.apache.tika.mime.MediaType.application;
 
@@ -27,6 +27,8 @@ import java.util.Set;
 
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -89,46 +91,50 @@ public class POIFSContainerDetector impl
         }
 
         // We can only detect the exact type when given a TikaInputStream
-        if (!TikaInputStream.isTikaInputStream(input)) {
-            return OLE;
-        }
-
-        // Look for known top level entry names to detect the document type
-        Set<String> names = getTopLevelNames(TikaInputStream.get(input));
-        if (names.contains("Workbook")) {
-            return XLS;
-        } else if (names.contains("EncryptedPackage")) {
-            return OLE;
-        } else if (names.contains("WordDocument")) {
-            return DOC;
-        } else if (names.contains("Quill")) {
-            return PUB;
-        } else if (names.contains("PowerPoint Document")) {
-            return PPT;
-        } else if (names.contains("VisioDocument")) {
-            return VSD;
-        } else if (names.contains("CONTENTS")) {
-            return WPS;
-        } else if (names.contains("\u0001Ole10Native")) {
-            return OLE;
-        } else if (names.contains("PerfectOffice_MAIN")) {
-            if (names.contains("SlideShow")) {
-                return MediaType.application("x-corelpresentations"); // .shw
-            } else if (names.contains("PerfectOffice_OBJECTS")) {
-                return MediaType.application("x-quattro-pro"); // .wb?
-            } else {
-                return OLE;
-            }
-        } else if (names.contains("NativeContent_MAIN")) {
-            return MediaType.application("x-quattro-pro"); // .qpw
-        } else {
-            for (String name : names) {
-                if (name.startsWith("__substg1.0_")) {
-                    return MSG;
+        if (TikaInputStream.isTikaInputStream(input)) {
+            TemporaryFiles tmp = new TemporaryFiles();
+            try {
+                // Look for known top level entry names to detect the document type
+                Set<String> names =
+                    getTopLevelNames(TikaInputStream.get(input, tmp));
+                if (names.contains("Workbook")) {
+                    return XLS;
+                } else if (names.contains("EncryptedPackage")) {
+                    return OLE;
+                } else if (names.contains("WordDocument")) {
+                    return DOC;
+                } else if (names.contains("Quill")) {
+                    return PUB;
+                } else if (names.contains("PowerPoint Document")) {
+                    return PPT;
+                } else if (names.contains("VisioDocument")) {
+                    return VSD;
+                } else if (names.contains("CONTENTS")) {
+                    return WPS;
+                } else if (names.contains("\u0001Ole10Native")) {
+                    return OLE;
+                } else if (names.contains("PerfectOffice_MAIN")) {
+                    if (names.contains("SlideShow")) {
+                        return MediaType.application("x-corelpresentations"); // .shw
+                    } else if (names.contains("PerfectOffice_OBJECTS")) {
+                        return MediaType.application("x-quattro-pro"); // .wb?
+                    }
+                } else if (names.contains("NativeContent_MAIN")) {
+                    return MediaType.application("x-quattro-pro"); // .qpw
+                } else {
+                    for (String name : names) {
+                        if (name.startsWith("__substg1.0_")) {
+                            return MSG;
+                        }
+                    }
                 }
+            } finally {
+                tmp.dispose();
             }
-            return OLE;
         }
+
+        // Couldn't detect a more specific type
+        return OLE;
     }
 
     private static Set<String> getTopLevelNames(TikaInputStream stream)

Copied: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 (from r1159980, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java&r1=1159980&r2=1159985&rev=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 Sun Aug 21 14:12:10 2011
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.detect;
+package org.apache.tika.parser.pkg;
 
 import java.io.File;
 import java.io.IOException;
@@ -28,7 +28,9 @@ import org.apache.poi.openxml4j.opc.OPCP
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -68,8 +70,9 @@ public class ZipContainerDetector implem
             return MediaType.APPLICATION_ZIP;
         }
 
+        TemporaryFiles tmp = new TemporaryFiles();
         try {
-            File file = TikaInputStream.get(input).getFile();
+            File file = TikaInputStream.get(input, tmp).getFile();
             ZipFile zip = new ZipFile(file);
 
             MediaType type = detectOpenDocument(zip);
@@ -88,6 +91,8 @@ public class ZipContainerDetector implem
             return type;
         } catch (IOException e) {
             return MediaType.APPLICATION_ZIP;
+        } finally {
+            tmp.dispose();
         }
     }
 
@@ -168,4 +173,5 @@ public class ZipContainerDetector implem
             return null;
         }
     }
+
 }
\ No newline at end of file

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 Sun Aug 21 14:12:10 2011
@@ -13,5 +13,5 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.tika.detect.POIFSContainerDetector
-org.apache.tika.detect.ZipContainerDetector
+org.apache.tika.parser.microsoft.POIFSContainerDetector
+org.apache.tika.parser.pkg.ZipContainerDetector

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1159985&r1=1159984&r2=1159985&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Sun Aug 21 14:12:10 2011
@@ -32,7 +32,7 @@ import org.apache.tika.mime.MimeTypes;
  */
 public class TestContainerAwareDetector extends TestCase {
 
-    private final ContainerAwareDetector detector =
+    private final Detector detector =
         new ContainerAwareDetector(MimeTypes.getDefaultMimeTypes());
 
     private void assertDetect(String file, String type) throws Exception {
@@ -135,7 +135,7 @@ public class TestContainerAwareDetector 
         TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
         try {
             assertEquals(
-                    MediaType.APPLICATION_ZIP,
+                    MediaType.application("x-tika-ooxml"),
                     detector.detect(xlsx, new Metadata()));
         } finally {
             xlsx.close();


Reply via email to