Author: nick
Date: Wed Jul 28 13:55:20 2010
New Revision: 980058

URL: http://svn.apache.org/viewvc?rev=980058&view=rev
Log:
TIKA-447 - Container aware mimetype detection
Initial implementation of container aware detection. New ContainerAwareDetector 
class, which is a Detector, which will open and handle OLE2 and Zip files to 
detect the mimetype, falling back on a specified default detector for 
non-container formats.
Some work remains - Not all Zip file based things are detected yet, and the Zip 
based parsers don't yet take advantage of the already open zip stream. (OLE2 
ones can)

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ogg/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=980058&r1=980057&r2=980058&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java 
Wed Jul 28 13:55:20 2010
@@ -168,6 +168,13 @@ public class TikaInputStream extends Pro
      * Marked position, or -1 if there is no current mark.
      */
     private long mark = -1;
+    
+    /**
+     * An opened container, such as a POIFS FileSystem
+     *  for an OLE2 document, or a Zip file for a
+     *  zip based (eg ooxml, odf) document.
+     */
+    private Object openContainer;
 
     /**
      * 
@@ -211,6 +218,26 @@ public class TikaInputStream extends Pro
 
         return n;
     }
+    
+    /**
+     * Returns the open container object, such as a
+     *  POIFS FileSystem in the event of an OLE2
+     *  document being detected and processed by
+     *  the OLE2 detector.
+     *
+     * @return the object last stored with
+     *  {@link #setOpenContainer(Object)}, or null if
+     *  nothing has been stored or the stream has
+     *  since been closed
+     */
+    public Object getOpenContainer() {
+        return openContainer;
+    }
+    
+    /**
+     * Stores the open container object against
+     *  the stream, eg after a Zip contents
+     *  detector has loaded the file to decide
+     *  what it contains.
+     *
+     * @param container the opened container to keep
+     *  with this stream; passing null clears any
+     *  previously stored container
+     */
+    public void setOpenContainer(Object container) {
+        openContainer = container;
+    }
 
     public File getFile() throws IOException {
         if (file == null) {
@@ -298,6 +325,11 @@ public class TikaInputStream extends Pro
         super.mark(readlimit);
         mark = position;
     }
+    
+    /**
+     * Reports that this stream supports {@link #mark(int)} and
+     *  {@link #reset()}; the answer is always true, whatever
+     *  the wrapped stream would report.
+     */
+    @Override
+    public boolean markSupported() {
+       return true;
+    }
 
     @Override
     public void reset() throws IOException {
@@ -309,6 +341,9 @@ public class TikaInputStream extends Pro
 
     @Override
     public void close() throws IOException {
+        if (openContainer != null) {
+            openContainer = null;
+        }
         if (in != null) {
             in.close();
             in = null;

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java?rev=980058&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
 Wed Jul 28 13:55:20 2010
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.poifs.storage.HeaderBlockConstants;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypes;
+
+
+/**
+ * A detector that knows about the container formats that we support 
+ *  (eg POIFS, Zip), and is able to peek inside them to better figure 
+ *  out the contents.
+ * Delegates to another {@link Detector} (normally {@link MimeTypes})
+ *  to handle detection for non container formats. 
+ * Should normally be used with a {@link TikaInputStream} to minimise 
+ *  the memory usage.
+ */
+public class ContainerAwareDetector implements Detector {
+    /** Detector consulted for anything that is not a Zip or OLE2 file. */
+    private final Detector fallbackDetector;
+    private final ZipContainerDetector zipDetector;
+    private final POIFSContainerDetector poifsDetector;
+
+    /**
+     * Creates a new container detector, which will use the
+     *  given detector for non container formats.
+     * @param fallbackDetector The detector to use for non-containers
+     */
+    public ContainerAwareDetector(Detector fallbackDetector) {
+        this.fallbackDetector = fallbackDetector;
+        poifsDetector = new POIFSContainerDetector();
+        zipDetector = new ZipContainerDetector();
+    }
+
+    /**
+     * Detects the type of the given stream, wrapping it in a
+     *  {@link TikaInputStream} before inspecting it.
+     *
+     * @param input the document stream, or null if not available
+     * @param metadata the document metadata, passed on unchanged
+     * @return the type reported by the container or fallback detector
+     * @throws IOException if the stream cannot be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Without a stream there is nothing to wrap or peek into, so
+        //  let the fallback work from the metadata alone
+        if (input == null) {
+            return fallbackDetector.detect(input, metadata);
+        }
+        return detect(TikaInputStream.get(input), metadata);
+    }
+
+    /**
+     * Detects the type of the given stream from its first 8 bytes:
+     *  a Zip header goes to the Zip detector, an OLE2 signature to
+     *  the POIFS detector, and anything else to the fallback detector.
+     *
+     * @param input the document stream, or null if not available
+     * @param metadata the document metadata, passed on unchanged
+     * @return the type reported by the container or fallback detector
+     * @throws IOException if the stream cannot be read
+     */
+    public MediaType detect(TikaInputStream input, Metadata metadata)
+            throws IOException {
+        // Without a stream there are no leading bytes to examine
+        if (input == null) {
+            return fallbackDetector.detect(input, metadata);
+        }
+
+        // Grab the first 8 bytes, used to do container detection, then
+        //  rewind so the chosen detector sees the stream from its start.
+        //  A shorter stream leaves the remaining bytes as zero.
+        input.mark(8);
+        byte[] first8 = new byte[8];
+        IOUtils.readFully(input, first8);
+        input.reset();
+
+        // Is this a zip file? Only the first four bytes are compared
+        if (first8[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
+            first8[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
+            first8[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
+            first8[3] == POIFSConstants.OOXML_FILE_HEADER[3]) {
+            return zipDetector.detect(input, metadata);
+        }
+
+        // Is this an ole2 file? All eight bytes form the signature
+        long ole2Signature = LittleEndian.getLong(first8, 0);
+        if (ole2Signature == HeaderBlockConstants._signature) {
+            return poifsDetector.detect(input, metadata);
+        }
+
+        // Not a supported container, ask our fall back
+        //  detector to figure it out
+        return fallbackDetector.detect(input, metadata);
+    }
+}
+

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java?rev=980058&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
 Wed Jul 28 13:55:20 2010
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+
+/**
+ * A detector that works on a POIFS OLE2 document
+ *  to figure out exactly what the file is
+ */
+public class POIFSContainerDetector implements Detector {
+    /**
+     * Loads the stream as a POIFS filesystem and reports the type
+     *  of the document found inside it.
+     * <p>
+     * When the stream is a {@link TikaInputStream}, the loaded
+     *  filesystem is stored on it with
+     *  {@link TikaInputStream#setOpenContainer(Object)}, so it can be
+     *  fetched again from the stream rather than re-read.
+     *
+     * @param input the OLE2 document stream, or null if not available
+     * @param metadata the document metadata (not consulted here)
+     * @return the detected type, or application/octet-stream when
+     *  there is no stream to inspect
+     * @throws IOException if the filesystem cannot be loaded
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+             throws IOException {
+        // Nothing to open, so nothing can be said about the type
+        if (input == null) {
+            return MediaType.application("octet-stream");
+        }
+
+        POIFSFileSystem fs = new POIFSFileSystem(input);
+
+        POIFSDocumentType type =
+            OfficeParser.POIFSDocumentType.detectType(fs);
+
+        // Keep the loaded filesystem with the stream when there is
+        //  somewhere to put it; otherwise it is simply dropped
+        if (input instanceof TikaInputStream) {
+            ((TikaInputStream)input).setOpenContainer(fs);
+        }
+
+        return type.getType();
+    }
+}
+

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=980058&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 Wed Jul 28 13:55:20 2010
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+
+/**
+ * A detector that works on a Zip document
+ *  to figure out exactly what the file is
+ */
+public class ZipContainerDetector implements Detector {
+    /**
+     * Walks the entries of the zip stream looking for an entry
+     *  that identifies the document format.
+     * <p>
+     * The zip stream opened here is not closed, since closing it
+     *  would also close the stream passed in by the caller.
+     *
+     * @param input the zip document stream, or null if not available
+     * @param metadata the document metadata (not consulted here)
+     * @return the type named by a "mimetype" entry if one is found
+     *  and contains a '/', application/octet-stream when there is
+     *  no stream to inspect, and application/zip otherwise
+     * @throws IOException if the zip stream cannot be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+             throws IOException {
+        // Nothing to open, so nothing can be said about the type
+        if (input == null) {
+            return MediaType.application("octet-stream");
+        }
+
+        ZipInputStream zip = new ZipInputStream(input);
+        for (ZipEntry entry = zip.getNextEntry();
+                entry != null;
+                entry = zip.getNextEntry()) {
+            // Is it an Open Document file? Those carry their own
+            //  type as the contents of the "mimetype" entry
+            if ("mimetype".equals(entry.getName())) {
+                String type = IOUtils.toString(zip, "UTF-8");
+                int splitAt = type.indexOf('/');
+                if (splitAt > -1) {
+                    return new MediaType(
+                           type.substring(0, splitAt),
+                           type.substring(splitAt + 1)
+                    );
+                }
+                return MediaType.APPLICATION_ZIP;
+            }
+            // TODO Office Open XML, via the "[Content_Types].xml" entry
+        }
+
+        return MediaType.APPLICATION_ZIP;
+    }
+}
+

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=980058&r1=980057&r2=980058&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Wed Jul 28 13:55:20 2010
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.D
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
@@ -151,7 +152,13 @@ public class OfficeParser implements Par
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
-        POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+        POIFSFileSystem filesystem;
+        if(stream instanceof TikaInputStream && 
+               ((TikaInputStream)stream).getOpenContainer() != null) {
+            filesystem = 
(POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
+        } else {
+            filesystem = new POIFSFileSystem(stream);
+        }
 
         // Parse summary entries first, to make metadata available early
         new SummaryExtractor(metadata).parseSummaries(filesystem);

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=980058&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Wed Jul 28 13:55:20 2010
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * JUnit test class for {@link ContainerAwareDetector}
+ */
+public class TestContainerAwareDetector extends TestCase {
+    private TikaConfig tc;
+    private ContainerAwareDetector d;
+
+    public void setUp() throws Exception {
+        tc = TikaConfig.getDefaultConfig();
+        d = new ContainerAwareDetector(tc.getMimeRepository());
+    }
+
+    /**
+     * Opens the named file from the test documents directory,
+     *  failing the test if it is missing. The caller closes it.
+     */
+    private InputStream getTestDoc(String filename) {
+        InputStream input =
+            TestContainerAwareDetector.class.getResourceAsStream(
+                "/test-documents/" + filename);
+        assertNotNull("Test document not found: " + filename, input);
+        return input;
+    }
+
+    /**
+     * Runs detection on the named test document through the plain
+     *  InputStream entry point, and always closes the stream.
+     */
+    private void assertDetected(MediaType expected, String filename)
+            throws Exception {
+        InputStream input = getTestDoc(filename);
+        try {
+            assertEquals(expected, d.detect(input, new Metadata()));
+        } finally {
+            input.close();
+        }
+    }
+
+    public void testDetectOLE2() throws Exception {
+        assertDetected(
+               MediaType.application("vnd.ms-excel"), "testEXCEL.xls");
+        assertDetected(
+               MediaType.application("msword"), "testWORD.doc");
+        assertDetected(
+               MediaType.application("vnd.ms-powerpoint"), "testPPT.ppt");
+
+        TikaInputStream tis = TikaInputStream.get(getTestDoc("testPPT.ppt"));
+        try {
+            assertEquals(
+                   MediaType.application("vnd.ms-powerpoint"),
+                   d.detect(tis, new Metadata())
+            );
+            // Must be checked before close(), which clears the container
+            assertNotNull(tis.getOpenContainer());
+        } finally {
+            tis.close();
+        }
+    }
+
+    public void testDetectODF() throws Exception {
+        assertDetected(
+               MediaType.application("vnd.oasis.opendocument.text"),
+               "testODFwithOOo3.odt");
+        assertDetected(
+               MediaType.application("vnd.oasis.opendocument.formula"),
+               "testOpenOffice2.odf");
+
+        TikaInputStream tis =
+            TikaInputStream.get(getTestDoc("testOpenOffice2.odf"));
+        try {
+            assertEquals(
+                   MediaType.application("vnd.oasis.opendocument.formula"),
+                   d.detect(tis, new Metadata())
+            );
+            // Doesn't store the zip parser yet
+            assertNull(tis.getOpenContainer());
+        } finally {
+            tis.close();
+        }
+    }
+
+    public void testDetectOOXML() throws Exception {
+        // TODO No assertions yet
+    }
+
+    public void testDetectZip() throws Exception {
+        // TODO No assertions yet
+    }
+}


Reply via email to