Author: nick
Date: Thu Jul 29 13:03:53 2010
New Revision: 980430

URL: http://svn.apache.org/viewvc?rev=980430&view=rev
Log:
Add Office Open XML (OOXML) support to the Zip container aware detector 
(TIKA-447)
If an OOXML zip file entry is found, the detector passes the stream to POI and 
derives the content type from the package's core document part. Also updates the 
OOXML text extractor to reuse the already-open package if detection was already done.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=980430&r1=980429&r2=980430&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 Thu Jul 29 13:03:53 2010
@@ -21,7 +21,13 @@ import java.io.InputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
@@ -33,28 +39,62 @@ import org.apache.tika.mime.MediaType;
 public class ZipContainerDetector implements Detector {
     public MediaType detect(InputStream input, Metadata metadata)
              throws IOException {
+       if(input instanceof TikaInputStream) {
+           return detect((TikaInputStream)input, metadata);
+       }
+       return detect( TikaInputStream.get(input), metadata );
+    }
+    public MediaType detect(TikaInputStream input, Metadata metadata)
+             throws IOException {
         ZipInputStream zip = new ZipInputStream(input);
         ZipEntry entry = zip.getNextEntry();
         while (entry != null) {
             // Is it an Open Document file?
             if (entry.getName().equals("mimetype")) {
                 String type = IOUtils.toString(zip, "UTF-8");
-                int splitAt = type.indexOf('/');
-                if(splitAt > -1) {
-                    return new MediaType(
-                           type.substring(0,splitAt), 
-                           type.substring(splitAt+1)
-                    );
+                return fromString(type);
+            } else if (entry.getName().equals("_rels/.rels") || 
+                   entry.getName().equals("[Content_Types].xml")) {
+                // Office Open XML File
+               // Ask POI to open and investigate it for us
+               try {
+                   input.reset();
+                   
+                   OPCPackage pkg = OPCPackage.open(input);
+                   input.setOpenContainer(pkg);
+                   
+                    PackageRelationshipCollection core = 
+                         
pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+                    if(core.size() != 1) {
+                       throw new IOException("Invalid OOXML Package received - 
expected 1 core document, found " + core.size());
+                    }
+
+                    // Get the type of the core document part
+                    PackagePart corePart = 
pkg.getPart(core.getRelationship(0));
+                    String coreType = corePart.getContentType();
+                    
+                    // Turn that into the type of the overall document
+                    String docType = coreType.substring(0, 
coreType.lastIndexOf('.'));
+                    return fromString(docType);
+                } catch(InvalidFormatException e) {
+                    throw new IOException("Office Open XML File detected, but 
corrupted", e);
                 }
-                return MediaType.APPLICATION_ZIP;
-            } else if (entry.getName().equals("[Content_Types].xml")) {
-                // Office Open XML
-               // TODO
             }
             entry = zip.getNextEntry();
         }
         
         return MediaType.APPLICATION_ZIP;
     }
+    
+    private static MediaType fromString(String type) {
+        int splitAt = type.indexOf('/');
+        if(splitAt > -1) {
+            return new MediaType(
+                   type.substring(0,splitAt), 
+                   type.substring(splitAt+1)
+            );
+        }
+        return MediaType.APPLICATION_ZIP;
+    }
 }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=980430&r1=980429&r2=980430&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 Thu Jul 29 13:03:53 2010
@@ -25,6 +25,7 @@ import org.apache.poi.POIXMLTextExtracto
 import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -32,6 +33,7 @@ import org.apache.poi.xssf.usermodel.XSS
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
@@ -50,8 +52,16 @@ public class OOXMLExtractorFactory {
         try {
             OOXMLExtractor extractor;
 
-            POIXMLTextExtractor poiExtractor =
-                (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
+            POIXMLTextExtractor poiExtractor;
+            if(stream instanceof TikaInputStream && 
+                   ((TikaInputStream)stream).getOpenContainer() != null) {
+               poiExtractor = ExtractorFactory.createExtractor(
+                    (OPCPackage)((TikaInputStream)stream).getOpenContainer()
+               );
+            } else {
+               poiExtractor = (POIXMLTextExtractor) 
ExtractorFactory.createExtractor(stream);
+            }
+            
             POIXMLDocument document = poiExtractor.getDocument();
             if (document instanceof XSLFSlideShow) {
                 extractor = new XSLFPowerPointExtractorDecorator(

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=980430&r1=980429&r2=980430&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Thu Jul 29 13:03:53 2010
@@ -20,6 +20,8 @@ import java.io.InputStream;
 
 import junit.framework.TestCase;
 
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -70,7 +72,9 @@ public class TestContainerAwareDetector 
                MediaType.application("vnd.ms-powerpoint"),
                d.detect(tis, new Metadata())
         );
+        
         assertNotNull(tis.getOpenContainer());
+        assertEquals(POIFSFileSystem.class, tis.getOpenContainer().getClass());
     }
     
     public void testDetectODF() throws Exception {
@@ -98,7 +102,37 @@ public class TestContainerAwareDetector 
     }
     
     public void testDetectOOXML() throws Exception {
-       
+        InputStream input;
+        
+        input = getTestDoc("testEXCEL.xlsx");
+        assertEquals(
+               
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+               d.detect(input, new Metadata())
+        );
+        
+        input = getTestDoc("testWORD.docx");
+        assertEquals(
+               
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
+               d.detect(input, new Metadata())
+        );
+        
+        input = getTestDoc("testPPT.pptx");
+        assertEquals(
+               
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+               d.detect(input, new Metadata())
+        );
+        
+        TikaInputStream tis = TikaInputStream.get(getTestDoc("testPPT.pptx"));
+        assertEquals(
+               
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+               d.detect(tis, new Metadata())
+        );
+        
+        assertNotNull(tis.getOpenContainer());
+        assertTrue(
+                "Open container should be OPCPackage, not " + 
tis.getOpenContainer().getClass(), 
+                tis.getOpenContainer() instanceof OPCPackage
+        );
     }
     
     public void testDetectZip() throws Exception {


Reply via email to