Author: nick
Date: Tue Oct 19 14:56:54 2010
New Revision: 1024255

URL: http://svn.apache.org/viewvc?rev=1024255&view=rev
Log:
Add iWork support to the Container Aware Detector (TIKA-533)
It's a bit icky for now, but it works and it's quick...

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip   
(with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=1024255&r1=1024254&r2=1024255&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
 Tue Oct 19 14:56:54 2010
@@ -31,6 +31,7 @@ import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
 
 
 /**
@@ -87,7 +88,12 @@ public class ZipContainerDetector implem
                 throw new IOException("Office Open XML File detected, but 
corrupted - " + e.getMessage());
              }
           } else if(entry.getName().equals("buildVersionHistory.plist")) {
-             // TODO - iWork
+             // This is an iWork document
+             
+             // Reset and ask
+             zip.close();
+             zip = new ZipFile(input.getFile());
+             return IWorkPackageParser.identifyType(zip);
           } else if(entry.getName().equals("META-INF/")) {
              // Java Jar
              return MediaType.application("java-archive");

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java?rev=1024255&r1=1024254&r2=1024255&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
 Tue Oct 19 14:56:54 2010
@@ -16,6 +16,15 @@
  */
 package org.apache.tika.parser.iwork;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
 import org.apache.commons.compress.archivers.ArchiveEntry;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
@@ -29,13 +38,6 @@ import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
 /**
  * A parser for the IWork container files. This includes *.key, *.pages and 
*.numbers files.
  * This parser delegates the relevant files to {@link IWorkParser} that 
parsers the content.
@@ -78,4 +80,39 @@ public class IWorkPackageParser implemen
             throws IOException, SAXException, TikaException {
         parse(stream, handler, metadata, new ParseContext());
     }
+    
+    /**
+     * Finds a zip entry named in relevantFileNames and checks the start
+     *  of it for a known marker string, which is used to identify the
+     *  specific iWork type. Falls back to the generic vnd.apple.iwork.
+     */
+    public static MediaType identifyType(ZipFile zip)
+         throws IOException {
+       for (ZipEntry entry : Collections.list(zip.entries())) {
+          if (relevantFileNames.contains(entry.getName())) {
+             // Bingo, found a candidate index entry
+             
+             // Grab just the first few hundred bytes of the entry; this
+             //  is quicker and lighter on memory than the "proper" way
+             // NOTE(review): stream is never closed; read() may stop short
+             byte[] data = new byte[400];
+             InputStream stream = zip.getInputStream(entry);
+             stream.read(data);
+             
+             String docStart = new String(data, "UTF-8");
+             if(docStart.contains("ls:document")) {
+                return MediaType.application("vnd.apple.numbers");
+             }
+             if(docStart.contains("sl:document")) {
+                return MediaType.application("vnd.apple.pages");
+             }
+             if(docStart.contains("key:presentation")) {
+                return MediaType.application("vnd.apple.keynote");
+             }
+          }
+       }
+       
+       // No marker matched, fall back to the generic container type
+       return MediaType.application("vnd.apple.iwork");
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1024255&r1=1024254&r2=1024255&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Tue Oct 19 14:56:54 2010
@@ -172,7 +172,25 @@ public class TestContainerAwareDetector 
     }
     
     public void testDetectIWork() throws Exception {
-       // TODO
+       TikaInputStream tis;
+
+       tis = TikaInputStream.get(getTestDoc("testKeynote.key"));
+       assertEquals(
+             MediaType.application("vnd.apple.keynote"),
+             d.detect(tis, new Metadata())
+       );
+
+       tis = TikaInputStream.get(getTestDoc("testNumbers.numbers"));
+       assertEquals(
+             MediaType.application("vnd.apple.numbers"),
+             d.detect(tis, new Metadata())
+       );
+
+       tis = TikaInputStream.get(getTestDoc("testPages.pages"));
+       assertEquals(
+             MediaType.application("vnd.apple.pages"),
+             d.detect(tis, new Metadata())
+       );
     }
     
     public void testDetectZip() throws Exception {
@@ -184,6 +202,12 @@ public class TestContainerAwareDetector 
              d.detect(tis, new Metadata())
        );
 
+       tis = TikaInputStream.get(getTestDoc("test-zip-of-zip.zip"));
+       assertEquals(
+             MediaType.application("zip"),
+             d.detect(tis, new Metadata())
+       );
+
        tis = TikaInputStream.get(getTestDoc("testJAR.jar"));
        assertEquals(
              MediaType.application("java-archive"),

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip?rev=1024255&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to