Author: nick Date: Tue Oct 19 14:56:54 2010 New Revision: 1024255 URL: http://svn.apache.org/viewvc?rev=1024255&view=rev Log: Add iWork support to the Container Aware Detector (TIKA-533) It's a bit icky for now, but it works and it's quick...
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip (with props) Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=1024255&r1=1024254&r2=1024255&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java Tue Oct 19 14:56:54 2010 @@ -31,6 +31,7 @@ import org.apache.tika.io.IOUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.iwork.IWorkPackageParser; /** @@ -87,7 +88,12 @@ public class ZipContainerDetector implem throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage()); } } else if(entry.getName().equals("buildVersionHistory.plist")) { - // TODO - iWork + // This is an iWork document + + // Reset and ask + zip.close(); + zip = new ZipFile(input.getFile()); + return IWorkPackageParser.identifyType(zip); } else if(entry.getName().equals("META-INF/")) { // Java Jar return MediaType.application("java-archive"); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java?rev=1024255&r1=1024254&r2=1024255&view=diff 
============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java Tue Oct 19 14:56:54 2010 @@ -16,6 +16,15 @@ */ package org.apache.tika.parser.iwork; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; @@ -29,13 +38,6 @@ import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - /** * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files. * This parser delegates the relevant files to {@link IWorkParser} that parsers the content. @@ -78,4 +80,39 @@ public class IWorkPackageParser implemen throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } + + /** + * Locates the appropriate index file entry, and reads from that + * the root element of the document. That is used to the identify + * the correct type of the keynote container. 
+ */ + public static MediaType identifyType(ZipFile zip) + throws IOException { + for (ZipEntry entry : Collections.list(zip.entries())) { + if (relevantFileNames.contains(entry.getName())) { + // Bingo, found the right entry + + // Grab the first few hundred bytes of the file + // This is quite sick, but it is much quicker and less + // memory intensive than the "proper" way to do it! + byte[] data = new byte[400]; + InputStream stream = zip.getInputStream(entry); + stream.read(data); + + String docStart = new String(data, "UTF-8"); + if(docStart.contains("ls:document")) { + return MediaType.application("vnd.apple.numbers"); + } + if(docStart.contains("sl:document")) { + return MediaType.application("vnd.apple.pages"); + } + if(docStart.contains("key:presentation")) { + return MediaType.application("vnd.apple.keynote"); + } + } + } + + // Not sure, fallback to the container type + return MediaType.application("vnd.apple.iwork"); + } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1024255&r1=1024254&r2=1024255&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Tue Oct 19 14:56:54 2010 @@ -172,7 +172,25 @@ public class TestContainerAwareDetector } public void testDetectIWork() throws Exception { - // TODO + TikaInputStream tis; + + tis = TikaInputStream.get(getTestDoc("testKeynote.key")); + assertEquals( + MediaType.application("vnd.apple.keynote"), + d.detect(tis, new Metadata()) + ); + + tis = TikaInputStream.get(getTestDoc("testNumbers.numbers")); + assertEquals( + MediaType.application("vnd.apple.numbers"), + d.detect(tis, new Metadata()) 
+ ); + + tis = TikaInputStream.get(getTestDoc("testPages.pages")); + assertEquals( + MediaType.application("vnd.apple.pages"), + d.detect(tis, new Metadata()) + ); } public void testDetectZip() throws Exception { @@ -184,6 +202,12 @@ public class TestContainerAwareDetector d.detect(tis, new Metadata()) ); + tis = TikaInputStream.get(getTestDoc("test-zip-of-zip.zip")); + assertEquals( + MediaType.application("zip"), + d.detect(tis, new Metadata()) + ); + tis = TikaInputStream.get(getTestDoc("testJAR.jar")); assertEquals( MediaType.application("java-archive"), Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip?rev=1024255&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test-zip-of-zip.zip ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream