Repository: tika Updated Branches: refs/heads/master a46ffacf1 -> d6981ad81
TIKA-1358 add preliminary detection for iWorks 2013 file types Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d6981ad8 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d6981ad8 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d6981ad8 Branch: refs/heads/master Commit: d6981ad81334eb20174004d7c0d96acd9f1d2f12 Parents: a46ffac Author: tballison <[email protected]> Authored: Wed Jun 22 11:56:37 2016 -0400 Committer: tballison <[email protected]> Committed: Wed Jun 22 11:56:37 2016 -0400 ---------------------------------------------------------------------- .../iwork/iwana/IWork13PackageParser.java | 86 +++++++++++++++++++ .../tika/parser/pkg/ZipContainerDetector.java | 12 +++ .../tika/detect/TestContainerAwareDetector.java | 11 +++ .../test-documents/testKeynote2013.key | Bin 0 -> 274397 bytes .../resources/test-documents/testKeynoteNew.key | Bin 274397 -> 0 bytes .../test-documents/testNumbers2013.numbers | Bin 0 -> 179147 bytes .../test-documents/testNumbersNew.numbers | Bin 179147 -> 0 bytes .../test-documents/testPages2013.pages | Bin 0 -> 237567 bytes .../resources/test-documents/testPagesNew.pages | Bin 237567 -> 0 bytes 9 files changed, 109 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java new file mode 100644 index 0000000..637b51b --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.iwork.iwana; + +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +public class IWork13PackageParser extends AbstractParser { + + public enum IWork13DocumentType { + KEYNOTE13(MediaType.application("vnd.apple.keynote.13")), + NUMBERS13(MediaType.application("vnd.apple.numbers.13")), + PAGES13(MediaType.application("vnd.apple.pages.13")), + UNKNOWN13(MediaType.application("vnd.apple.unknown.13")); + + private final MediaType mediaType; + + IWork13DocumentType(MediaType mediaType) { + this.mediaType = mediaType; + } + + public MediaType getType() { + return mediaType; + } + + public static MediaType detect(ZipFile zipFile) { + ZipArchiveEntry entry = zipFile.getEntry("Index/MasterSlide.iwa"); + if (zipFile.getEntry("Index/MasterSlide.iwa") != null || + zipFile.getEntry("Index/Slide.iwa") != null) { + return KEYNOTE13.getType(); + } + //TODO: figure out how to distinguish numbers from pages + return UNKNOWN13.getType(); + } + } + + /** + * All iWork 13 files contain this, so we can detect based on it + */ + public final static String IWORK13_COMMON_ENTRY = "Metadata/BuildVersionHistory.plist"; + + private final static Set<MediaType> supportedTypes = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + IWork13DocumentType.KEYNOTE13.getType(), + IWork13DocumentType.NUMBERS13.getType(), + IWork13DocumentType.PAGES13.getType() + ))); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return supportedTypes; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + //no-op for now + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index 12f22bc..d43a17c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -50,6 +50,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.iwork.IWorkPackageParser; import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType; +import org.apache.tika.parser.iwork.iwana.IWork13PackageParser; import static java.nio.charset.StandardCharsets.UTF_8; @@ -147,6 +148,9 @@ public class ZipContainerDetector implements Detector { type = detectOPCBased(zip, tis); } if (type == null) { + type = detectIWork13(zip); + } + if (type == null) { type = detectIWork(zip); } if (type == null) { @@ -300,6 +304,14 @@ public class ZipContainerDetector implements Detector { } } + private static MediaType detectIWork13(ZipFile zip) { + if (zip.getEntry(IWork13PackageParser.IWORK13_COMMON_ENTRY) != null) { + return IWork13PackageParser.IWork13DocumentType.detect(zip); + } + return null; + + } + private static MediaType detectIWork(ZipFile zip) { if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) { // Locate the appropriate index file entry, and reads from that http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index 5787408..828c55c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -31,6 +31,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.iwork.iwana.IWork13PackageParser; import org.junit.Test; /** @@ -316,6 +317,16 @@ public class TestContainerAwareDetector { } @Test + public void testDetectIWork2013() throws Exception { + assertTypeByData("testKeynote2013.key", + IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString()); + assertTypeByData("testNumbers2013.numbers", + IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString()); + assertTypeByData("testPages2013.pages", + IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString()); + } + + @Test public void testDetectKMZ() throws Exception { assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz"); } http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynote2013.key ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testKeynote2013.key b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key new file mode 100644 index 0000000..d0dd416 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testKeynote2013.key differ http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key b/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key deleted file mode 100644 index d0dd416..0000000 Binary files a/tika-parsers/src/test/resources/test-documents/testKeynoteNew.key and /dev/null differ http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers new file mode 100644 index 0000000..3f9a013 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testNumbers2013.numbers differ http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers b/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers deleted file mode 100644 index 3f9a013..0000000 Binary files a/tika-parsers/src/test/resources/test-documents/testNumbersNew.numbers and /dev/null differ http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPages2013.pages ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testPages2013.pages b/tika-parsers/src/test/resources/test-documents/testPages2013.pages new file mode 100644 index 0000000..b82ac7a Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPages2013.pages differ http://git-wip-us.apache.org/repos/asf/tika/blob/d6981ad8/tika-parsers/src/test/resources/test-documents/testPagesNew.pages ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages b/tika-parsers/src/test/resources/test-documents/testPagesNew.pages deleted file mode 100644 index b82ac7a..0000000 Binary files a/tika-parsers/src/test/resources/test-documents/testPagesNew.pages and /dev/null differ
