Author: nick
Date: Wed Jul 28 13:55:20 2010
New Revision: 980058
URL: http://svn.apache.org/viewvc?rev=980058&view=rev
Log:
TIKA-447 - Container aware mimetype detection
Initial implementation of container aware detection. New ContainerAwareDetector
class, which is a Detector, which will open and handle OLE2 and Zip files to
detect the mimetype, falling back on a specified default detector for
non-container formats.
Some work remains - Not all Zip file based things are detected yet, and the Zip
based parsers don't yet take advantage of the already open zip stream. (OLE2
ones can)
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ogg/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=980058&r1=980057&r2=980058&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
Wed Jul 28 13:55:20 2010
@@ -168,6 +168,13 @@ public class TikaInputStream extends Pro
* Marked position, or -1 if there is no current mark.
*/
private long mark = -1;
+
+ /**
+ * An opened container, such as a POIFS FileSystem
+ * for an OLE2 document, or a Zip file for a
+ * zip based (eg ooxml, odf) document.
+ */
+ private Object openContainer;
/**
*
@@ -211,6 +218,26 @@ public class TikaInputStream extends Pro
return n;
}
+
+    /**
+     * Returns whatever container object is currently attached to
+     * this stream, for example the POIFS FileSystem that an OLE2
+     * detector built while inspecting the document.
+     *
+     * @return the attached container, or <code>null</code> when
+     *         nothing has been stored
+     */
+    public Object getOpenContainer() {
+        return this.openContainer;
+    }
+
+    /**
+     * Attaches an already opened container to this stream, eg
+     * once a Zip contents detector has loaded the file in order
+     * to decide what it holds.
+     *
+     * @param container the opened container to remember
+     */
+    public void setOpenContainer(Object container) {
+        this.openContainer = container;
+    }
public File getFile() throws IOException {
if (file == null) {
@@ -298,6 +325,11 @@ public class TikaInputStream extends Pro
super.mark(readlimit);
mark = position;
}
+
+ @Override
+ public boolean markSupported() {
+ return true; // unconditionally advertises mark/reset support for this stream
+ }
@Override
public void reset() throws IOException {
@@ -309,6 +341,9 @@ public class TikaInputStream extends Pro
@Override
public void close() throws IOException {
+ if (openContainer != null) {
+ openContainer = null;
+ }
if (in != null) {
in.close();
in = null;
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java?rev=980058&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
Wed Jul 28 13:55:20 2010
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.poifs.storage.HeaderBlockConstants;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypes;
+
+
+/**
+ * A detector that knows about the container formats that we support
+ * (eg POIFS, Zip), and is able to peek inside them to better figure
+ * out the contents.
+ * Delegates to another {@link Detector} (normally {@link MimeTypes})
+ * to handle detection for non container formats.
+ * Should normally be used with a {@link TikaInputStream} to minimise
+ * the memory usage.
+ */
+public class ContainerAwareDetector implements Detector {
+    /** Number of leading bytes inspected to recognise a container. */
+    private static final int SIGNATURE_LENGTH = 8;
+
+    private final Detector fallbackDetector;
+    private final ZipContainerDetector zipDetector;
+    private final POIFSContainerDetector poifsDetector;
+
+    /**
+     * Creates a new container detector, which will use the
+     * given detector for non container formats.
+     * @param fallbackDetector The detector to use for non-containers
+     */
+    public ContainerAwareDetector(Detector fallbackDetector) {
+        this.fallbackDetector = fallbackDetector;
+        this.poifsDetector = new POIFSContainerDetector();
+        this.zipDetector = new ZipContainerDetector();
+    }
+
+    /**
+     * Detects the type of the given stream, wrapping it in a
+     * {@link TikaInputStream} before looking for a container.
+     *
+     * @param input the document stream, may be <code>null</code>
+     * @param metadata the input metadata for the document
+     * @return the detected media type
+     * @throws IOException if the stream could not be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            // No content to peek at, so only the fallback
+            // detector can do anything useful
+            return fallbackDetector.detect(input, metadata);
+        }
+        return detect(TikaInputStream.get(input), metadata);
+    }
+
+    /**
+     * Detects the type of the given stream. The first 8 bytes are
+     * read (and the stream reset) to decide whether this is a Zip
+     * or an OLE2 container; the matching container detector is then
+     * used, otherwise the fallback detector is asked.
+     *
+     * @param input the document stream, may be <code>null</code>
+     * @param metadata the input metadata for the document
+     * @return the detected media type
+     * @throws IOException if the stream could not be read
+     */
+    public MediaType detect(TikaInputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return fallbackDetector.detect(input, metadata);
+        }
+
+        // Grab the first 8 bytes, used to do container detection.
+        // A short read leaves the tail of the buffer zeroed, which
+        // can match neither of the signatures checked below.
+        input.mark(SIGNATURE_LENGTH);
+        byte[] first8 = new byte[SIGNATURE_LENGTH];
+        IOUtils.readFully(input, first8);
+        input.reset();
+
+        // Is this a zip file?
+        if (startsWith(first8, POIFSConstants.OOXML_FILE_HEADER)) {
+            return zipDetector.detect(input, metadata);
+        }
+
+        // Is this an ole2 file?
+        long ole2Signature = LittleEndian.getLong(first8, 0);
+        if (ole2Signature == HeaderBlockConstants._signature) {
+            return poifsDetector.detect(input, metadata);
+        }
+
+        // Not a supported container, ask our fall back
+        // detector to figure it out
+        return fallbackDetector.detect(input, metadata);
+    }
+
+    /**
+     * Checks whether the given data begins with the given prefix.
+     */
+    private static boolean startsWith(byte[] data, byte[] prefix) {
+        if (data.length < prefix.length) {
+            return false;
+        }
+        for (int i = 0; i < prefix.length; i++) {
+            if (data[i] != prefix[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java?rev=980058&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
Wed Jul 28 13:55:20 2010
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+
+/**
+ * Detector for OLE2 documents, which loads the stream as a
+ * POIFS filesystem and asks that what kind of file it is.
+ */
+public class POIFSContainerDetector implements Detector {
+    /**
+     * Builds a {@link POIFSFileSystem} from the stream and reports
+     * the type detected from it. When the stream is a
+     * {@link TikaInputStream}, the filesystem is stored on it as
+     * the open container.
+     * NOTE(review): the stream is handed straight to POIFSFileSystem
+     * and no mark/reset is done here - confirm callers expect that.
+     *
+     * @param input the OLE2 document stream
+     * @param metadata the input metadata (not consulted here)
+     * @return the media type of the detected document type
+     * @throws IOException if the filesystem could not be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        POIFSFileSystem filesystem = new POIFSFileSystem(input);
+        POIFSDocumentType documentType =
+            POIFSDocumentType.detectType(filesystem);
+
+        // Only a TikaInputStream has somewhere to keep the
+        // filesystem; for anything else it is simply dropped
+        if (input instanceof TikaInputStream) {
+            TikaInputStream tikaStream = (TikaInputStream) input;
+            tikaStream.setOpenContainer(filesystem);
+        }
+
+        return documentType.getType();
+    }
+}
+
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=980058&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Wed Jul 28 13:55:20 2010
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+
+/**
+ * Detector for Zip based documents, which walks the entries
+ * of the archive to work out what the file really is.
+ */
+public class ZipContainerDetector implements Detector {
+    /**
+     * Scans the zip entries in order. On reaching an entry named
+     * <code>mimetype</code> (as used by Open Document files), its
+     * contents are read as UTF-8 and split at the first '/' into
+     * type and subtype. If there is no '/', or no such entry is
+     * found at all, plain zip is reported.
+     *
+     * @param input the zip document stream
+     * @param metadata the input metadata (not consulted here)
+     * @return the detected type, or {@link MediaType#APPLICATION_ZIP}
+     * @throws IOException if the zip stream could not be read
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        ZipInputStream zip = new ZipInputStream(input);
+
+        for (ZipEntry entry = zip.getNextEntry();
+                entry != null;
+                entry = zip.getNextEntry()) {
+            String name = entry.getName();
+
+            if (!name.equals("mimetype")) {
+                // TODO Office Open XML - recognise the
+                // "[Content_Types].xml" entry here
+                continue;
+            }
+
+            // Open Document files declare their own type
+            String declared = IOUtils.toString(zip, "UTF-8");
+            int slash = declared.indexOf('/');
+            if (slash < 0) {
+                return MediaType.APPLICATION_ZIP;
+            }
+            String major = declared.substring(0, slash);
+            String minor = declared.substring(slash + 1);
+            return new MediaType(major, minor);
+        }
+
+        return MediaType.APPLICATION_ZIP;
+    }
+}
+
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=980058&r1=980057&r2=980058&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Wed Jul 28 13:55:20 2010
@@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.D
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -151,7 +152,13 @@ public class OfficeParser implements Par
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+ POIFSFileSystem filesystem;
+ if(stream instanceof TikaInputStream &&
+ ((TikaInputStream)stream).getOpenContainer() != null) {
+ filesystem =
(POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
+ } else {
+ filesystem = new POIFSFileSystem(stream);
+ }
// Parse summary entries first, to make metadata available early
new SummaryExtractor(metadata).parseSummaries(filesystem);
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=980058&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Wed Jul 28 13:55:20 2010
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Junit test class for {@link ContainerAwareDetector}
+ */
+public class TestContainerAwareDetector extends TestCase {
+    private TikaConfig tc;
+    private ContainerAwareDetector d;
+
+    public void setUp() throws Exception {
+        tc = TikaConfig.getDefaultConfig();
+        d = new ContainerAwareDetector(tc.getMimeRepository());
+    }
+
+    /**
+     * Opens the named file from the test documents directory,
+     * failing the test if it is missing.
+     */
+    private InputStream getTestDoc(String filename) {
+        InputStream input =
+            TestContainerAwareDetector.class.getResourceAsStream(
+                "/test-documents/" + filename);
+        assertNotNull(input);
+        return input;
+    }
+
+    /**
+     * Runs detection on the named test document as a plain
+     * InputStream, checks the result, and always closes the
+     * stream afterwards so the tests do not leak resources.
+     */
+    private void assertDetected(String filename, MediaType expected)
+            throws Exception {
+        InputStream input = getTestDoc(filename);
+        try {
+            assertEquals(expected, d.detect(input, new Metadata()));
+        } finally {
+            input.close();
+        }
+    }
+
+    public void testDetectOLE2() throws Exception {
+        assertDetected(
+                "testEXCEL.xls",
+                MediaType.application("vnd.ms-excel"));
+        assertDetected(
+                "testWORD.doc",
+                MediaType.application("msword"));
+        assertDetected(
+                "testPPT.ppt",
+                MediaType.application("vnd.ms-powerpoint"));
+
+        TikaInputStream tis = TikaInputStream.get(getTestDoc("testPPT.ppt"));
+        try {
+            assertEquals(
+                    MediaType.application("vnd.ms-powerpoint"),
+                    d.detect(tis, new Metadata())
+            );
+            // Must be checked before closing the stream
+            assertNotNull(tis.getOpenContainer());
+        } finally {
+            tis.close();
+        }
+    }
+
+    public void testDetectODF() throws Exception {
+        assertDetected(
+                "testODFwithOOo3.odt",
+                MediaType.application("vnd.oasis.opendocument.text"));
+        assertDetected(
+                "testOpenOffice2.odf",
+                MediaType.application("vnd.oasis.opendocument.formula"));
+
+        TikaInputStream tis =
+            TikaInputStream.get(getTestDoc("testOpenOffice2.odf"));
+        try {
+            assertEquals(
+                    MediaType.application("vnd.oasis.opendocument.formula"),
+                    d.detect(tis, new Metadata())
+            );
+            // Doesn't store the zip parser yet
+            assertNull(tis.getOpenContainer());
+        } finally {
+            tis.close();
+        }
+    }
+
+    public void testDetectOOXML() throws Exception {
+        // TODO not implemented yet
+    }
+
+    public void testDetectZip() throws Exception {
+        // TODO not implemented yet
+    }
+}