Author: jukka
Date: Wed Aug 4 09:23:20 2010
New Revision: 982175
URL: http://svn.apache.org/viewvc?rev=982175&view=rev
Log:
TIKA-447: Container-aware mimetype detection

Use TikaInputStream in the container-aware Detectors to avoid changing the
visible state of the given stream.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Aug 4 09:23:20 2010
@@ -38,6 +38,19 @@ import org.apache.tika.metadata.Metadata
public class TikaInputStream extends ProxyInputStream {
/**
+ * Checks whether the given stream is a TikaInputStream instance.
+ * The given stream can be <code>null</code>, in which case the return
+ * value is <code>false</code>.
+ *
+ * @param stream input stream, possibly <code>null</code>
+ * @return <code>true</code> if the stream is a TikaInputStream instance,
+ * <code>false</code> otherwise
+ */
+ public static boolean isTikaInputStream(InputStream stream) {
+ return stream instanceof TikaInputStream;
+ }
+
+ /**
* Casts or wraps the given stream to a TikaInputStream instance.
* This method can be used to access the functionality of this class
* even when given just a normal input stream instance.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/POIFSContainerDetector.java Wed Aug 4 09:23:20 2010
@@ -16,6 +16,7 @@
*/
package org.apache.tika.detect;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -23,7 +24,6 @@ import org.apache.poi.poifs.filesystem.P
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
@@ -32,20 +32,21 @@ import org.apache.tika.parser.microsoft.
* to figure out exactly what the file is
*/
public class POIFSContainerDetector implements Detector {
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
- POIFSFileSystem fs = new POIFSFileSystem(input);
+ if (TikaInputStream.isTikaInputStream(input)) {
+ TikaInputStream stream = TikaInputStream.get(input);
+
+ // NOTE: POIFSFileSystem will close the FileInputStream
+ POIFSFileSystem fs =
+ new POIFSFileSystem(new FileInputStream(stream.getFile()));
+ stream.setOpenContainer(fs);
- POIFSDocumentType type =
- OfficeParser.POIFSDocumentType.detectType(fs);
-
- if(input instanceof TikaInputStream) {
- ((TikaInputStream)input).setOpenContainer(fs);
+ return POIFSDocumentType.detectType(fs).getType();
} else {
- fs = null;
+ return MediaType.application("x-tika-msoffice");
}
-
- return type.getType();
}
-}
+}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java?rev=982175&r1=982174&r2=982175&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java Wed Aug 4 09:23:20 2010
@@ -18,8 +18,9 @@ package org.apache.tika.detect;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collections;
import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
+import java.util.zip.ZipFile;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -37,29 +38,32 @@ import org.apache.tika.mime.MediaType;
* to figure out exactly what the file is
*/
public class ZipContainerDetector implements Detector {
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
- if(input instanceof TikaInputStream) {
- return detect((TikaInputStream)input, metadata);
- }
- return detect( TikaInputStream.get(input), metadata );
+ if (TikaInputStream.isTikaInputStream(input)) {
+ return detect(TikaInputStream.get(input));
+ } else {
+ return MediaType.APPLICATION_ZIP;
+ }
}
- public MediaType detect(TikaInputStream input, Metadata metadata)
- throws IOException {
- ZipInputStream zip = new ZipInputStream(input);
- ZipEntry entry = zip.getNextEntry();
- while (entry != null) {
+
+ private MediaType detect(TikaInputStream input) throws IOException {
+ ZipFile zip = new ZipFile(input.getFile());
+ for (ZipEntry entry : Collections.list(zip.entries())) {
// Is it an Open Document file?
if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, "UTF-8");
- return fromString(type);
+ InputStream stream = zip.getInputStream(entry);
+ try {
+ return fromString(IOUtils.toString(stream, "UTF-8"));
+ } finally {
+ stream.close();
+ }
} else if (entry.getName().equals("_rels/.rels") ||
entry.getName().equals("[Content_Types].xml")) {
// Office Open XML File
// As POI to open and investigate it for us
try {
- input.reset();
-
OPCPackage pkg = OPCPackage.open(input);
input.setOpenContainer(pkg);
@@ -85,8 +89,6 @@ public class ZipContainerDetector implem
// Java Jar
return MediaType.application("java-archive");
}
-
- entry = zip.getNextEntry();
}
return MediaType.APPLICATION_ZIP;