Author: jukka
Date: Tue Jan 27 23:21:11 2009
New Revision: 738297
URL: http://svn.apache.org/viewvc?rev=738297&view=rev
Log:
TIKA-95: Pluggable magic header detectors
Start using the Detector interface in AutoDetectParser by making MimeTypes
implement it.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=738297&r1=738296&r2=738297&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Tue Jan 27 23:21:11 2009
@@ -27,6 +27,9 @@
import java.util.SortedSet;
import java.util.TreeSet;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+
/**
* This class is a MimeType repository. It gathers a set of MimeTypes and
* enables to retrieves a content-type from its name, from a file name, or from
@@ -41,7 +44,7 @@
* (if available) to restore the stream back to the state it was before type
* detection if it wants to process the stream based on the detected type.
*/
-public final class MimeTypes {
+public final class MimeTypes implements Detector {
/**
* Name of the {@link #root root} type, application/octet-stream.
@@ -425,4 +428,55 @@
}
}
+ /**
+ * Automatically detects the MIME type of a document based on magic
+ * markers in the stream prefix and any given metadata hints.
+ * <p>
+ * Detection starts from the type matched against the magic header read
+ * from the stream. The {@link Metadata#RESOURCE_NAME_KEY} hint and then
+ * the {@link Metadata#CONTENT_TYPE} hint are consulted when present;
+ * a hinted type replaces the current result only if
+ * <code>isDescendantOf</code> reports it as a descendant of that
+ * result. A content type hint for which <code>forName</code> throws a
+ * <code>MimeTypeException</code> is ignored.
+ * <p>
+ * The given stream is expected to support marks, so that this method
+ * can reset the stream to the position it was in before this method
+ * was called.
+ *
+ * @param input document stream
+ * @param metadata metadata hints
+ * @return detected media type of the document, parsed from the name of
+ *         the selected MIME type
+ * @throws IOException if the document stream could not be read
+ */
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ MimeType type;
+
+ // Get type based on magic prefix; mark first so the finally block can rewind the stream
+ input.mark(getMinLength());
+ try {
+ byte[] prefix = readMagicHeader(input);
+ type = getMimeType(prefix);
+ } finally {
+ input.reset();
+ }
+
+ // Get type based on resourceName hint (if available); only a descendant of the current type wins
+ String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (resourceName != null) {
+ MimeType hint = getMimeType(resourceName);
+ if (hint.isDescendantOf(type)) {
+ type = hint;
+ }
+ }
+
+ // Get type based on metadata hint (if available); compared against the result of the steps above
+ String typeName = metadata.get(Metadata.CONTENT_TYPE);
+ if (typeName != null) {
+ try {
+ MimeType hint = forName(typeName);
+ if (hint.isDescendantOf(type)) {
+ type = hint;
+ }
+ } catch (MimeTypeException e) {
+ // Malformed type name, ignore
+ }
+ }
+
+ return MediaType.parse(type.getName());
+ }
+
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=738297&r1=738296&r2=738297&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Tue Jan 27 23:21:11 2009
@@ -17,15 +17,13 @@
package org.apache.tika.parser;
import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -73,88 +71,11 @@
}
// Automatically detect the MIME type of the document
- MimeType type = getMimeType(stream, metadata);
- metadata.set(Metadata.CONTENT_TYPE, type.getName());
+ MediaType type = types.detect(stream, metadata);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
// Parse the document
super.parse(stream, handler, metadata);
}
- /**
- * Automatically detects the MIME type of a document based on magic
- * markers in the stream prefix and any given metadata hints.
- * <p>
- * The given stream is expected to support marks, so that this method
- * can reset the stream to the position it was in before this method
- * was called.
- *
- * @param stream document stream
- * @param metadata metadata hints
- * @return MIME type of the document
- * @throws IOException if the document stream could not be read
- */
- private MimeType getMimeType(InputStream stream, Metadata metadata)
- throws IOException {
- MimeType type;
-
- // Get type based on magic prefix
- stream.mark(types.getMinLength());
- try {
- byte[] prefix = getPrefix(stream, types.getMinLength());
- type = types.getMimeType(prefix);
- } finally {
- stream.reset();
- }
-
- // Get type based on resourceName hint (if available)
- String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (resourceName != null) {
- MimeType hint = types.getMimeType(resourceName);
- if (hint.isDescendantOf(type)) {
- type = hint;
- }
- }
-
- // Get type based on metadata hint (if available)
- String typeName = metadata.get(Metadata.CONTENT_TYPE);
- if (typeName != null) {
- try {
- MimeType hint = types.forName(typeName);
- if (hint.isDescendantOf(type)) {
- type = hint;
- }
- } catch (MimeTypeException e) {
- // Malformed type name, ignore
- }
- }
-
- return type;
- }
-
- /**
- * Reads and returns the first <code>length</code> bytes from the
- * given stream. If the stream ends before that, returns all bytes
- * from the stream.
- *
- * @param input input stream
- * @param length number of bytes to read and return
- * @return stream prefix
- * @throws IOException if the stream could not be read
- */
- private byte[] getPrefix(InputStream input, int length) throws IOException {
- ByteArrayOutputStream output = new ByteArrayOutputStream();
- byte[] buffer = new byte[Math.min(1024, length)];
- int n = input.read(buffer);
- while (n != -1) {
- output.write(buffer, 0, n);
- int remaining = length - output.size();
- if (remaining > 0) {
- n = input.read(buffer, 0, Math.min(buffer.length, remaining));
- } else {
- n = -1;
- }
- }
- return output.toByteArray();
- }
-
}