Author: kbennett
Date: Thu Oct 18 08:11:27 2007
New Revision: 586001
URL: http://svn.apache.org/viewvc?rev=586001&view=rev
Log:
TIKA-75: Provides a MimeUtils.getType(URL) method that will determine MIME type
based on the stream and, if necessary, the name.
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java?rev=586001&r1=586000&r2=586001&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java Thu
Oct 18 08:11:27 2007
@@ -18,6 +18,8 @@
// JDK imports
import java.io.InputStream;
+import java.io.IOException;
+import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
@@ -80,6 +82,26 @@
return typeName;
}
+
+ /**
+ * Determines the MIME type of the resource pointed to by the specified URL.
+ * Examines the file's header, and if it cannot determine the MIME type
+ * from the header, guesses the MIME type from the URL extension
+ * (e.g. "pdf").
+ *
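+ * @param url the URL of the resource whose MIME type is to be determined
+ * @return the MIME type determined for the resource
+ * @throws IOException if the resource's stream cannot be opened, read, or closed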
+ */
+ public String getType(URL url) throws IOException {
+ InputStream stream = url.openStream();
+ try {
+ return getType(null, url.toString(), getHeader(stream));
+ } finally {
+ stream.close();
+ }
+ }
+
private final MimeTypes load(String tikaMimeFile) {
LOG.info("Loading [" + tikaMimeFile + "]");
Document document = getDocumentRoot(MimeUtils.class.getClassLoader()
@@ -111,4 +133,23 @@
return document;
}
+
+ /**
+ * Read the resource's header for use in determination of the MIME type.
+ */
+ private byte[] getHeader(InputStream stream) throws IOException {
+ byte[] bytes = new byte[repository.getMinLength()];
+ int totalRead = 0;
+ int lastRead = stream.read(bytes);
+ while (lastRead != -1) {
+ totalRead += lastRead;
+ if (totalRead == bytes.length) {
+ return bytes;
+ }
+ lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
+ }
+ byte[] shorter = new byte[totalRead];
+ System.arraycopy(bytes, 0, shorter, 0, totalRead);
+ return shorter;
+ }
}
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java?rev=586001&r1=586000&r2=586001&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java Thu Oct 18 08:11:27 2007
@@ -19,13 +19,12 @@
//JDK imports
import java.io.File;
+import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
-//Tika imports
import org.apache.tika.metadata.TikaMimeKeys;
-//Junit imports
import junit.framework.TestCase;
/**
@@ -61,6 +60,10 @@
assertNotNull(utils.getRepository().forName("text/x-tex"));
}
+
+ /**
+ * Tests MIME type determination based solely on the URL's extension.
+ */
public void testGuessMimeTypes() {
assertEquals("application/pdf", utils.getRepository().getMimeType(
@@ -93,4 +96,40 @@
.getMimeType("x.xyz").getName());
}
+
+ /**
+ * Tests MimeUtils.getType(URL), which examines both the byte header
+ * and, if necessary, the URL's extension.
+ */
+ public void testMimeDeterminationForTestDocuments() {
+
+ assertEquals("text/html", getMimeType("testHTML.html"));
+ assertEquals("application/zip", getMimeType("test-documents.zip"));
+ assertEquals("application/vnd.ms-excel", getMimeType("testEXCEL.xls"));
+ assertEquals("text/html", getMimeType("testHTML_utf8.html"));
+ assertEquals("application/vnd.oasis.opendocument.text",
+ getMimeType("testOpenOffice2.odt"));
+ assertEquals("application/pdf", getMimeType("testPDF.pdf"));
+ assertEquals("application/vnd.ms-powerpoint", getMimeType("testPPT.ppt"));
+ assertEquals("application/rtf", getMimeType("testRTF.rtf"));
+ assertEquals("text/plain", getMimeType("testTXT.txt"));
+ assertEquals("application/msword", getMimeType("testWORD.doc"));
+ assertEquals("application/xml", getMimeType("testXML.xml"));
+ }
+
+ private String getMimeType(String filename) {
+
+ String type = null;
+
+ try {
+ URL url = getClass().getResource("/test-documents/" + filename);
+ type = utils.getType(url);
+ } catch (MalformedURLException e) {
+ fail(e.getMessage());
+ } catch (IOException e) {
+ fail(e.getMessage());
+ }
+
+ return type;
+ }
}