Author: kbennett
Date: Thu Oct 18 08:11:27 2007
New Revision: 586001

URL: http://svn.apache.org/viewvc?rev=586001&view=rev
Log:
TIKA-75: Provides a MimeUtils.getType(URL) method that will determine MIME type 
based on the stream and, if necessary, the name.

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java?rev=586001&r1=586000&r2=586001&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java Thu 
Oct 18 08:11:27 2007
@@ -18,6 +18,8 @@
 
 // JDK imports
 import java.io.InputStream;
+import java.io.IOException;
+import java.net.URL;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import javax.xml.parsers.DocumentBuilder;
@@ -80,6 +82,26 @@
         return typeName;
     }
 
+
+    /**
+     * Determines the MIME type of the resource pointed to by the specified 
URL.
+     * Examines the file's header, and if it cannot determine the MIME type
+     * from the header, guesses the MIME type from the URL extension
+     * (e.g. "pdf").
+     *
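+     * @param url the URL of the resource whose MIME type is to be determined
+     * @return the MIME type determined for the resource
+     * @throws IOException if the resource cannot be opened or read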
+     */
+    public String getType(URL url) throws IOException {
+        InputStream stream = url.openStream();
+        try {
+            return getType(null, url.toString(), getHeader(stream));
+        } finally {
+            stream.close();
+        }
+    }
+
     private final MimeTypes load(String tikaMimeFile) {
         LOG.info("Loading [" + tikaMimeFile + "]");
         Document document = getDocumentRoot(MimeUtils.class.getClassLoader()
@@ -111,4 +133,23 @@
         return document;
     }
 
+
+    /**
+     * Read the resource's header for use in determination of the MIME type.
+     */
+    private byte[] getHeader(InputStream stream) throws IOException {
+        byte[] bytes = new byte[repository.getMinLength()];
+        int totalRead = 0;
+        int lastRead = stream.read(bytes);
+        while (lastRead != -1) {
+            totalRead += lastRead;
+            if (totalRead == bytes.length) {
+                return bytes;
+            }
+            lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
+        }
+        byte[] shorter = new byte[totalRead];
+        System.arraycopy(bytes, 0, shorter, 0, totalRead);
+        return shorter;
+    }
 }

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java?rev=586001&r1=586000&r2=586001&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java 
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java 
Thu Oct 18 08:11:27 2007
@@ -19,13 +19,12 @@
 
 //JDK imports
 import java.io.File;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 
-//Tika imports
 import org.apache.tika.metadata.TikaMimeKeys;
 
-//Junit imports
 import junit.framework.TestCase;
 
 /**
@@ -61,6 +60,10 @@
         assertNotNull(utils.getRepository().forName("text/x-tex"));
     }
 
+
+    /**
+     * Tests MIME type determination based solely on the URL's extension.
+     */
     public void testGuessMimeTypes() {
 
         assertEquals("application/pdf", utils.getRepository().getMimeType(
@@ -93,4 +96,40 @@
                 .getMimeType("x.xyz").getName());
     }
 
+
+    /**
+     * Tests MimeUtils.getType(URL), which examines both the byte header
+     * and, if necessary, the URL's extension.
+     */
+    public void testMimeDeterminationForTestDocuments() {
+
+        assertEquals("text/html", getMimeType("testHTML.html"));
+        assertEquals("application/zip", getMimeType("test-documents.zip"));
+        assertEquals("application/vnd.ms-excel", getMimeType("testEXCEL.xls"));
+        assertEquals("text/html", getMimeType("testHTML_utf8.html"));
+        assertEquals("application/vnd.oasis.opendocument.text",
+                getMimeType("testOpenOffice2.odt"));
+        assertEquals("application/pdf", getMimeType("testPDF.pdf"));
+        assertEquals("application/vnd.ms-powerpoint", 
getMimeType("testPPT.ppt"));
+        assertEquals("application/rtf", getMimeType("testRTF.rtf"));
+        assertEquals("text/plain", getMimeType("testTXT.txt"));
+        assertEquals("application/msword", getMimeType("testWORD.doc"));
+        assertEquals("application/xml", getMimeType("testXML.xml"));
+    }
+
+    private String getMimeType(String filename) {
+
+        String type = null;
+
+        try {
+            URL url = getClass().getResource("/test-documents/" + filename);
+            type = utils.getType(url);
+        } catch (MalformedURLException e) {
+            fail(e.getMessage());
+        } catch (IOException e) {
+            fail(e.getMessage());
+        }
+
+        return type;
+    }
 }


Reply via email to