svn commit: r735193 - in /lucene/tika/trunk: ./ src/main/java/org/apache/tika/mime/ src/main/java/org/apache/tika/parser/ src/main/resources/mime/ src/test/java/org/apache/tika/mime/ src/test/java/org/apache/tika/parser/

jukka Fri, 16 Jan 2009 17:10:55 -0800

Author: jukka
Date: Fri Jan 16 17:10:20 2009
New Revision: 735193

URL: http://svn.apache.org/viewvc?rev=735193&view=rev
Log:
TIKA-154: Better detection of plain text versus binary formats with a text 
header


Implemented the plain text auto-detection mechanism from section 4 of the 
"Content-Type Processing Model" Internet-Draft.

The type detection methods in MimeTypes now return application/octet-stream 
instead of null when they don't find a better match for a given document.

Updated the MIME type configuration of the text-based RTF and XML types.

Improved the auto-detection algorithm in AutoDetectParser by allowing type 
hints like glob patterns to further specify a generic type detected by magic 
bytes.

Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
    lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
    
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Jan 16 17:10:20 2009
@@ -6,6 +6,10 @@
 
 The most notable changes in Tika 0.3 over the previous release are:
 
+  * Automatic detection of document types in Tika has been improved.
+    For example Tika can now detect plain text just by looking at the first
+    few bytes of the document. (TIKA-154)
+
   * Tika now disables the loading of all external entities in XML files
     that it parses as input documents. This improves security and avoids
     problems with potentially broken references. (TIKA-185)

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java 
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Jan 
16 17:10:20 2009
@@ -43,11 +43,59 @@
  */
 public final class MimeTypes {
 
-    /** The default <code>application/octet-stream</code> MimeType */
-    public final static String DEFAULT = "application/octet-stream";
+    /**
+     * Name of the {@link #root root} type, application/octet-stream.
+     */
+    public final static String OCTET_STREAM = "application/octet-stream";
 
+    /**
+     * Name of the {@link #text text} type, text/plain.
+     */
+    public final static String PLAIN_TEXT = "text/plain";
+
+    /**
+     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
+     * in the range below 0x20 (the space character). If an entry in this
+     * table is <code>true</code> then that byte is very unlikely to occur
+     * in a plain text document.
+     * <p>
+     * The contents of this lookup table are based on the following definition
+     * from section 4 of the "Content-Type Processing Model" Internet-draft
+     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+     * >draft-abarth-mime-sniff-01</a>).
+     * <pre>
+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * </pre>
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+     */
+    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+    static {
+        Arrays.fill(IS_CONTROL_BYTE, true);
+        IS_CONTROL_BYTE[0x09] = false; // tabulator
+        IS_CONTROL_BYTE[0x0A] = false; // new line
+        IS_CONTROL_BYTE[0x0C] = false; // new page
+        IS_CONTROL_BYTE[0x0D] = false; // carriage return
+        IS_CONTROL_BYTE[0x1B] = false; // escape
+    }
+
+    /**
+     * Root type, application/octet-stream.
+     */
     private final MimeType root;
 
+    /**
+     * Text type, text/plain.
+     */
+    private final MimeType text;
+
     /** All the registered MimeTypes indexed on their name */
     private final Map<String, MimeType> types = new HashMap<String, 
MimeType>();
 
@@ -61,8 +109,16 @@
     private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
 
     public MimeTypes() {
-        root = new MimeType(this, DEFAULT);
+        root = new MimeType(this, OCTET_STREAM);
+        text = new MimeType(this, PLAIN_TEXT);
+        try {
+            text.setSuperType(root);
+        } catch (MimeTypeException e) {
+            throw new IllegalStateException("Error in MimeType logic", e);
+        }
+
         types.put(root.getName(), root);
+        types.put(text.getName(), text);
     }
 
     /**
@@ -91,9 +147,9 @@
 
     /**
      * Find the Mime Content Type of a document from its name.
+     * Returns application/octet-stream if no better match is found.
      * 
-     * @param name
-     *            of the document to analyze.
+     * @param name of the document to analyze.
      * @return the Mime Content Type of the specified document name
      */
     public MimeType getMimeType(String name) {
@@ -111,13 +167,14 @@
 
     /**
      * Returns the MIME type that best matches the given first few bytes
-     * of a document stream.
+     * of a document stream. Returns application/octet-stream if no better
+     * match is found.
      * <p>
      * The given byte array is expected to be at least {@link #getMinLength()}
      * long, or shorter only if the document stream itself is shorter.
      *
      * @param data first few bytes of a document stream
-     * @return matching MIME type, or <code>null</code> if no match is found
+     * @return matching MIME type
      */
     public MimeType getMimeType(byte[] data) {
         if (data == null) {
@@ -138,7 +195,14 @@
             }
         }
 
-        return null;
+        // Finally, assume plain text if no control bytes are found
+        for (int i = 0; i < data.length; i++) {
+            int b = data[i] & 0xFF; // prevent sign extension
+            if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
+                return root;
+            }
+        }
+        return text;
     }
 
     /**
@@ -213,9 +277,9 @@
      * from the header, guesses the MIME type from the URL extension
      * (e.g. "pdf).
      *
-     * @param url
-     * @return
-     * @throws IOException
+     * @param url URL of the document
+     * @return type of the document
+     * @throws IOException if the document can not be accessed
      */
     public String getType(URL url) throws IOException {
         InputStream stream = url.openStream();
@@ -287,7 +351,11 @@
             MimeType type = types.get(name);
             if (type == null) {
                 type = new MimeType(this, name);
-                type.setSuperType(root);
+                if (name.startsWith("text/")) {
+                    type.setSuperType(text);
+                } else {
+                    type.setSuperType(root);
+                }
                 types.put(name, type);
             }
             return type;

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java 
(original)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java 
Fri Jan 16 17:10:20 2009
@@ -95,14 +95,13 @@
      */
     private MimeType getMimeType(InputStream stream, Metadata metadata)
             throws IOException {
+        MimeType type;
+
         // Get type based on magic prefix
         stream.mark(types.getMinLength());
         try {
             byte[] prefix = getPrefix(stream, types.getMinLength());
-            MimeType type = types.getMimeType(prefix);
-            if (type != null) {
-                return type;
-            }
+            type = types.getMimeType(prefix);
         } finally {
             stream.reset();
         }
@@ -110,29 +109,26 @@
         // Get type based on resourceName hint (if available)
         String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
         if (resourceName != null) {
-            MimeType type = types.getMimeType(resourceName);
-            if (type != null) {
-                return type;
+            MimeType hint = types.getMimeType(resourceName);
+            if (hint.isDescendantOf(type)) {
+                type = hint;
             }
         }
 
         // Get type based on metadata hint (if available)
-        String typename = metadata.get(Metadata.CONTENT_TYPE);
-        if (typename != null) {
+        String typeName = metadata.get(Metadata.CONTENT_TYPE);
+        if (typeName != null) {
             try {
-                return types.forName(typename);
+                MimeType hint = types.forName(typeName);
+                if (hint.isDescendantOf(type)) {
+                    type = hint;
+                }
             } catch (MimeTypeException e) {
                 // Malformed type name, ignore
             }
         }
 
-        // Finally, use the default type if no matches found
-        try {
-            return types.forName(MimeTypes.DEFAULT);
-        } catch (MimeTypeException e) {
-            // Should never happen
-            return null;
-        }
+        return type;
     }
 
     /**

Modified: lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Fri Jan 16 
17:10:20 2009
@@ -26,6 +26,12 @@
     <magic priority="50">
       <match value="This is TeX," type="string" offset="0" />
       <match value="This is METAFONT," type="string" offset="0" />
+      <!-- UTF-16BE BOM -->
+      <match value="0xfeff" type="string" offset="0"/>
+      <!-- UTF-16LE BOM -->
+      <match value="0xfffe" type="string" offset="0"/>
+      <!-- UTF-8 BOM -->
+      <match value="0xefbbbf" type="string" offset="0"/>
     </magic>
     <glob pattern="*.txt" />
     <glob pattern="*.asc" />
@@ -483,8 +489,9 @@
   </mime-type>
 
   <mime-type type="application/rtf">
+    <sub-class-of type="text/plain" />
     <magic priority="50">
-      <match value="{\rtf" type="string" offset="0" />
+      <match value="{\\rtf" type="string" offset="0" />
     </magic>
     <glob pattern="*.rtf" />
     <alias type="text/rtf" />
@@ -499,6 +506,10 @@
 
   <!--  added in by mattmann -->
   <mime-type type="application/xml">
+    <sub-class-of type="text/plain" />
+    <magic priority="50">
+      <match value="&lt;?xml" type="string" offset="0" />
+    </magic>
     <alias type="text/xml" />
     <glob pattern="*.xml" />
   </mime-type>

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java 
(original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java Fri 
Jan 16 17:10:20 2009
@@ -130,6 +130,28 @@
         } catch (IllegalArgumentException e) {
             // expected result
         }
+
+        // Plain text detection
+        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+        assertText(new byte[] { (byte) 0xEF, (byte) 0xFB, (byte) 0xBF });
+        assertText(new byte[] { 'a', 'b', 'c' });
+        assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+        assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+    }
+
+    private void assertText(byte[] prefix) {
+        assertMagic("text/plain", prefix);
+    }
+
+    private void assertNotText(byte[] prefix) {
+        assertMagic("application/octet-stream", prefix);
+    }
+
+    private void assertMagic(String expected, byte[] prefix) {
+        MimeType type = types.getMimeType(prefix);
+        assertNotNull(type);
+        assertEquals(expected, type.getName());
     }
 
     /** Test getMimeType(InputStream) */

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
(original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Fri 
Jan 16 17:10:20 2009
@@ -100,10 +100,9 @@
      * Tests MimeTypes.getMimeType(URL), which examines both the byte header
      * and, if necessary, the URL's extension.
      */
-    public void testMimeDeterminationForTestDocuments() {
-
-        assertEquals("text/html", getMimeType("testHTML.html"));
-        assertEquals("application/zip", getMimeType("test-documents.zip"));
+    public void testMimeDeterminationForTestDocuments() throws Exception {
+        assertType("text/html", "testHTML.html");
+        assertType("application/zip", "test-documents.zip");
         // TODO: Currently returns generic MS Office type based on
         // the magic header. The getMimeType method should understand
         // MS Office types better.
@@ -112,33 +111,23 @@
         // assertEquals("application/vnd.ms-powerpoint",
         // getMimeType("testPPT.ppt"));
         // assertEquals("application/msword", getMimeType("testWORD.doc"));
-        assertEquals("text/html", getMimeType("testHTML_utf8.html"));
-        assertEquals("application/vnd.oasis.opendocument.text",
-                getMimeType("testOpenOffice2.odt"));
-        assertEquals("application/pdf", getMimeType("testPDF.pdf"));
-        assertEquals("application/rtf", getMimeType("testRTF.rtf"));
-        assertEquals("text/plain", getMimeType("testTXT.txt"));
-        assertEquals("application/xml", getMimeType("testXML.xml"));
-        assertEquals("audio/basic", getMimeType("testAU.au"));
-        assertEquals("audio/x-aiff", getMimeType("testAIFF.aif"));
-        assertEquals("audio/x-wav", getMimeType("testWAV.wav"));
-        assertEquals("audio/midi", getMimeType("testMID.mid"));
+        assertType("text/html", "testHTML_utf8.html");
+        assertType(
+                "application/vnd.oasis.opendocument.text",
+                "testOpenOffice2.odt");
+        assertType("application/pdf", "testPDF.pdf");
+        assertType("application/rtf", "testRTF.rtf");
+        assertType("text/plain", "testTXT.txt");
+        assertType("application/xml", "testXML.xml");
+        assertType("audio/basic", "testAU.au");
+        assertType("audio/x-aiff", "testAIFF.aif");
+        assertType("audio/x-wav", "testWAV.wav");
+        assertType("audio/midi", "testMID.mid");
     }
 
-    private String getMimeType(String filename) {
-
-        String type = null;
-
-        try {
-            URL url = getClass().getResource("/test-documents/" + filename);
-            type = repo.getType(url);
-        } catch (MalformedURLException e) {
-            fail(e.getMessage());
-        } catch (IOException e) {
-            fail(e.getMessage());
-        }
-
-        return type;
+    private void assertType(String expected, String filename) throws Exception 
{
+        URL url = getClass().getResource("/test-documents/" + filename);
+        assertEquals(expected, repo.getType(url));
     }
 
 }

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java 
(original)
+++ 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java 
Fri Jan 16 17:10:20 2009
@@ -50,19 +50,19 @@
         String data = "<p>test <span>content</span></p>";
         InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
         Reader reader = new ParsingReader(stream, "test.xml");
-        assertEquals('t', reader.read());
-        assertEquals('e', reader.read());
-        assertEquals('s', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals(' ', reader.read());
-        assertEquals('c', reader.read());
-        assertEquals('o', reader.read());
-        assertEquals('n', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals('e', reader.read());
-        assertEquals('n', reader.read());
-        assertEquals('t', reader.read());
-        assertEquals('\n', reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('e', (char) reader.read());
+        assertEquals('s', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals(' ', (char) reader.read());
+        assertEquals('c', (char) reader.read());
+        assertEquals('o', (char) reader.read());
+        assertEquals('n', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('e', (char) reader.read());
+        assertEquals('n', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('\n', (char) reader.read());
         assertEquals(-1, reader.read());
         reader.close();
         assertEquals(-1, stream.read());

svn commit: r735193 - in /lucene/tika/trunk: ./ src/main/java/org/apache/tika/mime/ src/main/java/org/apache/tika/parser/ src/main/resources/mime/ src/test/java/org/apache/tika/mime/ src/test/java/org/apache/tika/parser/

Reply via email to