Author: jukka
Date: Fri Jan 16 17:10:20 2009
New Revision: 735193
URL: http://svn.apache.org/viewvc?rev=735193&view=rev
Log:
TIKA-154: Better detection of plain text versus binary formats with a text
header
Implemented the plain text auto-detection mechanism from section 4 of the
"Content-Type Processing Model" Internet-Draft.
The type detection methods in MimeTypes now return application/octet-stream
instead of null when they don't find a better match for a given document.
Updated the MIME type configuration of the text-based RTF and XML types.
Improved the auto-detection algorithm in AutoDetectParser by allowing type
hints like glob patterns to further specify a generic type detected by magic
bytes.
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
Modified: lucene/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Jan 16 17:10:20 2009
@@ -6,6 +6,10 @@
The most notable changes in Tika 0.3 over the previous release are:
+ * Automatic detection of document types in Tika has been improved.
+ For example Tika can now detect plain text just by looking at the first
+ few bytes of the document. (TIKA-154)
+
* Tika now disables the loading of all external entities in XML files
that it parses as input documents. This improves security and avoids
problems with potentially broken references. (TIKA-185)
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Jan
16 17:10:20 2009
@@ -43,11 +43,59 @@
*/
public final class MimeTypes {
- /** The default <code>application/octet-stream</code> MimeType */
- public final static String DEFAULT = "application/octet-stream";
+ /**
+ * Name of the {@link #root root} type, application/octet-stream.
+ */
+ public final static String OCTET_STREAM = "application/octet-stream";
+ /**
+ * Name of the {@link #text text} type, text/plain.
+ */
+ public final static String PLAIN_TEXT = "text/plain";
+
+ /**
+ * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
+ * in the range below 0x20 (the space character). If an entry in this
+ * table is <code>true</code> then that byte is very unlikely to occur
+ * in a plain text document.
+ * <p>
+ * The contents of this lookup table are based on the following definition
+ * from section 4 of the "Content-Type Processing Model" Internet-draft
+ * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+ * >draft-abarth-mime-sniff-01</a>).
+ * <pre>
+ * +-------------------------+
+ * | Binary data byte ranges |
+ * +-------------------------+
+ * | 0x00 -- 0x08 |
+ * | 0x0B |
+ * | 0x0E -- 0x1A |
+ * | 0x1C -- 0x1F |
+ * +-------------------------+
+ * </pre>
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+ */
+ private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+ static {
+ Arrays.fill(IS_CONTROL_BYTE, true);
+ IS_CONTROL_BYTE[0x09] = false; // tabulator
+ IS_CONTROL_BYTE[0x0A] = false; // new line
+ IS_CONTROL_BYTE[0x0C] = false; // new page
+ IS_CONTROL_BYTE[0x0D] = false; // carriage return
+ IS_CONTROL_BYTE[0x1B] = false; // escape
+ }
+
+ /**
+ * Root type, application/octet-stream.
+ */
private final MimeType root;
+ /**
+ * Text type, text/plain.
+ */
+ private final MimeType text;
+
/** All the registered MimeTypes indexed on their name */
private final Map<String, MimeType> types = new HashMap<String,
MimeType>();
@@ -61,8 +109,16 @@
private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
public MimeTypes() {
- root = new MimeType(this, DEFAULT);
+ root = new MimeType(this, OCTET_STREAM);
+ text = new MimeType(this, PLAIN_TEXT);
+ try {
+ text.setSuperType(root);
+ } catch (MimeTypeException e) {
+ throw new IllegalStateException("Error in MimeType logic", e);
+ }
+
types.put(root.getName(), root);
+ types.put(text.getName(), text);
}
/**
@@ -91,9 +147,9 @@
/**
* Find the Mime Content Type of a document from its name.
+ * Returns application/octet-stream if no better match is found.
*
- * @param name
- * of the document to analyze.
+ * @param name of the document to analyze.
* @return the Mime Content Type of the specified document name
*/
public MimeType getMimeType(String name) {
@@ -111,13 +167,14 @@
/**
* Returns the MIME type that best matches the given first few bytes
- * of a document stream.
+ * of a document stream. Returns application/octet-stream if no better
+ * match is found.
* <p>
* The given byte array is expected to be at least {@link #getMinLength()}
* long, or shorter only if the document stream itself is shorter.
*
* @param data first few bytes of a document stream
- * @return matching MIME type, or <code>null</code> if no match is found
+ * @return matching MIME type
*/
public MimeType getMimeType(byte[] data) {
if (data == null) {
@@ -138,7 +195,14 @@
}
}
- return null;
+ // Finally, assume plain text if no control bytes are found
+ for (int i = 0; i < data.length; i++) {
+ int b = data[i] & 0xFF; // prevent sign extension
+ if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
+ return root;
+ }
+ }
+ return text;
}
/**
@@ -213,9 +277,9 @@
* from the header, guesses the MIME type from the URL extension
* (e.g. "pdf).
*
- * @param url
- * @return
- * @throws IOException
+ * @param url URL of the document
+ * @return type of the document
+ * @throws IOException if the document can not be accessed
*/
public String getType(URL url) throws IOException {
InputStream stream = url.openStream();
@@ -287,7 +351,11 @@
MimeType type = types.get(name);
if (type == null) {
type = new MimeType(this, name);
- type.setSuperType(root);
+ if (name.startsWith("text/")) {
+ type.setSuperType(text);
+ } else {
+ type.setSuperType(root);
+ }
types.put(name, type);
}
return type;
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Fri Jan 16 17:10:20 2009
@@ -95,14 +95,13 @@
*/
private MimeType getMimeType(InputStream stream, Metadata metadata)
throws IOException {
+ MimeType type;
+
// Get type based on magic prefix
stream.mark(types.getMinLength());
try {
byte[] prefix = getPrefix(stream, types.getMinLength());
- MimeType type = types.getMimeType(prefix);
- if (type != null) {
- return type;
- }
+ type = types.getMimeType(prefix);
} finally {
stream.reset();
}
@@ -110,29 +109,26 @@
// Get type based on resourceName hint (if available)
String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (resourceName != null) {
- MimeType type = types.getMimeType(resourceName);
- if (type != null) {
- return type;
+ MimeType hint = types.getMimeType(resourceName);
+ if (hint.isDescendantOf(type)) {
+ type = hint;
}
}
// Get type based on metadata hint (if available)
- String typename = metadata.get(Metadata.CONTENT_TYPE);
- if (typename != null) {
+ String typeName = metadata.get(Metadata.CONTENT_TYPE);
+ if (typeName != null) {
try {
- return types.forName(typename);
+ MimeType hint = types.forName(typeName);
+ if (hint.isDescendantOf(type)) {
+ type = hint;
+ }
} catch (MimeTypeException e) {
// Malformed type name, ignore
}
}
- // Finally, use the default type if no matches found
- try {
- return types.forName(MimeTypes.DEFAULT);
- } catch (MimeTypeException e) {
- // Should never happen
- return null;
- }
+ return type;
}
/**
Modified: lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Fri Jan 16
17:10:20 2009
@@ -26,6 +26,12 @@
<magic priority="50">
<match value="This is TeX," type="string" offset="0" />
<match value="This is METAFONT," type="string" offset="0" />
+ <!-- UTF-16BE BOM -->
+ <match value="0xfeff" type="string" offset="0"/>
+ <!-- UTF-16LE BOM -->
+ <match value="0xfffe" type="string" offset="0"/>
+ <!-- UTF-8 BOM -->
+ <match value="0xefbbbf" type="string" offset="0"/>
</magic>
<glob pattern="*.txt" />
<glob pattern="*.asc" />
@@ -483,8 +489,9 @@
</mime-type>
<mime-type type="application/rtf">
+ <sub-class-of type="text/plain" />
<magic priority="50">
- <match value="{\rtf" type="string" offset="0" />
+ <match value="{\\rtf" type="string" offset="0" />
</magic>
<glob pattern="*.rtf" />
<alias type="text/rtf" />
@@ -499,6 +506,10 @@
<!-- added in by mattmann -->
<mime-type type="application/xml">
+ <sub-class-of type="text/plain" />
+ <magic priority="50">
+ <match value="&lt;?xml" type="string" offset="0" />
+ </magic>
<alias type="text/xml" />
<glob pattern="*.xml" />
</mime-type>
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
(original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java Fri
Jan 16 17:10:20 2009
@@ -130,6 +130,28 @@
} catch (IllegalArgumentException e) {
// expected result
}
+
+ // Plain text detection
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xEF, (byte) 0xFB, (byte) 0xBF });
+ assertText(new byte[] { 'a', 'b', 'c' });
+ assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+ assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+ }
+
+ private void assertText(byte[] prefix) {
+ assertMagic("text/plain", prefix);
+ }
+
+ private void assertNotText(byte[] prefix) {
+ assertMagic("application/octet-stream", prefix);
+ }
+
+ private void assertMagic(String expected, byte[] prefix) {
+ MimeType type = types.getMimeType(prefix);
+ assertNotNull(type);
+ assertEquals(expected, type.getName());
}
/** Test getMimeType(InputStream) */
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Fri
Jan 16 17:10:20 2009
@@ -100,10 +100,9 @@
* Tests MimeTypes.getMimeType(URL), which examines both the byte header
* and, if necessary, the URL's extension.
*/
- public void testMimeDeterminationForTestDocuments() {
-
- assertEquals("text/html", getMimeType("testHTML.html"));
- assertEquals("application/zip", getMimeType("test-documents.zip"));
+ public void testMimeDeterminationForTestDocuments() throws Exception {
+ assertType("text/html", "testHTML.html");
+ assertType("application/zip", "test-documents.zip");
// TODO: Currently returns generic MS Office type based on
// the magic header. The getMimeType method should understand
// MS Office types better.
@@ -112,33 +111,23 @@
// assertEquals("application/vnd.ms-powerpoint",
// getMimeType("testPPT.ppt"));
// assertEquals("application/msword", getMimeType("testWORD.doc"));
- assertEquals("text/html", getMimeType("testHTML_utf8.html"));
- assertEquals("application/vnd.oasis.opendocument.text",
- getMimeType("testOpenOffice2.odt"));
- assertEquals("application/pdf", getMimeType("testPDF.pdf"));
- assertEquals("application/rtf", getMimeType("testRTF.rtf"));
- assertEquals("text/plain", getMimeType("testTXT.txt"));
- assertEquals("application/xml", getMimeType("testXML.xml"));
- assertEquals("audio/basic", getMimeType("testAU.au"));
- assertEquals("audio/x-aiff", getMimeType("testAIFF.aif"));
- assertEquals("audio/x-wav", getMimeType("testWAV.wav"));
- assertEquals("audio/midi", getMimeType("testMID.mid"));
+ assertType("text/html", "testHTML_utf8.html");
+ assertType(
+ "application/vnd.oasis.opendocument.text",
+ "testOpenOffice2.odt");
+ assertType("application/pdf", "testPDF.pdf");
+ assertType("application/rtf", "testRTF.rtf");
+ assertType("text/plain", "testTXT.txt");
+ assertType("application/xml", "testXML.xml");
+ assertType("audio/basic", "testAU.au");
+ assertType("audio/x-aiff", "testAIFF.aif");
+ assertType("audio/x-wav", "testWAV.wav");
+ assertType("audio/midi", "testMID.mid");
}
- private String getMimeType(String filename) {
-
- String type = null;
-
- try {
- URL url = getClass().getResource("/test-documents/" + filename);
- type = repo.getType(url);
- } catch (MalformedURLException e) {
- fail(e.getMessage());
- } catch (IOException e) {
- fail(e.getMessage());
- }
-
- return type;
+ private void assertType(String expected, String filename) throws Exception
{
+ URL url = getClass().getResource("/test-documents/" + filename);
+ assertEquals(expected, repo.getType(url));
}
}
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=735193&r1=735192&r2=735193&view=diff
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
(original)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
Fri Jan 16 17:10:20 2009
@@ -50,19 +50,19 @@
String data = "<p>test <span>content</span></p>";
InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
Reader reader = new ParsingReader(stream, "test.xml");
- assertEquals('t', reader.read());
- assertEquals('e', reader.read());
- assertEquals('s', reader.read());
- assertEquals('t', reader.read());
- assertEquals(' ', reader.read());
- assertEquals('c', reader.read());
- assertEquals('o', reader.read());
- assertEquals('n', reader.read());
- assertEquals('t', reader.read());
- assertEquals('e', reader.read());
- assertEquals('n', reader.read());
- assertEquals('t', reader.read());
- assertEquals('\n', reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('e', (char) reader.read());
+ assertEquals('s', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals(' ', (char) reader.read());
+ assertEquals('c', (char) reader.read());
+ assertEquals('o', (char) reader.read());
+ assertEquals('n', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('e', (char) reader.read());
+ assertEquals('n', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('\n', (char) reader.read());
assertEquals(-1, reader.read());
reader.close();
assertEquals(-1, stream.read());