Author: nick
Date: Wed Mar 23 18:11:32 2011
New Revision: 1084658
URL: http://svn.apache.org/viewvc?rev=1084658&view=rev
Log:
Add some more detection tests, which show that for container formats the
addition of the filename lets us specialise from e.g. x-tika-msoffice to msword
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1084658&r1=1084657&r2=1084658&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Wed Mar 23 18:11:32 2011
@@ -132,6 +132,34 @@ public class TestMimeTypes extends TestC
* iffy, as we can't be sure where things will end up.
* People really ought to use the container aware detection...
*/
+ public void testOLE2Detection() throws Exception {
+ // These have the properties block near the start, so our mime
+ // magic will spot them
+ assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
+
+ // This one quite legitimately doesn't have its properties block
+ // as one of the first couple of entries
+ // As such, our mime magic can't figure it out...
+ assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
+ assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
+
+
+ // By name + data:
+
+ // Those we got right to start with are fine
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
+
+ // And the name lets us specialise the generic OLE2
+ // ones to their actual type
+ assertTypeByNameAndData("application/vnd.ms-powerpoint",
"testPPT.ppt");
+ assertTypeByNameAndData("application/msword", "testWORD.doc");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
public void testOoxmlDetection() throws Exception {
// These two do luckily have [Content_Types].xml near the start,
// so our mime magic will spot them
@@ -143,6 +171,13 @@ public class TestMimeTypes extends TestC
// As such, our mime magic can't figure it out...
assertTypeByData("application/zip", "testWORD.docx");
+ // If we give the filename as well as the data, we can
+ // specialise the ooxml generic one to the correct type
+
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"testEXCEL.xlsx");
+
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation",
"testPPT.pptx");
+
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"testWORD.docx");
+
+ // Test a few of the less usual ones
assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12",
"testPPT.pptm");
assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12",
"testPPT.potm");
@@ -364,14 +399,7 @@ public class TestMimeTypes extends TestC
public void testMimeDeterminationForTestDocuments() throws Exception {
assertType("text/html", "testHTML.html");
assertType("application/zip", "test-documents.zip");
- // TODO: Currently returns generic MS Office type based on
- // the magic header. The getMimeType method should understand
- // MS Office types better.
- // assertEquals("application/vnd.ms-excel",
- // getMimeType("testEXCEL.xls"));
- // assertEquals("application/vnd.ms-powerpoint",
- // getMimeType("testPPT.ppt"));
- // assertEquals("application/msword", getMimeType("testWORD.doc"));
+
assertType("text/html", "testHTML_utf8.html");
assertType(
"application/vnd.oasis.opendocument.text",