Author: nick Date: Mon Nov 21 10:30:22 2011 New Revision: 1204435 URL: http://svn.apache.org/viewvc?rev=1204435&view=rev Log: Expand container detection tests, and add disabled (failing) tests for TIKA-786
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1204435&r1=1204434&r2=1204435&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Nov 21 10:30:22 2011 @@ -35,31 +35,52 @@ public class TestContainerAwareDetector private final Detector detector = new DefaultDetector(); - private void assertDetect(String file, String type) throws Exception { - TikaInputStream stream = TikaInputStream.get( - TestContainerAwareDetector.class.getResource( - "/test-documents/" + file)); - try { - assertEquals( - MediaType.parse(type), - detector.detect(stream, new Metadata())); - } finally { - stream.close(); - } + private void assertTypeByData(String file, String type) throws Exception { + assertTypeByNameAndData(file, null, type); + } + private void assertTypeByNameAndData(String file, String type) throws Exception { + assertTypeByNameAndData(file, file, type); + } + private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception { + TikaInputStream stream = TikaInputStream.get( + TestContainerAwareDetector.class.getResource( + "/test-documents/" + dataFile)); + try { + Metadata m = new Metadata(); + if (name != null) + m.add(Metadata.RESOURCE_NAME_KEY, name); + + assertEquals( + MediaType.parse(type), + detector.detect(stream, m)); + } finally { + stream.close(); + } } public void testDetectOLE2() throws Exception { // Microsoft office types known by POI - assertDetect("testEXCEL.xls", "application/vnd.ms-excel"); - 
assertDetect("testWORD.doc", "application/msword"); - assertDetect("testPPT.ppt", "application/vnd.ms-powerpoint"); + assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel"); + assertTypeByData("testWORD.doc", "application/msword"); + assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint"); // Try some ones that POI doesn't handle, that are still OLE2 based - assertDetect("testWORKS.wps", "application/vnd.ms-works"); - assertDetect("testWORKS2000.wps", "application/vnd.ms-works"); - assertDetect("testCOREL.shw", "application/x-corelpresentations"); - assertDetect("testQUATTRO.qpw", "application/x-quattro-pro"); - assertDetect("testQUATTRO.wb3", "application/x-quattro-pro"); + assertTypeByData("testWORKS.wps", "application/vnd.ms-works"); + assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works"); + assertTypeByData("testCOREL.shw", "application/x-corelpresentations"); + assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro"); + assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro"); + + // With the filename and data + assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel"); + assertTypeByNameAndData("testWORD.doc", "application/msword"); + assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint"); + + // With the wrong filename supplied, data will trump filename + // TODO Fix this! 
(TIKA-786) +// assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"); +// assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword"); +// assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"); } public void testOpenContainer() throws Exception { @@ -78,23 +99,37 @@ public class TestContainerAwareDetector } public void testDetectODF() throws Exception { - assertDetect("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"); - assertDetect("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula"); + assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"); + assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula"); } public void testDetectOOXML() throws Exception { - assertDetect("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - assertDetect("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - assertDetect("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); // Check some of the less common OOXML types - assertDetect("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12"); - assertDetect("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow"); - assertDetect("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"); + assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12"); + assertTypeByData("testPPT.ppsx", 
"application/vnd.openxmlformats-officedocument.presentationml.slideshow"); + assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"); // .xlsb is an OOXML file containing the binary parts, and not // an OLE2 file as you might initially expect! - assertDetect("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"); + assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"); + + // With the filename and data + assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + + // With the wrong filename supplied, data will trump filename + // TODO Fix this! (TIKA-786) +// assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); +// assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); +// assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + + // With an incorrect filename of a different container type, data trumps filename + assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); } /** @@ -131,15 +166,15 @@ public class TestContainerAwareDetector } public void testDetectIWork() throws Exception { - assertDetect("testKeynote.key", "application/vnd.apple.keynote"); - assertDetect("testNumbers.numbers", "application/vnd.apple.numbers"); - assertDetect("testPages.pages", "application/vnd.apple.pages"); + assertTypeByData("testKeynote.key", "application/vnd.apple.keynote"); + 
assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers"); + assertTypeByData("testPages.pages", "application/vnd.apple.pages"); } public void testDetectZip() throws Exception { - assertDetect("test-documents.zip", "application/zip"); - assertDetect("test-zip-of-zip.zip", "application/zip"); - assertDetect("testJAR.jar", "application/java-archive"); + assertTypeByData("test-documents.zip", "application/zip"); + assertTypeByData("test-zip-of-zip.zip", "application/zip"); + assertTypeByData("testJAR.jar", "application/java-archive"); } private TikaInputStream getTruncatedFile(String name, int n)