http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java new file mode 100644 index 0000000..b852de0 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -0,0 +1,1044 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.mime; + +// Junit imports + +import static java.nio.charset.StandardCharsets.UTF_16BE; +import static java.nio.charset.StandardCharsets.UTF_16LE; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNotSame; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import org.apache.tika.Tika; +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.junit.Before; +import org.junit.Test; + +/** + * + * Test Suite for the {@link MimeTypes} repository. + * + */ +public class TestMimeTypes extends TikaTest { + + private Tika tika; + + private MimeTypes repo; + + private URL u; + + private static final File f = new File("/a/b/c/x.pdf"); + + @Before + public void setUp() throws Exception{ + TikaConfig config = TikaConfig.getDefaultConfig(); + repo = config.getMimeRepository(); + tika = new Tika(config); + u = new URL("http://mydomain.com/x.pdf?x=y"); + } + + @Test + public void testCaseSensitivity() { + String type = tika.detect("test.PDF"); + assertNotNull(type); + assertEquals(type, tika.detect("test.pdf")); + assertEquals(type, tika.detect("test.PdF")); + assertEquals(type, tika.detect("test.pdF")); + } + + @Test + public void testLoadMimeTypes() throws MimeTypeException { + assertNotNull(repo.forName("application/octet-stream")); + assertNotNull(repo.forName("text/x-tex")); + } + + /** + * Tests MIME type determination based solely on the URL's extension. 
+ */ + @Test + public void testGuessMimeTypes() throws Exception { + assertTypeByName("application/pdf", "x.pdf"); + assertEquals("application/pdf", tika.detect(u.toExternalForm())); + assertEquals("application/pdf", tika.detect(f.getPath())); + assertTypeByName("text/plain", "x.txt"); + assertTypeByName("text/html", "x.htm"); + assertTypeByName("text/html", "x.html"); + assertTypeByName("application/xhtml+xml", "x.xhtml"); + assertTypeByName("application/xml", "x.xml"); + assertTypeByName("application/zip", "x.zip"); + assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt"); + assertTypeByName("application/octet-stream", "x.unknown"); + + // Test for the MS Office media types and file extensions listed in + // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx + assertTypeByName("application/msword", "x.doc"); + assertTypeByName("application/msword", "x.dot"); + assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx"); + assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm"); + assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm"); + assertTypeByName("application/vnd.ms-excel", "x.xls"); + assertTypeByName("application/vnd.ms-excel", "x.xlt"); + assertTypeByName("application/vnd.ms-excel", "x.xla"); + assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx"); + assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm"); + assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm"); + assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam"); + assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb"); + 
assertTypeByName("application/vnd.ms-powerpoint", "x.ppt"); + assertTypeByName("application/vnd.ms-powerpoint", "x.pot"); + assertTypeByName("application/vnd.ms-powerpoint", "x.pps"); + assertTypeByName("application/vnd.ms-powerpoint", "x.ppa"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx"); + assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam"); + assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm"); + assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm"); + assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm"); + } + + /** + * Note - detecting container formats by mime magic is very very + * iffy, as we can't be sure where things will end up. + * People really ought to use the container aware detection... + */ + @Test + public void testOLE2Detection() throws Exception { + // These have the properties block near the start, so our mime + // magic will spot them + assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls"); + + // This one quite legitimately doesn't have its properties block + // as one of the first couple of entries + // As such, our mime magic can't figure it out... 
+ assertTypeByData("application/x-tika-msoffice", "testWORD.doc"); + assertTypeByData("application/x-tika-msoffice", "testPPT.ppt"); + + + // By name + data: + + // Those we got right to start with are fine + assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls"); + + // And the name lets us specialise the generic OLE2 + // ones to their actual type + assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt"); + assertTypeByNameAndData("application/msword", "testWORD.doc"); + } + + /** + * Files generated by Works 7.0 Spreadsheet application use the OLE2 + * structure and resemble Excel files (they contain a "Workbook"). They are + * not Excel though. They are distinguished from Excel files with an + * additional top-level entry below the root of the POI filesystem. + * + * @throws Exception + */ + @Test + public void testWorksSpreadsheetDetection() throws Exception { + assertTypeDetection("testWORKSSpreadsheet7.0.xlr", + // with name-only, everything should be all right + "application/x-tika-msworks-spreadsheet", + // this is possible due to MimeTypes guessing the type + // based on the WksSSWorkBook near the beginning of the + // file + "application/x-tika-msworks-spreadsheet", + // this is right, the magic-based detection works, there is + // no need for the name-based detection to refine it + "application/x-tika-msworks-spreadsheet"); + } + + @Test + public void testStarOfficeDetection() throws Exception { + assertTypeDetection("testVORCalcTemplate.vor", + "application/x-staroffice-template", + "application/vnd.stardivision.calc", + "application/vnd.stardivision.calc"); + assertTypeDetection("testVORDrawTemplate.vor", + "application/x-staroffice-template", + "application/vnd.stardivision.draw", + "application/vnd.stardivision.draw"); + assertTypeDetection("testVORImpressTemplate.vor", + "application/x-staroffice-template", + "application/vnd.stardivision.impress", + "application/vnd.stardivision.impress"); + 
assertTypeDetection("testVORWriterTemplate.vor", + "application/x-staroffice-template", + "application/vnd.stardivision.writer", + "application/vnd.stardivision.writer"); + + assertTypeDetection("testStarOffice-5.2-calc.sdc", + "application/vnd.stardivision.calc", + "application/vnd.stardivision.calc", + "application/vnd.stardivision.calc"); + assertTypeDetection("testStarOffice-5.2-draw.sda", + "application/vnd.stardivision.draw", + "application/vnd.stardivision.draw", + "application/vnd.stardivision.draw"); + assertTypeDetection("testStarOffice-5.2-impress.sdd", + "application/vnd.stardivision.impress", + "application/vnd.stardivision.impress", + "application/vnd.stardivision.impress"); + assertTypeDetection("testStarOffice-5.2-writer.sdw", + "application/vnd.stardivision.writer", + "application/vnd.stardivision.writer", + "application/vnd.stardivision.writer"); + } + + /** + * Files generated by Works Word Processor versions 3.0 and 4.0 use the + * OLE2 structure. They don't resemble Word though. + * + * @throws Exception + */ + @Test + public void testOldWorksWordProcessorDetection() throws Exception { + assertTypeDetection( + "testWORKSWordProcessor3.0.wps", + // .wps is just like any other works extension + "application/vnd.ms-works", + // this is due to MatOST substring + "application/vnd.ms-works", + // magic-based detection works, no need to refine it + "application/vnd.ms-works"); + + // files in version 4.0 are no different from those in version 3.0 + assertTypeDetection( + "testWORKSWordProcessor4.0.wps", + "application/vnd.ms-works", + "application/vnd.ms-works", + "application/vnd.ms-works"); + } + + /** + * Files from Excel 2 through 4 are based on the BIFF record + * structure, but without a wrapping OLE2 structure. 
+ * Excel 5 and Excel 95+ work on OLE2 + */ + @Test + public void testOldExcel() throws Exception { + // With just a name, we'll think everything's a new Excel file + assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls"); + assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls"); + assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls"); + + // With data, we can work out if it's old or new style + assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls"); + assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls"); + assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls"); + + assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls"); + assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls"); + assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls"); + } + + /** + * Note - detecting container formats by mime magic is very very + * iffy, as we can't be sure where things will end up. + * People really ought to use the container aware detection... + */ + @Test + public void testOoxmlDetection() throws Exception { + // These two do luckily have [Content_Types].xml near the start, + // so our mime magic will spot them + assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx"); + assertTypeByData("application/x-tika-ooxml", "testPPT.pptx"); + + // This one quite legitimately doesn't have its [Content_Types].xml + // file as one of the first couple of entries + // As such, our mime magic can't figure it out... 
+ assertTypeByData("application/zip", "testWORD.docx"); + + // If we give the filename as well as the data, we can + // specialise the ooxml generic one to the correct type + assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx"); + assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx"); + assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx"); + + // Test a few of the less usual ones + assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb"); + assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm"); + assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm"); + assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm"); + } + + /** + * Note - container based formats, needs container detection + * to be properly correct + */ + @Test + public void testVisioDetection() throws Exception { + // By Name, should get it right + assertTypeByName("application/vnd.visio", "testVISIO.vsd"); + assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm"); + assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx"); + assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm"); + assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx"); + assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm"); + assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx"); + + // By Name and Data, should get it right + assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd"); + assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm"); + 
assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx"); + assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm"); + assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx"); + assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm"); + assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx"); + + // By Data only, will get the container parent + assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm"); + assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx"); + } + + /** + * Note - detecting container formats by mime magic is very very + * iffy, as we can't be sure where things will end up. + * People really ought to use the container aware detection... 
+ */ + @Test + public void testIWorkDetection() throws Exception { + // By name is easy + assertTypeByName("application/vnd.apple.keynote", "testKeynote.key"); + assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers"); + assertTypeByName("application/vnd.apple.pages", "testPages.pages"); + + // We can't do it by data, as we'd need to unpack + // the zip file to check the XML + assertTypeByData("application/zip", "testKeynote.key"); + + assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key"); + assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers"); + assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages"); + } + + @Test + public void testArchiveDetection() throws Exception { + assertTypeByName("application/x-archive", "test.ar"); + assertTypeByName("application/zip", "test.zip"); + assertTypeByName("application/x-tar", "test.tar"); + assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it + assertTypeByName("application/x-cpio", "test.cpio"); + + // TODO Add an example .deb and .udeb, then check these + + // Check the mime magic patterns for them work too + assertTypeByData("application/x-archive", "testARofText.ar"); + assertTypeByData("application/x-archive", "testARofSND.ar"); + assertTypeByData("application/zip", "test-documents.zip"); + assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR + assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it + assertTypeByData("application/x-cpio", "test-documents.cpio"); + + // For spanned zip files, the .zip file doesn't have the header, it's the other parts + assertTypeByData("application/octet-stream", "test-documents-spanned.zip"); + assertTypeByData("application/zip", "test-documents-spanned.z01"); + } + + @Test + public void testFeedsDetection() throws Exception { + assertType("application/rss+xml", "rsstest.rss"); + 
assertType("application/atom+xml", "testATOM.atom"); + assertTypeByData("application/rss+xml", "rsstest.rss"); + assertTypeByName("application/rss+xml", "rsstest.rss"); + assertTypeByData("application/atom+xml", "testATOM.atom"); + assertTypeByName("application/atom+xml", "testATOM.atom"); + } + + @Test + public void testFitsDetection() throws Exception { + // FITS image created using imagemagick convert of testJPEG.jpg + assertType("application/fits", "testFITS.fits"); + assertTypeByData("application/fits", "testFITS.fits"); + assertTypeByName("application/fits", "testFITS.fits"); + } + + @Test + public void testJpegDetection() throws Exception { + assertType("image/jpeg", "testJPEG.jpg"); + assertTypeByData("image/jpeg", "testJPEG.jpg"); + assertTypeByName("image/jpeg", "x.jpg"); + assertTypeByName("image/jpeg", "x.JPG"); + assertTypeByName("image/jpeg", "x.jpeg"); + assertTypeByName("image/jpeg", "x.JPEG"); + assertTypeByName("image/jpeg", "x.jpe"); + assertTypeByName("image/jpeg", "x.jif"); + assertTypeByName("image/jpeg", "x.jfif"); + assertTypeByName("image/jpeg", "x.jfi"); + + assertType("image/jp2", "testJPEG.jp2"); + assertTypeByData("image/jp2", "testJPEG.jp2"); + assertTypeByName("image/jp2", "x.jp2"); + } + + @Test + public void testBpgDetection() throws Exception { + assertType("image/x-bpg", "testBPG.bpg"); + assertTypeByData("image/x-bpg", "testBPG.bpg"); + assertTypeByData("image/x-bpg", "testBPG_commented.bpg"); + assertTypeByName("image/x-bpg", "x.bpg"); + } + + @Test + public void testTiffDetection() throws Exception { + assertType("image/tiff", "testTIFF.tif"); + assertTypeByData("image/tiff", "testTIFF.tif"); + assertTypeByName("image/tiff", "x.tiff"); + assertTypeByName("image/tiff", "x.tif"); + assertTypeByName("image/tiff", "x.TIF"); + } + + @Test + public void testGifDetection() throws Exception { + assertType("image/gif", "testGIF.gif"); + assertTypeByData("image/gif", "testGIF.gif"); + assertTypeByName("image/gif", "x.gif"); + 
assertTypeByName("image/gif", "x.GIF"); + } + + @Test + public void testPngDetection() throws Exception { + assertType("image/png", "testPNG.png"); + assertTypeByData("image/png", "testPNG.png"); + assertTypeByName("image/png", "x.png"); + assertTypeByName("image/png", "x.PNG"); + } + + @Test + public void testWEBPDetection() throws Exception { + assertType("image/webp", "testWEBP.webp"); + assertTypeByData("image/webp", "testWEBP.webp"); + assertTypeByName("image/webp", "x.webp"); + assertTypeByName("image/webp", "x.WEBP"); + } + + @Test + public void testBmpDetection() throws Exception { + assertType("image/x-ms-bmp", "testBMP.bmp"); + assertTypeByData("image/x-ms-bmp", "testBMP.bmp"); + assertTypeByName("image/x-ms-bmp", "x.bmp"); + assertTypeByName("image/x-ms-bmp", "x.BMP"); + assertTypeByName("image/x-ms-bmp", "x.dib"); + assertTypeByName("image/x-ms-bmp", "x.DIB"); + //false positive check -- contains part of BMP signature + assertType("text/plain", "testBMPfp.txt"); + } + + @Test + public void testPnmDetection() throws Exception { + assertType("image/x-portable-bitmap", "testPBM.pbm"); + assertType("image/x-portable-graymap", "testPGM.pgm"); + assertType("image/x-portable-pixmap", "testPPM.ppm"); + assertTypeByData("image/x-portable-bitmap", "testPBM.pbm"); + assertTypeByData("image/x-portable-graymap", "testPGM.pgm"); + assertTypeByData("image/x-portable-pixmap", "testPPM.ppm"); + assertTypeByName("image/x-portable-anymap", "x.pnm"); + assertTypeByName("image/x-portable-anymap", "x.PNM"); + assertTypeByName("image/x-portable-bitmap", "x.pbm"); + assertTypeByName("image/x-portable-bitmap", "x.PBM"); + assertTypeByName("image/x-portable-graymap", "x.pgm"); + assertTypeByName("image/x-portable-graymap", "x.PGM"); + assertTypeByName("image/x-portable-pixmap", "x.ppm"); + assertTypeByName("image/x-portable-pixmap", "x.PPM"); + } + + @Test + public void testPictDetection() throws Exception { + assertType("image/x-pict", "testPICT.pct"); + 
assertTypeByData("image/x-pict", "testPICT.pct"); + assertTypeByName("image/x-pict", "x.pic"); + assertTypeByName("image/x-pict", "x.PCT"); + } + + @Test + public void testCgmDetection() throws Exception { + // TODO: Need a test image file + assertTypeByName("image/cgm", "x.cgm"); + assertTypeByName("image/cgm", "x.CGM"); + } + + @Test + public void testRdfXmlDetection() throws Exception { + assertTypeByName("application/rdf+xml", "x.rdf"); + assertTypeByName("application/rdf+xml", "x.owl"); + } + + @Test + public void testSvgDetection() throws Exception { + assertType("image/svg+xml", "testSVG.svg"); + assertTypeByData("image/svg+xml", "testSVG.svg"); + assertTypeByName("image/svg+xml", "x.svg"); + assertTypeByName("image/svg+xml", "x.SVG"); + + // Should *.svgz be svg or gzip + assertType("application/gzip", "testSVG.svgz"); + assertTypeByData("application/gzip", "testSVG.svgz"); + assertTypeByName("image/svg+xml", "x.svgz"); + assertTypeByName("image/svg+xml", "x.SVGZ"); + } + + @Test + public void testPdfDetection() throws Exception { + // PDF extension by name is enough + assertTypeByName("application/pdf", "x.pdf"); + assertTypeByName("application/pdf", "x.PDF"); + + // For normal PDFs, can get by name or data or both + assertType("application/pdf", "testPDF.pdf"); + assertTypeByData("application/pdf", "testPDF.pdf"); + + // PDF with a BoM works both ways too + assertType("application/pdf", "testPDF_bom.pdf"); + assertTypeByData("application/pdf", "testPDF_bom.pdf"); + } + + @Test + public void testSwfDetection() throws Exception { + assertTypeByName("application/x-shockwave-flash", "x.swf"); + assertTypeByName("application/x-shockwave-flash", "x.SWF"); + assertTypeByName("application/x-shockwave-flash", "test1.swf"); + assertTypeByName("application/x-shockwave-flash", "test2.swf"); + assertTypeByName("application/x-shockwave-flash", "test3.swf"); + } + + @Test + public void testDwgDetection() throws Exception { + assertTypeByName("image/vnd.dwg", "x.dwg"); + 
assertTypeByData("image/vnd.dwg", "testDWG2004.dwg"); + assertTypeByData("image/vnd.dwg", "testDWG2007.dwg"); + assertTypeByData("image/vnd.dwg", "testDWG2010.dwg"); + } + + @Test + public void testprtDetection() throws Exception { + assertTypeByName("application/x-prt", "x.prt"); + assertTypeByData("application/x-prt", "testCADKEY.prt"); + } + + /** + * Formats which are based on plain text + */ + @Test + public void testTextBasedFormatsDetection() throws Exception { + assertTypeByName("text/plain", "testTXT.txt"); + assertType( "text/plain", "testTXT.txt"); + + assertTypeByName("text/css", "testCSS.css"); + assertType( "text/css", "testCSS.css"); + + assertTypeByName("text/csv", "testCSV.csv"); + assertType( "text/csv", "testCSV.csv"); + + assertTypeByName("text/html", "testHTML.html"); + assertType( "text/html", "testHTML.html"); + + assertTypeByName("application/javascript", "testJS.js"); + assertType( "application/javascript", "testJS.js"); + } + + @Test + public void testJavaDetection() throws Exception { + // TODO Classloader doesn't seem to find the .class file in test-documents + //assertTypeDetection("AutoDetectParser.class", "application/java-vm"); + + // OSX Native Extension + assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib"); + } + + @Test + public void testXmlAndHtmlDetection() throws Exception { + assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>" + .getBytes(UTF_8)); + assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>" + .getBytes(UTF_16LE)); + assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>" + .getBytes(UTF_16BE)); + assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>" + .getBytes(UTF_8)); + assertTypeByData("text/html", "<html><body>HTML</body></html>" + .getBytes(UTF_8)); + 
assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>" + .getBytes(UTF_8)); + } + + @Test + public void testWmfDetection() throws Exception { + assertTypeByName("application/x-msmetafile", "x.wmf"); + assertTypeByData("application/x-msmetafile", "testWMF.wmf"); + assertTypeByName("application/x-msmetafile", "x.WMF"); + + assertTypeByName("application/x-emf", "x.emf"); + assertTypeByData("application/x-emf","testEMF.emf"); + assertTypeByName("application/x-emf", "x.EMF"); + // TODO: Need a test wmz file + assertTypeByName("application/x-ms-wmz", "x.wmz"); + assertTypeByName("application/x-ms-wmz", "x.WMZ"); + // TODO: Need a test emz file + assertTypeByName("application/gzip", "x.emz"); + assertTypeByName("application/gzip", "x.EMZ"); + } + + @Test + public void testPsDetection() throws Exception { + // TODO: Need a test postscript file + assertTypeByName("application/postscript", "x.ps"); + assertTypeByName("application/postscript", "x.PS"); + assertTypeByName("application/postscript", "x.eps"); + assertTypeByName("application/postscript", "x.epsf"); + assertTypeByName("application/postscript", "x.epsi"); + } + + @Test + public void testMicrosoftMultiMediaDetection() throws Exception { + assertTypeByName("video/x-ms-asf", "x.asf"); + assertTypeByName("video/x-ms-wmv", "x.wmv"); + assertTypeByName("audio/x-ms-wma", "x.wma"); + + assertTypeByData("video/x-ms-asf", "testASF.asf"); + assertTypeByData("video/x-ms-wmv", "testWMV.wmv"); + assertTypeByData("audio/x-ms-wma", "testWMA.wma"); + } + + /** + * All 3 DITA types are in theory handled by the same mimetype, + * but we specialise them + */ + @Test + public void testDITADetection() throws Exception { + assertTypeByName("application/dita+xml; format=topic", "test.dita"); + assertTypeByName("application/dita+xml; format=map", "test.ditamap"); + assertTypeByName("application/dita+xml; format=val", "test.ditaval"); + + assertTypeByData("application/dita+xml; format=task", "testDITA.dita"); + 
assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita"); + assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap"); + + assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita"); + assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita"); + assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap"); + + // These are all children of the official type + assertEquals("application/dita+xml", + repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString()); + assertEquals("application/dita+xml", + repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString()); + // Concept inherits from topic + assertEquals("application/dita+xml; format=topic", + repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString()); + } + + /** + * Registers the same file name pattern for two new types: for + * {@code foo/bar} with the last {@code addPattern} argument set to + * {@code true}, and for {@code foo/bar2} with it set to {@code false} + * (presumably the flag marking the pattern as a Java regular expression; + * confirm against {@code MimeTypes.addPattern}). The sample file name + * must be detected as {@code foo/bar} and must not be detected as + * {@code foo/bar2}. + * + * @throws Exception if registering a type or pattern fails + * @since TIKA-194 + */ + @Test + public void testJavaRegex() throws Exception{ + MimeType testType = new MimeType(MediaType.parse("foo/bar")); + this.repo.add(testType); + assertNotNull(repo.forName("foo/bar")); + String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}"; + this.repo.addPattern(testType, pattern, true); + String testFileName = "rtg_sst_grb_0.5.12345678"; + assertEquals("foo/bar", tika.detect(testFileName)); + + MimeType testType2 = new MimeType(MediaType.parse("foo/bar2")); + this.repo.add(testType2); + assertNotNull(repo.forName("foo/bar2")); + this.repo.addPattern(testType2, pattern, false); + // Compare by value: assertNotSame only checks object identity, so it + // cannot fail for two distinct String objects with equal content + org.junit.Assert.assertFalse("foo/bar2".equals(tika.detect(testFileName))); + } + + @Test + public void testRawDetection() throws Exception { + assertTypeByName("image/x-raw-adobe", "x.dng"); + assertTypeByName("image/x-raw-adobe", "x.DNG"); + assertTypeByName("image/x-raw-hasselblad", "x.3fr"); + assertTypeByName("image/x-raw-fuji", "x.raf"); + assertTypeByName("image/x-raw-canon", "x.crw"); + assertTypeByName("image/x-raw-canon", "x.cr2"); + assertTypeByName("image/x-raw-kodak", 
"x.k25"); + assertTypeByName("image/x-raw-kodak", "x.kdc"); + assertTypeByName("image/x-raw-kodak", "x.dcs"); + assertTypeByName("image/x-raw-kodak", "x.drf"); + assertTypeByName("image/x-raw-minolta", "x.mrw"); + assertTypeByName("image/x-raw-nikon", "x.nef"); + assertTypeByName("image/x-raw-nikon", "x.nrw"); + assertTypeByName("image/x-raw-olympus", "x.orf"); + assertTypeByName("image/x-raw-pentax", "x.ptx"); + assertTypeByName("image/x-raw-pentax", "x.pef"); + assertTypeByName("image/x-raw-sony", "x.arw"); + assertTypeByName("image/x-raw-sony", "x.srf"); + assertTypeByName("image/x-raw-sony", "x.sr2"); + assertTypeByName("image/x-raw-sigma", "x.x3f"); + assertTypeByName("image/x-raw-epson", "x.erf"); + assertTypeByName("image/x-raw-mamiya", "x.mef"); + assertTypeByName("image/x-raw-leaf", "x.mos"); + assertTypeByName("image/x-raw-panasonic", "x.raw"); + assertTypeByName("image/x-raw-panasonic", "x.rw2"); + assertTypeByName("image/x-raw-phaseone", "x.iiq"); + assertTypeByName("image/x-raw-red", "x.r3d"); + assertTypeByName("image/x-raw-imacon", "x.fff"); + assertTypeByName("image/x-raw-logitech", "x.pxn"); + assertTypeByName("image/x-raw-casio", "x.bay"); + assertTypeByName("image/x-raw-rawzor", "x.rwz"); + } + + /** + * Tests that we correctly detect the font types + */ + @Test + public void testFontDetection() throws Exception { + assertTypeByName("application/x-font-adobe-metric", "x.afm"); + assertTypeByData("application/x-font-adobe-metric", "testAFM.afm"); + + assertTypeByName("application/x-font-printer-metric", "x.pfm"); + // TODO Get a sample .pfm file + assertTypeByData( + "application/x-font-printer-metric", + new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f, + 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20} + ); + + assertTypeByName("application/x-font-type1", "x.pfa"); + // TODO Get a sample .pfa file + assertTypeByData( + "application/x-font-type1", + new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, + 0x62, 0x65, 0x46, 0x6f, 
0x6e, 0x74, 0x2d, 0x31, + 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20} + ); + + assertTypeByName("application/x-font-type1", "x.pfb"); + // TODO Get a sample .pfb file + assertTypeByData( + "application/x-font-type1", + new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21, + 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65, + 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 } + ); + } + + /** + * Tests MimeTypes.getMimeType(URL), which examines both the byte header + * and, if necessary, the URL's extension. + */ + @Test + public void testMimeDeterminationForTestDocuments() throws Exception { + assertType("text/html", "testHTML.html"); + assertType("application/zip", "test-documents.zip"); + + assertType("text/html", "testHTML_utf8.html"); + assertType( + "application/vnd.oasis.opendocument.text", + "testOpenOffice2.odt"); + assertType("application/pdf", "testPDF.pdf"); + assertType("application/rtf", "testRTF.rtf"); + assertType("text/plain", "testTXT.txt"); + assertType("application/xml", "testXML.xml"); + assertType("audio/basic", "testAU.au"); + assertType("audio/x-aiff", "testAIFF.aif"); + assertType("audio/x-wav", "testWAV.wav"); + assertType("audio/midi", "testMID.mid"); + assertType("application/x-msaccess", "testACCESS.mdb"); + assertType("application/x-font-ttf", "testTrueType3.ttf"); + } + + @Test + public void test7ZipDetection() throws Exception { + assertTypeByName("application/x-7z-compressed","test-documents.7z"); + assertTypeByData("application/x-7z-compressed","test-documents.7z"); + assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z"); + } + + @Test + public void testWebArchiveDetection() throws Exception { + assertTypeByName("application/x-webarchive","x.webarchive"); + // By data alone the sample is reported as application/x-bplist; + // the file name is needed to arrive at application/x-webarchive + assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive"); + assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive"); + } + + /** + * KML, and KMZ (zipped KML) + */ + @Test + public void testKMLZDetection() throws 
Exception { + assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml"); + assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml"); + assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml"); + + assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz"); + assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz"); + + // By data only, mimetype magic only gets us to a .zip + // We need to use the Zip Aware detector to get the full type + assertTypeByData("application/zip","testKMZ.kmz"); + } + + @Test + public void testCreativeSuite() throws IOException { + assertTypeDetection("testINDD.indd", "application/x-adobe-indesign"); + assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop"); + } + + @Test + public void testAMR() throws IOException { + // AMR matches on name, data or both + assertTypeDetection("testAMR.amr", "audio/amr"); + + // AMR-WB subtype shares extension, so needs data to identify + assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb"); + + // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet + //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+"); + } + + @Test + public void testEmail() throws IOException { + // EMLX + assertTypeDetection("testEMLX.emlx", "message/x-emlx"); + + // Groupwise + assertTypeDetection("testGroupWiseEml.eml", "message/rfc822"); + + // Lotus + assertTypeDetection("testLotusEml.eml", "message/rfc822"); + + // Thunderbird - doesn't currently work by name + assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml"); + } + + @Test + public void testAxCrypt() throws Exception { + // test-TXT.txt encrypted with a key of "tika" + assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt"); + } + + @Test + public void testWindowsEXE() throws Exception { + assertTypeByName("application/x-msdownload", "x.dll"); + 
assertTypeByName("application/x-ms-installer", "x.msi"); + assertTypeByName("application/x-dosexec", "x.exe"); + + assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe"); + assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe"); + + // A jar file with part of a PE header, but not a full one + // should still be detected as a zip or jar (without/with name) + assertTypeByData("application/zip", "testJAR_with_PEHDR.jar"); + assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar"); + } + + @Test + public void testMatroskaDetection() throws Exception { + assertType("video/x-matroska", "testMKV.mkv"); + // TODO: Need custom detector data detection, see TIKA-1180 + assertTypeByData("application/x-matroska", "testMKV.mkv"); + assertTypeByNameAndData("video/x-matroska", "testMKV.mkv"); + assertTypeByName("video/x-matroska", "x.mkv"); + assertTypeByName("video/x-matroska", "x.MKV"); + assertTypeByName("audio/x-matroska", "x.mka"); + assertTypeByName("audio/x-matroska", "x.MKA"); + } + + @Test + public void testWebMDetection() throws Exception { + assertType("video/webm", "testWEBM.webm"); + // TODO: Need custom detector data detection, see TIKA-1180 + assertTypeByData("application/x-matroska", "testWEBM.webm"); + assertTypeByNameAndData("video/webm", "testWEBM.webm"); + assertTypeByName("video/webm", "x.webm"); + assertTypeByName("video/webm", "x.WEBM"); + } + + /** Test getMimeType(byte[]) */ + @Test + public void testGetMimeType_byteArray() throws IOException { + // Plain text detection + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); + assertText(new byte[] { 'a', 'b', 'c' }); + assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); + assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); + } + + @Test + public void testBerkeleyDB() throws IOException { + assertTypeByData( 
+ "application/x-berkeley-db; format=btree; version=2", + "testBDB_btree_2.db"); + assertTypeByData( + "application/x-berkeley-db; format=btree; version=3", + "testBDB_btree_3.db"); + assertTypeByData( + "application/x-berkeley-db; format=btree; version=4", + "testBDB_btree_4.db"); + // V4 and V5 share the same btree format + assertTypeByData( + "application/x-berkeley-db; format=btree; version=4", + "testBDB_btree_5.db"); + + assertTypeByData( + "application/x-berkeley-db; format=hash; version=2", + "testBDB_hash_2.db"); + assertTypeByData( + "application/x-berkeley-db; format=hash; version=3", + "testBDB_hash_3.db"); + assertTypeByData( + "application/x-berkeley-db; format=hash; version=4", + "testBDB_hash_4.db"); + assertTypeByData( + "application/x-berkeley-db; format=hash; version=5", + "testBDB_hash_5.db"); + } + + /** + * CBOR typically contains HTML + */ + @Test + public void testCBOR() throws IOException { + assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor"); + assertTypeByData("application/cbor", "NUTCH-1997.cbor"); + } + + @Test + public void testZLIB() throws IOException { + // ZLIB encoded versions of testTXT.txt + assertTypeByData("application/zlib", "testTXT.zlib"); + assertTypeByData("application/zlib", "testTXT.zlib0"); + assertTypeByData("application/zlib", "testTXT.zlib5"); + assertTypeByData("application/zlib", "testTXT.zlib9"); + } + + @Test + public void testTextFormats() throws Exception { + assertType("application/x-bibtex-text-file", "testBIBTEX.bib"); + assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib"); + } + + @Test + public void testCodeFormats() throws Exception { + assertType("text/x-csrc", "testC.c"); + assertType("text/x-chdr", "testH.h"); + assertTypeByData("text/x-csrc", "testC.c"); + assertTypeByData("text/x-chdr", "testH.h"); + + assertTypeByName("text/x-java-source", "testJAVA.java"); + assertType("text/x-java-properties", "testJAVAPROPS.properties"); + + assertType("text/x-matlab", 
"testMATLAB.m"); + assertType("text/x-matlab", "testMATLAB_wtsgaus.m"); + assertType("text/x-matlab", "testMATLAB_barcast.m"); + assertTypeByData("text/x-matlab", "testMATLAB.m"); + assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m"); + assertTypeByData("text/x-matlab", "testMATLAB_barcast.m"); + } + + @Test + public void testWebVTT() throws Exception { + assertType("text/vtt", "testWebVTT.vtt"); + assertTypeByData("text/vtt", "testWebVTT.vtt"); + } + + private void assertText(byte[] prefix) throws IOException { + assertMagic("text/plain", prefix); + } + + private void assertNotText(byte[] prefix) throws IOException { + assertMagic("application/octet-stream", prefix); + } + + private void assertMagic(String expected, byte[] prefix) throws IOException { + MediaType type = + repo.detect(new ByteArrayInputStream(prefix), new Metadata()); + assertNotNull(type); + assertEquals(expected, type.toString()); + } + + private void assertType(String expected, String filename) throws Exception { + try (InputStream stream = getTestDocumentAsStream(filename)) { + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + assertEquals(expected, repo.detect(stream, metadata).toString()); + } + } + + private void assertTypeByName(String expected, String filename) + throws IOException { + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + assertEquals(expected, repo.detect(null, metadata).toString()); + } + + private void assertTypeByData(String expected, String filename) + throws IOException { + try (InputStream stream = getTestDocumentAsStream(filename)) { + Metadata metadata = new Metadata(); + assertEquals(expected, repo.detect(stream, metadata).toString()); + } + } + + private void assertTypeByData(String expected, byte[] data) + throws IOException { + try (InputStream stream = new ByteArrayInputStream(data)) { + Metadata metadata = new Metadata(); + assertEquals(expected, repo.detect(stream, 
metadata).toString()); + } + } + + private void assertTypeDetection(String filename, String type) + throws IOException { + assertTypeDetection(filename, type, type, type); + } + + private void assertTypeDetection(String filename, String byName, String byData, + String byNameAndData) throws IOException { + assertTypeByName(byName, filename); + assertTypeByData(byData, filename); + assertTypeByNameAndData(byNameAndData, filename); + } + + private void assertTypeByNameAndData(String expected, String filename) + throws IOException { + assertEquals(expected, getTypeByNameAndData(filename).toString()); + } + + private MediaType getTypeByNameAndData(String filename) throws IOException { + try (InputStream stream = getTestDocumentAsStream(filename)) { + assertNotNull("Test document not found: " + filename, stream); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + return repo.detect(stream, metadata); + } + } +}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java new file mode 100644 index 0000000..91b054e --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -0,0 +1,459 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPDM; +import org.apache.tika.mime.MediaType; +import org.apache.tika.sax.BodyContentHandler; +import org.gagravarr.tika.FlacParser; +import org.gagravarr.tika.OpusParser; +import org.gagravarr.tika.VorbisParser; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class AutoDetectParserTest { + private TikaConfig tika = TikaConfig.getDefaultConfig(); + + // Easy to read constants for the MIME types: + private static final String RAW = "application/octet-stream"; + private static final String EXCEL = "application/vnd.ms-excel"; + private static final String HTML = "text/html; charset=ISO-8859-1"; + private static final String PDF = "application/pdf"; + private static final String POWERPOINT = "application/vnd.ms-powerpoint"; + private static final String KEYNOTE = "application/vnd.apple.keynote"; + private static final String PAGES = "application/vnd.apple.pages"; + private static final String NUMBERS = "application/vnd.apple.numbers"; + private static final String CHM = "application/vnd.ms-htmlhelp"; + private static final String RTF = "application/rtf"; + private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1"; + private static final String UTF8TEXT = 
"text/plain; charset=UTF-8"; + private static final String WORD = "application/msword"; + private static final String XML = "application/xml"; + private static final String RSS = "application/rss+xml"; + private static final String BMP = "image/x-ms-bmp"; + private static final String GIF = "image/gif"; + private static final String JPEG = "image/jpeg"; + private static final String PNG = "image/png"; + private static final String OGG_VORBIS = "audio/vorbis"; + private static final String OGG_OPUS = "audio/opus"; + private static final String OGG_FLAC = "audio/x-oggflac"; + private static final String FLAC_NATIVE= "audio/x-flac"; + private static final String OPENOFFICE + = "application/vnd.oasis.opendocument.text"; + + + /** + * This is where a single test is done. + * @param tp the parameters encapsulated in a TestParams instance + * @throws IOException + */ + private void assertAutoDetect(TestParams tp) throws Exception { + try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) { + if (input == null) { + fail("Could not open stream from specified resource: " + + tp.resourceRealName); + } + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName); + metadata.set(Metadata.CONTENT_TYPE, tp.statedType); + ContentHandler handler = new BodyContentHandler(); + new AutoDetectParser(tika).parse(input, handler, metadata); + + assertEquals("Bad content type: " + tp, + tp.realType, metadata.get(Metadata.CONTENT_TYPE)); + + if (tp.expectedContentFragment != null) { + assertTrue("Expected content not found: " + tp, + handler.toString().contains(tp.expectedContentFragment)); + } + } + } + + /** + * Convenience method -- its sole purpose of existence is to make the + * call to it more readable than it would be if a TestParams instance + * would need to be instantiated there. 
+ * + * @param resourceRealName real name of resource + * @param resourceStatedName stated name -- will a bad name fool us? + * @param realType - the real MIME type + * @param statedType - stated MIME type - will a wrong one fool us? + * @param expectedContentFragment - something expected in the text + * @throws Exception + */ + private void assertAutoDetect(String resourceRealName, + String resourceStatedName, + String realType, + String statedType, + String expectedContentFragment) + throws Exception { + + assertAutoDetect(new TestParams(resourceRealName, resourceStatedName, + realType, statedType, expectedContentFragment)); + } + + private void assertAutoDetect( + String resource, String type, String content) throws Exception { + + resource = "/test-documents/" + resource; + + // TODO !!!! The disabled tests below should work! + // The correct MIME type should be determined regardless of the + // stated type (ContentType hint) and the stated URL name. + + + // Try different combinations of correct and incorrect arguments: + final String wrongMimeType = RAW; + assertAutoDetect(resource, resource, type, type, content); + assertAutoDetect(resource, resource, type, null, content); + assertAutoDetect(resource, resource, type, wrongMimeType, content); + + assertAutoDetect(resource, null, type, type, content); + assertAutoDetect(resource, null, type, null, content); + assertAutoDetect(resource, null, type, wrongMimeType, content); + + final String badResource = "a.xyz"; + assertAutoDetect(resource, badResource, type, type, content); + assertAutoDetect(resource, badResource, type, null, content); + assertAutoDetect(resource, badResource, type, wrongMimeType, content); + } + + @Test + public void testKeynote() throws Exception { + assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation"); + } + + @Test + public void testPages() throws Exception { + assertAutoDetect("testPages.pages", PAGES, "Sample pages document"); + } + + @Test + public void testNumbers() 
throws Exception { + assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668"); + } + + @Test + public void testChm() throws Exception { + assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used."); + } + + @Test + public void testEpub() throws Exception { + assertAutoDetect( + "testEPUB.epub", "application/epub+zip", + "The previous headings were subchapters"); + } + + @Test + public void testExcel() throws Exception { + assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet"); + } + + @Test + public void testHTML() throws Exception { + assertAutoDetect("testHTML.html", HTML, "Test Indexation Html"); + } + + @Test + public void testOpenOffice() throws Exception { + assertAutoDetect("testOpenOffice2.odt", OPENOFFICE, + "This is a sample Open Office document"); + } + + @Test + public void testPDF() throws Exception { + assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit"); + + } + + @Test + public void testPowerpoint() throws Exception { + assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide"); + } + + @Test + public void testRdfXml() throws Exception { + assertAutoDetect("testRDF.rdf", "application/rdf+xml", ""); + } + + @Test + public void testRTF() throws Exception { + assertAutoDetect("testRTF.rtf", RTF, "indexation Word"); + } + + @Test + public void testText() throws Exception { + assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt"); + } + + @Test + public void testTextNonASCIIUTF8() throws Exception { + assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog"); + } + + @Test + public void testWord() throws Exception { + assertAutoDetect("testWORD.doc", WORD, "Sample Word Document"); + } + + @Test + public void testXML() throws Exception { + assertAutoDetect("testXML.xml", XML, "Lius"); + } + + @Test + public void testRss() throws Exception { + 
assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test"); + } + + @Test + public void testImages() throws Exception { + assertAutoDetect("testBMP.bmp", BMP, null); + assertAutoDetect("testGIF.gif", GIF, null); + assertAutoDetect("testJPEG.jpg", JPEG, null); + assertAutoDetect("testPNG.png", PNG, null); + } + + /** + * Make sure that zip bomb attacks are prevented. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a> + */ + @Test + public void testZipBombPrevention() throws Exception { + try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream( + "/test-documents/TIKA-216.tgz")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + new AutoDetectParser(tika).parse(tgz, handler, metadata); + fail("Zip bomb was not detected"); + } catch (TikaException e) { + // expected + } + } + + /** + * Make sure XML parse errors don't trigger ZIP bomb detection. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a> + */ + @Test + public void testNoBombDetectedForInvalidXml() throws Exception { + // create zip with ten empty / invalid XML files, 1.xml .. 
10.xml + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ZipOutputStream zos = new ZipOutputStream(baos); + for (int i = 1; i <= 10; i++) { + zos.putNextEntry(new ZipEntry(i + ".xml")); + zos.closeEntry(); + } + zos.finish(); + zos.close(); + new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1), + new Metadata()); + } + + /** + * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc) + * have been correctly included, and are available + */ + @SuppressWarnings("deprecation") + @Test + public void testOggFlacAudio() throws Exception { + // The three test files should all have similar test data + String[] testFiles = new String[] { + "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", + "testOPUS.opus" + }; + MediaType[] mediaTypes = new MediaType[] { + MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), + MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) + }; + + // Check we can load the parsers, and they claim to do the right things + VorbisParser vParser = new VorbisParser(); + assertNotNull("Parser not found for " + mediaTypes[0], + vParser.getSupportedTypes(new ParseContext())); + + FlacParser fParser = new FlacParser(); + assertNotNull("Parser not found for " + mediaTypes[1], + fParser.getSupportedTypes(new ParseContext())); + assertNotNull("Parser not found for " + mediaTypes[2], + fParser.getSupportedTypes(new ParseContext())); + + OpusParser oParser = new OpusParser(); + assertNotNull("Parser not found for " + mediaTypes[3], + oParser.getSupportedTypes(new ParseContext())); + + // Check we found the parser + CompositeParser parser = (CompositeParser)tika.getParser(); + for (MediaType mt : mediaTypes) { + assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) ); + } + + // Have each file parsed, and check + for (int i=0; i<testFiles.length; i++) { + String file = testFiles[i]; + try (InputStream input = AutoDetectParserTest.class.getResourceAsStream( + 
"/test-documents/" + file)) { + if (input == null) { + fail("Could not find test file " + file); + } + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new AutoDetectParser(tika).parse(input, handler, metadata); + + assertEquals("Incorrect content type for " + file, + mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE)); + + // Check some of the common metadata + // Old style metadata + assertEquals("Test Artist", metadata.get(Metadata.AUTHOR)); + assertEquals("Test Title", metadata.get(Metadata.TITLE)); + // New style metadata + assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE)); + + // Check some of the XMPDM metadata + if (!file.endsWith(".opus")) { + assertEquals("Test Album", metadata.get(XMPDM.ALBUM)); + } + assertEquals("Test Artist", metadata.get(XMPDM.ARTIST)); + assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); + assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE)); + + // Check some of the text + String content = handler.toString(); + assertTrue(content.contains("Test Title")); + assertTrue(content.contains("Test Artist")); + } + } + } + + /** + * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit + * list of supported parsers. 
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a> + */ + @Test + public void testSpecificParserList() throws Exception { + AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser()); + + InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8)); + Metadata metadata = new Metadata(); + parser.parse(is, new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("value", metadata.get("MyParser")); + } + + private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser"); + + /** + * A test detector which always returns the type supported + * by the test parser + */ + @SuppressWarnings("serial") + private static class MyDetector implements Detector { + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + return MY_MEDIA_TYPE; + } + } + + @SuppressWarnings("serial") + private static class MyParser extends AbstractParser { + public Set<MediaType> getSupportedTypes(ParseContext context) { + Set<MediaType> supportedTypes = new HashSet<MediaType>(); + supportedTypes.add(MY_MEDIA_TYPE); + return supportedTypes; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) { + metadata.add("MyParser", "value"); + } + + } + + /** + * Minimal class to encapsulate all parameters -- the main reason for + * its existence is to aid in debugging via its toString() method. + * + * Getters and setters intentionally not provided. 
+ */ + private static class TestParams { + + public String resourceRealName; + public String resourceStatedName; + public String realType; + public String statedType; + public String expectedContentFragment; + + + private TestParams(String resourceRealName, + String resourceStatedName, + String realType, + String statedType, + String expectedContentFragment) { + this.resourceRealName = resourceRealName; + this.resourceStatedName = resourceStatedName; + this.realType = realType; + this.statedType = statedType; + this.expectedContentFragment = expectedContentFragment; + } + + + /** + * Produces a string like the following: + * + * <pre> + * Test parameters: + * resourceRealName = /test-documents/testEXCEL.xls + * resourceStatedName = null + * realType = application/vnd.ms-excel + * statedType = null + * expectedContentFragment = Sample Excel Worksheet + * </pre> + */ + public String toString() { + return "Test parameters:\n" + + " resourceRealName = " + resourceRealName + "\n" + + " resourceStatedName = " + resourceStatedName + "\n" + + " realType = " + realType + "\n" + + " statedType = " + statedType + "\n" + + " expectedContentFragment = " + expectedContentFragment + "\n"; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java new file mode 100644 index 0000000..66323d3 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DigestingParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.digesting.CommonsDigester; +import org.junit.Test; + + +public class DigestingParserTest extends TikaTest { + + private final static String P = TikaCoreProperties.TIKA_META_PREFIX+ + "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER; + + private final int UNLIMITED = 1000000;//well, not really, but longer than input file + private final Parser p = new AutoDetectParser(); + + @Test + public void testBasic() throws Exception { + Map<CommonsDigester.DigestAlgorithm, String> expected = + new HashMap<CommonsDigester.DigestAlgorithm, String>(); + + expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f"); + expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772"); + 
expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6"); + expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" + + "82bc53764a0f1430d134ae3b70c32654"); + expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+ + "8b8a6923fdf251ddab72c6e4b5d54160" + + "9db917ba4260d1767995a844d8d654df"); + expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+ + "da4c21f36b54d7acd06fcf68e974663b"+ + "fed1d256875be58d22beacf178154cc3"+ + "a1178cb73443deaa53aa0840324708bb"); + + //test each one + for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) { + Metadata m = new Metadata(); + XMLResult xml = getXML("test_recursive_embedded.docx", + new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m); + assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString())); + } + + + //test comma separated + CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512"); + Metadata m = new Metadata(); + XMLResult xml = getXML("test_recursive_embedded.docx", + new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m); + for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{ + CommonsDigester.DigestAlgorithm.MD5, + CommonsDigester.DigestAlgorithm.SHA256, + CommonsDigester.DigestAlgorithm.SHA384, + CommonsDigester.DigestAlgorithm.SHA512}) { + assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString())); + } + + assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString())); + assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString())); + + } + + @Test + public void testLimitedRead() throws Exception { + CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5; + int limit = 100; + byte[] bytes = new byte[limit]; + InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx"); 
+ is.read(bytes, 0, limit); + is.close(); + Metadata m = new Metadata(); + try { + XMLResult xml = getXML(TikaInputStream.get(bytes), + new DigestingParser(p, new CommonsDigester(100, algo)), m); + } catch (TikaException e) { + //thrown because this is just a file fragment + assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser", + e.getMessage()); + } + String expectedMD5 = m.get(P+"MD5"); + + m = new Metadata(); + XMLResult xml = getXML("test_recursive_embedded.docx", + new DigestingParser(p, new CommonsDigester(100, algo)), m); + assertEquals(expectedMD5, m.get(P+"MD5")); + } + + @Test + public void testReset() throws Exception { + String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0"; + Metadata m = new Metadata(); + XMLResult xml = getXML("test_recursive_embedded.docx", + new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m); + assertEquals(expectedMD5, m.get(P+"MD5")); + } + + @Test + public void testNegativeMaxMarkLength() throws Exception { + Metadata m = new Metadata(); + boolean ex = false; + try { + XMLResult xml = getXML("test_recursive_embedded.docx", + new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m); + } catch (IllegalArgumentException e) { + ex = true; + } + assertTrue("Exception not thrown", ex); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java new file mode 100644 index 0000000..71c07b7 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Reader;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.Test;

/**
 * Tests for {@link ParsingReader}: the text it yields for plain-text and XML
 * input, end-of-stream behaviour, and metadata availability.
 */
public class ParsingReaderTest {

    /**
     * Reads at most {@code count} characters from the reader, one
     * {@link Reader#read()} call per character, stopping early at end of
     * stream.
     *
     * @param reader the reader to consume
     * @param count  the maximum number of characters to read
     * @return the characters read, in order; shorter than {@code count} if
     *         the reader reached its end first
     * @throws Exception if reading fails
     */
    private static String readChars(Reader reader, int count) throws Exception {
        StringBuilder text = new StringBuilder(count);
        for (int i = 0; i < count; i++) {
            int c = reader.read();
            if (c == -1) {
                break;
            }
            text.append((char) c);
        }
        return text.toString();
    }

    /**
     * Plain text is expected to come back unchanged, followed by a single
     * newline and then end of stream.
     */
    @Test
    public void testPlainText() throws Exception {
        String data = "test content";
        String expected = data + "\n";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        // try-with-resources so the reader is closed even if an assertion fails
        try (Reader reader = new ParsingReader(stream, "test.txt")) {
            assertEquals(expected, readChars(reader, expected.length()));
            assertEquals(-1, reader.read());
        }
        // once the reader is closed the source stream must have nothing left
        assertEquals(-1, stream.read());
    }

    /**
     * XML markup is expected to be reduced to its text content, with the
     * whitespace and trailing newline spelled out in the expected string.
     */
    @Test
    public void testXML() throws Exception {
        String data = "<p>test <span>content</span></p>";
        String expected = " test  content\n";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        // try-with-resources so the reader is closed even if an assertion fails
        try (Reader reader = new ParsingReader(stream, "test.xml")) {
            assertEquals(expected, readChars(reader, expected.length()));
            assertEquals(-1, reader.read());
        }
        // once the reader is closed the source stream must have nothing left
        assertEquals(-1, stream.read());
    }

    /**
     * Test case for TIKA-203
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
     */
    @Test
    public void testMetadata() throws Exception {
        Metadata metadata = new Metadata();
        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
                "/test-documents/testEXCEL.xls");
        try (Reader reader = new ParsingReader(
                new AutoDetectParser(), stream, metadata, new ParseContext())) {
            // Metadata should already be available
            assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
            // Check that the internal buffering isn't broken
            assertEquals("Feuil1", readChars(reader, 6));
        }
    }

}
