http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java deleted file mode 100644 index c3d13b7..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ /dev/null @@ -1,1047 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.mime; - -// Junit imports -import static java.nio.charset.StandardCharsets.UTF_16BE; -import static java.nio.charset.StandardCharsets.UTF_16LE; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNotSame; - -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - -import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.junit.Before; -import org.junit.Test; - -/** - * - * Test Suite for the {@link MimeTypes} repository. - * - */ -public class TestMimeTypes { - - private Tika tika; - - private MimeTypes repo; - - private URL u; - - private static final File f = new File("/a/b/c/x.pdf"); - - @Before - public void setUp() throws Exception{ - TikaConfig config = TikaConfig.getDefaultConfig(); - repo = config.getMimeRepository(); - tika = new Tika(config); - u = new URL("http://mydomain.com/x.pdf?x=y"); - } - - @Test - public void testCaseSensitivity() { - String type = tika.detect("test.PDF"); - assertNotNull(type); - assertEquals(type, tika.detect("test.pdf")); - assertEquals(type, tika.detect("test.PdF")); - assertEquals(type, tika.detect("test.pdF")); - } - - @Test - public void testLoadMimeTypes() throws MimeTypeException { - assertNotNull(repo.forName("application/octet-stream")); - assertNotNull(repo.forName("text/x-tex")); - } - - /** - * Tests MIME type determination based solely on the URL's extension. 
- */ - @Test - public void testGuessMimeTypes() throws Exception { - assertTypeByName("application/pdf", "x.pdf"); - assertEquals("application/pdf", tika.detect(u.toExternalForm())); - assertEquals("application/pdf", tika.detect(f.getPath())); - assertTypeByName("text/plain", "x.txt"); - assertTypeByName("text/html", "x.htm"); - assertTypeByName("text/html", "x.html"); - assertTypeByName("application/xhtml+xml", "x.xhtml"); - assertTypeByName("application/xml", "x.xml"); - assertTypeByName("application/zip", "x.zip"); - assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt"); - assertTypeByName("application/octet-stream", "x.unknown"); - - // Test for the MS Office media types and file extensions listed in - // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx - assertTypeByName("application/msword", "x.doc"); - assertTypeByName("application/msword", "x.dot"); - assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx"); - assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx"); - assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm"); - assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm"); - assertTypeByName("application/vnd.ms-excel", "x.xls"); - assertTypeByName("application/vnd.ms-excel", "x.xlt"); - assertTypeByName("application/vnd.ms-excel", "x.xla"); - assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx"); - assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx"); - assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm"); - assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm"); - assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam"); - assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb"); - 
assertTypeByName("application/vnd.ms-powerpoint", "x.ppt"); - assertTypeByName("application/vnd.ms-powerpoint", "x.pot"); - assertTypeByName("application/vnd.ms-powerpoint", "x.pps"); - assertTypeByName("application/vnd.ms-powerpoint", "x.ppa"); - assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx"); - assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx"); - assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx"); - assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam"); - assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm"); - assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm"); - assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm"); - } - - /** - * Note - detecting container formats by mime magic is very very - * iffy, as we can't be sure where things will end up. - * People really ought to use the container aware detection... - */ - @Test - public void testOLE2Detection() throws Exception { - // These have the properties block near the start, so our mime - // magic will spot them - assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls"); - - // This one quite legitimately doesn't have its properties block - // as one of the first couple of entries - // As such, our mime magic can't figure it out... 
- assertTypeByData("application/x-tika-msoffice", "testWORD.doc"); - assertTypeByData("application/x-tika-msoffice", "testPPT.ppt"); - - - // By name + data: - - // Those we got right to start with are fine - assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls"); - - // And the name lets us specialise the generic OOXML - // ones to their actual type - assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt"); - assertTypeByNameAndData("application/msword", "testWORD.doc"); - } - - /** - * Files generated by Works 7.0 Spreadsheet application use the OLE2 - * structure and resemble Excel files (they contain a "Workbook"). They are - * not Excel though. They are distinguished from Excel files with an - * additional top-level entry in below the root of the POI filesystem. - * - * @throws Exception - */ - @Test - public void testWorksSpreadsheetDetection() throws Exception { - assertTypeDetection("testWORKSSpreadsheet7.0.xlr", - // with name-only, everything should be all right - "application/x-tika-msworks-spreadsheet", - // this is possible due to MimeTypes guessing the type - // based on the WksSSWorkBook near the beginning of the - // file - "application/x-tika-msworks-spreadsheet", - // this is right, the magic-based detection works, there is - // no need for the name-based detection to refine it - "application/x-tika-msworks-spreadsheet"); - } - - @Test - public void testStarOfficeDetection() throws Exception { - assertTypeDetection("testVORCalcTemplate.vor", - "application/x-staroffice-template", - "application/vnd.stardivision.calc", - "application/vnd.stardivision.calc"); - assertTypeDetection("testVORDrawTemplate.vor", - "application/x-staroffice-template", - "application/vnd.stardivision.draw", - "application/vnd.stardivision.draw"); - assertTypeDetection("testVORImpressTemplate.vor", - "application/x-staroffice-template", - "application/vnd.stardivision.impress", - "application/vnd.stardivision.impress"); - 
assertTypeDetection("testVORWriterTemplate.vor", - "application/x-staroffice-template", - "application/vnd.stardivision.writer", - "application/vnd.stardivision.writer"); - - assertTypeDetection("testStarOffice-5.2-calc.sdc", - "application/vnd.stardivision.calc", - "application/vnd.stardivision.calc", - "application/vnd.stardivision.calc"); - assertTypeDetection("testStarOffice-5.2-draw.sda", - "application/vnd.stardivision.draw", - "application/vnd.stardivision.draw", - "application/vnd.stardivision.draw"); - assertTypeDetection("testStarOffice-5.2-impress.sdd", - "application/vnd.stardivision.impress", - "application/vnd.stardivision.impress", - "application/vnd.stardivision.impress"); - assertTypeDetection("testStarOffice-5.2-writer.sdw", - "application/vnd.stardivision.writer", - "application/vnd.stardivision.writer", - "application/vnd.stardivision.writer"); - } - - /** - * Files generated by Works Word Processor versions 3.0 and 4.0 use the - * OLE2 structure. They don't resemble Word though. - * - * @throws Exception - */ - @Test - public void testOldWorksWordProcessorDetection() throws Exception { - assertTypeDetection( - "testWORKSWordProcessor3.0.wps", - // .wps is just like any other works extension - "application/vnd.ms-works", - // this is due to MatOST substring - "application/vnd.ms-works", - // magic-based detection works, no need to refine it - "application/vnd.ms-works"); - - // files in version 4.0 are no different from those in version 3.0 - assertTypeDetection( - "testWORKSWordProcessor4.0.wps", - "application/vnd.ms-works", - "application/vnd.ms-works", - "application/vnd.ms-works"); - } - - /** - * Files from Excel 2 through 4 are based on the BIFF record - * structure, but without a wrapping OLE2 structure. 
- * Excel 5 and Excel 95+ work on OLE2 - */ - @Test - public void testOldExcel() throws Exception { - // With just a name, we'll think everything's a new Excel file - assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls"); - assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls"); - assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls"); - - // With data, we can work out if it's old or new style - assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls"); - assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls"); - assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls"); - - assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls"); - assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls"); - assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls"); - } - - /** - * Note - detecting container formats by mime magic is very very - * iffy, as we can't be sure where things will end up. - * People really ought to use the container aware detection... - */ - @Test - public void testOoxmlDetection() throws Exception { - // These two do luckily have [Content_Types].xml near the start, - // so our mime magic will spot them - assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx"); - assertTypeByData("application/x-tika-ooxml", "testPPT.pptx"); - - // This one quite legitimately doesn't have its [Content_Types].xml - // file as one of the first couple of entries - // As such, our mime magic can't figure it out... 
- assertTypeByData("application/zip", "testWORD.docx"); - - // If we give the filename as well as the data, we can - // specialise the ooxml generic one to the correct type - assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx"); - assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx"); - assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx"); - - // Test a few of the less usual ones - assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb"); - assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm"); - assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm"); - assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm"); - } - - /** - * Note - container based formats, needs container detection - * to be properly correct - */ - @Test - public void testVisioDetection() throws Exception { - // By Name, should get it right - assertTypeByName("application/vnd.visio", "testVISIO.vsd"); - assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm"); - assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx"); - assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm"); - assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx"); - assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm"); - assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx"); - - // By Name and Data, should get it right - assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd"); - assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm"); - 
assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx"); - assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm"); - assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx"); - assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm"); - assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx"); - - // By Data only, will get the container parent - assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm"); - assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx"); - } - - /** - * Note - detecting container formats by mime magic is very very - * iffy, as we can't be sure where things will end up. - * People really ought to use the container aware detection... 
- */ - @Test - public void testIWorkDetection() throws Exception { - // By name is easy - assertTypeByName("application/vnd.apple.keynote", "testKeynote.key"); - assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers"); - assertTypeByName("application/vnd.apple.pages", "testPages.pages"); - - // We can't do it by data, as we'd need to unpack - // the zip file to check the XML - assertTypeByData("application/zip", "testKeynote.key"); - - assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key"); - assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers"); - assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages"); - } - - @Test - public void testArchiveDetection() throws Exception { - assertTypeByName("application/x-archive", "test.ar"); - assertTypeByName("application/zip", "test.zip"); - assertTypeByName("application/x-tar", "test.tar"); - assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it - assertTypeByName("application/x-cpio", "test.cpio"); - - // TODO Add an example .deb and .udeb, then check these - - // Check the mime magic patterns for them work too - assertTypeByData("application/x-archive", "testARofText.ar"); - assertTypeByData("application/x-archive", "testARofSND.ar"); - assertTypeByData("application/zip", "test-documents.zip"); - assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR - assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it - assertTypeByData("application/x-cpio", "test-documents.cpio"); - - // For spanned zip files, the .zip file doesn't have the header, it's the other parts - assertTypeByData("application/octet-stream", "test-documents-spanned.zip"); - assertTypeByData("application/zip", "test-documents-spanned.z01"); - } - - @Test - public void testFeedsDetection() throws Exception { - assertType("application/rss+xml", "rsstest.rss"); - 
assertType("application/atom+xml", "testATOM.atom"); - assertTypeByData("application/rss+xml", "rsstest.rss"); - assertTypeByName("application/rss+xml", "rsstest.rss"); - assertTypeByData("application/atom+xml", "testATOM.atom"); - assertTypeByName("application/atom+xml", "testATOM.atom"); - } - - @Test - public void testFitsDetection() throws Exception { - // FITS image created using imagemagick convert of testJPEG.jpg - assertType("application/fits", "testFITS.fits"); - assertTypeByData("application/fits", "testFITS.fits"); - assertTypeByName("application/fits", "testFITS.fits"); - } - - @Test - public void testJpegDetection() throws Exception { - assertType("image/jpeg", "testJPEG.jpg"); - assertTypeByData("image/jpeg", "testJPEG.jpg"); - assertTypeByName("image/jpeg", "x.jpg"); - assertTypeByName("image/jpeg", "x.JPG"); - assertTypeByName("image/jpeg", "x.jpeg"); - assertTypeByName("image/jpeg", "x.JPEG"); - assertTypeByName("image/jpeg", "x.jpe"); - assertTypeByName("image/jpeg", "x.jif"); - assertTypeByName("image/jpeg", "x.jfif"); - assertTypeByName("image/jpeg", "x.jfi"); - - assertType("image/jp2", "testJPEG.jp2"); - assertTypeByData("image/jp2", "testJPEG.jp2"); - assertTypeByName("image/jp2", "x.jp2"); - } - - @Test - public void testBpgDetection() throws Exception { - assertType("image/x-bpg", "testBPG.bpg"); - assertTypeByData("image/x-bpg", "testBPG.bpg"); - assertTypeByData("image/x-bpg", "testBPG_commented.bpg"); - assertTypeByName("image/x-bpg", "x.bpg"); - } - - @Test - public void testTiffDetection() throws Exception { - assertType("image/tiff", "testTIFF.tif"); - assertTypeByData("image/tiff", "testTIFF.tif"); - assertTypeByName("image/tiff", "x.tiff"); - assertTypeByName("image/tiff", "x.tif"); - assertTypeByName("image/tiff", "x.TIF"); - } - - @Test - public void testGifDetection() throws Exception { - assertType("image/gif", "testGIF.gif"); - assertTypeByData("image/gif", "testGIF.gif"); - assertTypeByName("image/gif", "x.gif"); - 
assertTypeByName("image/gif", "x.GIF"); - } - - @Test - public void testPngDetection() throws Exception { - assertType("image/png", "testPNG.png"); - assertTypeByData("image/png", "testPNG.png"); - assertTypeByName("image/png", "x.png"); - assertTypeByName("image/png", "x.PNG"); - } - - @Test - public void testWEBPDetection() throws Exception { - assertType("image/webp", "testWEBP.webp"); - assertTypeByData("image/webp", "testWEBP.webp"); - assertTypeByName("image/webp", "x.webp"); - assertTypeByName("image/webp", "x.WEBP"); - } - - @Test - public void testBmpDetection() throws Exception { - assertType("image/x-ms-bmp", "testBMP.bmp"); - assertTypeByData("image/x-ms-bmp", "testBMP.bmp"); - assertTypeByName("image/x-ms-bmp", "x.bmp"); - assertTypeByName("image/x-ms-bmp", "x.BMP"); - assertTypeByName("image/x-ms-bmp", "x.dib"); - assertTypeByName("image/x-ms-bmp", "x.DIB"); - //false positive check -- contains part of BMP signature - assertType("text/plain", "testBMPfp.txt"); - } - - @Test - public void testPnmDetection() throws Exception { - assertType("image/x-portable-bitmap", "testPBM.pbm"); - assertType("image/x-portable-graymap", "testPGM.pgm"); - assertType("image/x-portable-pixmap", "testPPM.ppm"); - assertTypeByData("image/x-portable-bitmap", "testPBM.pbm"); - assertTypeByData("image/x-portable-graymap", "testPGM.pgm"); - assertTypeByData("image/x-portable-pixmap", "testPPM.ppm"); - assertTypeByName("image/x-portable-anymap", "x.pnm"); - assertTypeByName("image/x-portable-anymap", "x.PNM"); - assertTypeByName("image/x-portable-bitmap", "x.pbm"); - assertTypeByName("image/x-portable-bitmap", "x.PBM"); - assertTypeByName("image/x-portable-graymap", "x.pgm"); - assertTypeByName("image/x-portable-graymap", "x.PGM"); - assertTypeByName("image/x-portable-pixmap", "x.ppm"); - assertTypeByName("image/x-portable-pixmap", "x.PPM"); - } - - @Test - public void testPictDetection() throws Exception { - assertType("image/x-pict", "testPICT.pct"); - 
assertTypeByData("image/x-pict", "testPICT.pct"); - assertTypeByName("image/x-pict", "x.pic"); - assertTypeByName("image/x-pict", "x.PCT"); - } - - @Test - public void testCgmDetection() throws Exception { - // TODO: Need a test image file - assertTypeByName("image/cgm", "x.cgm"); - assertTypeByName("image/cgm", "x.CGM"); - } - - @Test - public void testRdfXmlDetection() throws Exception { - assertTypeByName("application/rdf+xml", "x.rdf"); - assertTypeByName("application/rdf+xml", "x.owl"); - } - - @Test - public void testSvgDetection() throws Exception { - assertType("image/svg+xml", "testSVG.svg"); - assertTypeByData("image/svg+xml", "testSVG.svg"); - assertTypeByName("image/svg+xml", "x.svg"); - assertTypeByName("image/svg+xml", "x.SVG"); - - // Should *.svgz be svg or gzip - assertType("application/gzip", "testSVG.svgz"); - assertTypeByData("application/gzip", "testSVG.svgz"); - assertTypeByName("image/svg+xml", "x.svgz"); - assertTypeByName("image/svg+xml", "x.SVGZ"); - } - - @Test - public void testPdfDetection() throws Exception { - // PDF extension by name is enough - assertTypeByName("application/pdf", "x.pdf"); - assertTypeByName("application/pdf", "x.PDF"); - - // For normal PDFs, can get by name or data or both - assertType("application/pdf", "testPDF.pdf"); - assertTypeByData("application/pdf", "testPDF.pdf"); - - // PDF with a BoM works both ways too - assertType("application/pdf", "testPDF_bom.pdf"); - assertTypeByData("application/pdf", "testPDF_bom.pdf"); - } - - @Test - public void testSwfDetection() throws Exception { - assertTypeByName("application/x-shockwave-flash", "x.swf"); - assertTypeByName("application/x-shockwave-flash", "x.SWF"); - assertTypeByName("application/x-shockwave-flash", "test1.swf"); - assertTypeByName("application/x-shockwave-flash", "test2.swf"); - assertTypeByName("application/x-shockwave-flash", "test3.swf"); - } - - @Test - public void testDwgDetection() throws Exception { - assertTypeByName("image/vnd.dwg", "x.dwg"); - 
assertTypeByData("image/vnd.dwg", "testDWG2004.dwg"); - assertTypeByData("image/vnd.dwg", "testDWG2007.dwg"); - assertTypeByData("image/vnd.dwg", "testDWG2010.dwg"); - } - - @Test - public void testprtDetection() throws Exception { - assertTypeByName("application/x-prt", "x.prt"); - assertTypeByData("application/x-prt", "testCADKEY.prt"); - } - - /** - * Formats which are based on plain text - */ - @Test - public void testTextBasedFormatsDetection() throws Exception { - assertTypeByName("text/plain", "testTXT.txt"); - assertType( "text/plain", "testTXT.txt"); - - assertTypeByName("text/css", "testCSS.css"); - assertType( "text/css", "testCSS.css"); - - assertTypeByName("text/csv", "testCSV.csv"); - assertType( "text/csv", "testCSV.csv"); - - assertTypeByName("text/html", "testHTML.html"); - assertType( "text/html", "testHTML.html"); - - assertTypeByName("application/javascript", "testJS.js"); - assertType( "application/javascript", "testJS.js"); - } - - @Test - public void testJavaDetection() throws Exception { - // TODO Classloader doesn't seem to find the .class file in test-documents - //assertTypeDetection("AutoDetectParser.class", "application/java-vm"); - - // OSX Native Extension - assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib"); - } - - @Test - public void testXmlAndHtmlDetection() throws Exception { - assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>" - .getBytes(UTF_8)); - assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>" - .getBytes(UTF_16LE)); - assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>" - .getBytes(UTF_16BE)); - assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>" - .getBytes(UTF_8)); - assertTypeByData("text/html", "<html><body>HTML</body></html>" - .getBytes(UTF_8)); - 
assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>" - .getBytes(UTF_8)); - } - - @Test - public void testWmfDetection() throws Exception { - assertTypeByName("application/x-msmetafile", "x.wmf"); - assertTypeByData("application/x-msmetafile", "testWMF.wmf"); - assertTypeByName("application/x-msmetafile", "x.WMF"); - - assertTypeByName("application/x-emf", "x.emf"); - assertTypeByData("application/x-emf","testEMF.emf"); - assertTypeByName("application/x-emf", "x.EMF"); - // TODO: Need a test wmz file - assertTypeByName("application/x-ms-wmz", "x.wmz"); - assertTypeByName("application/x-ms-wmz", "x.WMZ"); - // TODO: Need a test emz file - assertTypeByName("application/gzip", "x.emz"); - assertTypeByName("application/gzip", "x.EMZ"); - } - - @Test - public void testPsDetection() throws Exception { - // TODO: Need a test postscript file - assertTypeByName("application/postscript", "x.ps"); - assertTypeByName("application/postscript", "x.PS"); - assertTypeByName("application/postscript", "x.eps"); - assertTypeByName("application/postscript", "x.epsf"); - assertTypeByName("application/postscript", "x.epsi"); - } - - @Test - public void testMicrosoftMultiMediaDetection() throws Exception { - assertTypeByName("video/x-ms-asf", "x.asf"); - assertTypeByName("video/x-ms-wmv", "x.wmv"); - assertTypeByName("audio/x-ms-wma", "x.wma"); - - assertTypeByData("video/x-ms-asf", "testASF.asf"); - assertTypeByData("video/x-ms-wmv", "testWMV.wmv"); - assertTypeByData("audio/x-ms-wma", "testWMA.wma"); - } - - /** - * All 3 DITA types are in theory handled by the same mimetype, - * but we specialise them - */ - @Test - public void testDITADetection() throws Exception { - assertTypeByName("application/dita+xml; format=topic", "test.dita"); - assertTypeByName("application/dita+xml; format=map", "test.ditamap"); - assertTypeByName("application/dita+xml; format=val", "test.ditaval"); - - assertTypeByData("application/dita+xml; format=task", "testDITA.dita"); - 
assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita"); - assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap"); - - assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita"); - assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita"); - assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap"); - - // These are all children of the official type - assertEquals("application/dita+xml", - repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString()); - assertEquals("application/dita+xml", - repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString()); - // Concept inherits from topic - assertEquals("application/dita+xml; format=topic", - repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString()); - } - - /** - * @since TIKA-194 - */ - @Test - public void testJavaRegex() throws Exception{ - MimeType testType = new MimeType(MediaType.parse("foo/bar")); - this.repo.add(testType); - assertNotNull(repo.forName("foo/bar")); - String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}"; - this.repo.addPattern(testType, pattern, true); - String testFileName = "rtg_sst_grb_0.5.12345678"; - assertEquals("foo/bar", tika.detect(testFileName)); - - MimeType testType2 = new MimeType(MediaType.parse("foo/bar2")); - this.repo.add(testType2); - assertNotNull(repo.forName("foo/bar2")); - this.repo.addPattern(testType2, pattern, false); - assertNotSame("foo/bar2", tika.detect(testFileName)); - } - - @Test - public void testRawDetection() throws Exception { - assertTypeByName("image/x-raw-adobe", "x.dng"); - assertTypeByName("image/x-raw-adobe", "x.DNG"); - assertTypeByName("image/x-raw-hasselblad", "x.3fr"); - assertTypeByName("image/x-raw-fuji", "x.raf"); - assertTypeByName("image/x-raw-canon", "x.crw"); - assertTypeByName("image/x-raw-canon", "x.cr2"); - assertTypeByName("image/x-raw-kodak", 
"x.k25"); - assertTypeByName("image/x-raw-kodak", "x.kdc"); - assertTypeByName("image/x-raw-kodak", "x.dcs"); - assertTypeByName("image/x-raw-kodak", "x.drf"); - assertTypeByName("image/x-raw-minolta", "x.mrw"); - assertTypeByName("image/x-raw-nikon", "x.nef"); - assertTypeByName("image/x-raw-nikon", "x.nrw"); - assertTypeByName("image/x-raw-olympus", "x.orf"); - assertTypeByName("image/x-raw-pentax", "x.ptx"); - assertTypeByName("image/x-raw-pentax", "x.pef"); - assertTypeByName("image/x-raw-sony", "x.arw"); - assertTypeByName("image/x-raw-sony", "x.srf"); - assertTypeByName("image/x-raw-sony", "x.sr2"); - assertTypeByName("image/x-raw-sigma", "x.x3f"); - assertTypeByName("image/x-raw-epson", "x.erf"); - assertTypeByName("image/x-raw-mamiya", "x.mef"); - assertTypeByName("image/x-raw-leaf", "x.mos"); - assertTypeByName("image/x-raw-panasonic", "x.raw"); - assertTypeByName("image/x-raw-panasonic", "x.rw2"); - assertTypeByName("image/x-raw-phaseone", "x.iiq"); - assertTypeByName("image/x-raw-red", "x.r3d"); - assertTypeByName("image/x-raw-imacon", "x.fff"); - assertTypeByName("image/x-raw-logitech", "x.pxn"); - assertTypeByName("image/x-raw-casio", "x.bay"); - assertTypeByName("image/x-raw-rawzor", "x.rwz"); - } - - /** - * Tests that we correctly detect the font types - */ - @Test - public void testFontDetection() throws Exception { - assertTypeByName("application/x-font-adobe-metric", "x.afm"); - assertTypeByData("application/x-font-adobe-metric", "testAFM.afm"); - - assertTypeByName("application/x-font-printer-metric", "x.pfm"); - // TODO Get a sample .pfm file - assertTypeByData( - "application/x-font-printer-metric", - new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f, - 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20} - ); - - assertTypeByName("application/x-font-type1", "x.pfa"); - // TODO Get a sample .pfa file - assertTypeByData( - "application/x-font-type1", - new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, - 0x62, 0x65, 0x46, 0x6f, 
0x6e, 0x74, 0x2d, 0x31, - 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20} - ); - - assertTypeByName("application/x-font-type1", "x.pfb"); - // TODO Get a sample .pfm file - assertTypeByData( - "application/x-font-type1", - new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21, - 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65, - 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 } - ); - } - - /** - * Tests MimeTypes.getMimeType(URL), which examines both the byte header - * and, if necessary, the URL's extension. - */ - @Test - public void testMimeDeterminationForTestDocuments() throws Exception { - assertType("text/html", "testHTML.html"); - assertType("application/zip", "test-documents.zip"); - - assertType("text/html", "testHTML_utf8.html"); - assertType( - "application/vnd.oasis.opendocument.text", - "testOpenOffice2.odt"); - assertType("application/pdf", "testPDF.pdf"); - assertType("application/rtf", "testRTF.rtf"); - assertType("text/plain", "testTXT.txt"); - assertType("application/xml", "testXML.xml"); - assertType("audio/basic", "testAU.au"); - assertType("audio/x-aiff", "testAIFF.aif"); - assertType("audio/x-wav", "testWAV.wav"); - assertType("audio/midi", "testMID.mid"); - assertType("application/x-msaccess", "testACCESS.mdb"); - assertType("application/x-font-ttf", "testTrueType3.ttf"); - } - - @Test - public void test7ZipDetection() throws Exception { - assertTypeByName("application/x-7z-compressed","test-documents.7z"); - assertTypeByData("application/x-7z-compressed","test-documents.7z"); - assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z"); - } - - @Test - public void testWebArchiveDetection() throws Exception { - assertTypeByName("application/x-webarchive","x.webarchive"); - assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive"); - assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive"); - } - - /** - * KML, and KMZ (zipped KML) - */ - @Test - public void testKMLZDetection() throws 
Exception { - assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml"); - assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml"); - assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml"); - - assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz"); - assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz"); - - // By data only, mimetype magic only gets us to a .zip - // We need to use the Zip Aware detector to get the full type - assertTypeByData("application/zip","testKMZ.kmz"); - } - - @Test - public void testCreativeSuite() throws IOException { - assertTypeDetection("testINDD.indd", "application/x-adobe-indesign"); - assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop"); - } - - @Test - public void testAMR() throws IOException { - // AMR matches on name, data or both - assertTypeDetection("testAMR.amr", "audio/amr"); - - // AMR-WB subtype shares extension, so needs data to identify - assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb"); - - // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet - //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+"); - } - - @Test - public void testEmail() throws IOException { - // EMLX - assertTypeDetection("testEMLX.emlx", "message/x-emlx"); - - // Groupwise - assertTypeDetection("testGroupWiseEml.eml", "message/rfc822"); - - // Lotus - assertTypeDetection("testLotusEml.eml", "message/rfc822"); - - // Thunderbird - doesn't currently work by name - assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml"); - } - - @Test - public void testAxCrypt() throws Exception { - // test-TXT.txt encrypted with a key of "tika" - assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt"); - } - - @Test - public void testWindowsEXE() throws Exception { - assertTypeByName("application/x-msdownload", "x.dll"); - 
assertTypeByName("application/x-ms-installer", "x.msi"); - assertTypeByName("application/x-dosexec", "x.exe"); - - assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe"); - assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe"); - - // A jar file with part of a PE header, but not a full one - // should still be detected as a zip or jar (without/with name) - assertTypeByData("application/zip", "testJAR_with_PEHDR.jar"); - assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar"); - } - - @Test - public void testMatroskaDetection() throws Exception { - assertType("video/x-matroska", "testMKV.mkv"); - // TODO: Need custom detector data detection, see TIKA-1180 - assertTypeByData("application/x-matroska", "testMKV.mkv"); - assertTypeByNameAndData("video/x-matroska", "testMKV.mkv"); - assertTypeByName("video/x-matroska", "x.mkv"); - assertTypeByName("video/x-matroska", "x.MKV"); - assertTypeByName("audio/x-matroska", "x.mka"); - assertTypeByName("audio/x-matroska", "x.MKA"); - } - - @Test - public void testWebMDetection() throws Exception { - assertType("video/webm", "testWEBM.webm"); - // TODO: Need custom detector data detection, see TIKA-1180 - assertTypeByData("application/x-matroska", "testWEBM.webm"); - assertTypeByNameAndData("video/webm", "testWEBM.webm"); - assertTypeByName("video/webm", "x.webm"); - assertTypeByName("video/webm", "x.WEBM"); - } - - /** Test getMimeType(byte[]) */ - @Test - public void testGetMimeType_byteArray() throws IOException { - // Plain text detection - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); - assertText(new byte[] { 'a', 'b', 'c' }); - assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); - assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); - } - - @Test - public void testBerkeleyDB() throws IOException { - assertTypeByData( 
- "application/x-berkeley-db; format=btree; version=2", - "testBDB_btree_2.db"); - assertTypeByData( - "application/x-berkeley-db; format=btree; version=3", - "testBDB_btree_3.db"); - assertTypeByData( - "application/x-berkeley-db; format=btree; version=4", - "testBDB_btree_4.db"); - // V4 and V5 share the same btree format - assertTypeByData( - "application/x-berkeley-db; format=btree; version=4", - "testBDB_btree_5.db"); - - assertTypeByData( - "application/x-berkeley-db; format=hash; version=2", - "testBDB_hash_2.db"); - assertTypeByData( - "application/x-berkeley-db; format=hash; version=3", - "testBDB_hash_3.db"); - assertTypeByData( - "application/x-berkeley-db; format=hash; version=4", - "testBDB_hash_4.db"); - assertTypeByData( - "application/x-berkeley-db; format=hash; version=5", - "testBDB_hash_5.db"); - } - - /** - * CBOR typically contains HTML - */ - @Test - public void testCBOR() throws IOException { - assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor"); - assertTypeByData("application/cbor", "NUTCH-1997.cbor"); - } - - @Test - public void testZLIB() throws IOException { - // ZLIB encoded versions of testTXT.txt - assertTypeByData("application/zlib", "testTXT.zlib"); - assertTypeByData("application/zlib", "testTXT.zlib0"); - assertTypeByData("application/zlib", "testTXT.zlib5"); - assertTypeByData("application/zlib", "testTXT.zlib9"); - } - - @Test - public void testTextFormats() throws Exception { - assertType("application/x-bibtex-text-file", "testBIBTEX.bib"); - assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib"); - } - - @Test - public void testCodeFormats() throws Exception { - assertType("text/x-csrc", "testC.c"); - assertType("text/x-chdr", "testH.h"); - assertTypeByData("text/x-csrc", "testC.c"); - assertTypeByData("text/x-chdr", "testH.h"); - - assertTypeByName("text/x-java-source", "testJAVA.java"); - assertType("text/x-java-properties", "testJAVAPROPS.properties"); - - assertType("text/x-matlab", 
"testMATLAB.m"); - assertType("text/x-matlab", "testMATLAB_wtsgaus.m"); - assertType("text/x-matlab", "testMATLAB_barcast.m"); - assertTypeByData("text/x-matlab", "testMATLAB.m"); - assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m"); - assertTypeByData("text/x-matlab", "testMATLAB_barcast.m"); - } - - @Test - public void testWebVTT() throws Exception { - assertType("text/vtt", "testWebVTT.vtt"); - assertTypeByData("text/vtt", "testWebVTT.vtt"); - } - - private void assertText(byte[] prefix) throws IOException { - assertMagic("text/plain", prefix); - } - - private void assertNotText(byte[] prefix) throws IOException { - assertMagic("application/octet-stream", prefix); - } - - private void assertMagic(String expected, byte[] prefix) throws IOException { - MediaType type = - repo.detect(new ByteArrayInputStream(prefix), new Metadata()); - assertNotNull(type); - assertEquals(expected, type.toString()); - } - - private void assertType(String expected, String filename) throws Exception { - try (InputStream stream = TestMimeTypes.class.getResourceAsStream( - "/test-documents/" + filename)) { - assertNotNull("Test file not found: " + filename, stream); - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, filename); - assertEquals(expected, repo.detect(stream, metadata).toString()); - } - } - - private void assertTypeByName(String expected, String filename) - throws IOException { - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, filename); - assertEquals(expected, repo.detect(null, metadata).toString()); - } - - private void assertTypeByData(String expected, String filename) - throws IOException { - try (InputStream stream = TestMimeTypes.class.getResourceAsStream( - "/test-documents/" + filename)) { - assertNotNull("Test file not found: " + filename, stream); - Metadata metadata = new Metadata(); - assertEquals(expected, repo.detect(stream, metadata).toString()); - } - } - - private void 
assertTypeByData(String expected, byte[] data) - throws IOException { - try (InputStream stream = new ByteArrayInputStream(data)) { - Metadata metadata = new Metadata(); - assertEquals(expected, repo.detect(stream, metadata).toString()); - } - } - - private void assertTypeDetection(String filename, String type) - throws IOException { - assertTypeDetection(filename, type, type, type); - } - - private void assertTypeDetection(String filename, String byName, String byData, - String byNameAndData) throws IOException { - assertTypeByName(byName, filename); - assertTypeByData(byData, filename); - assertTypeByNameAndData(byNameAndData, filename); - } - - private void assertTypeByNameAndData(String expected, String filename) - throws IOException { - assertEquals(expected, getTypeByNameAndData(filename).toString()); - } - - private MediaType getTypeByNameAndData(String filename) throws IOException { - try (InputStream stream = TestMimeTypes.class.getResourceAsStream( - "/test-documents/" + filename)) { - assertNotNull("Test document not found: " + filename, stream); - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, filename); - return repo.detect(stream, metadata); - } - } -}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java deleted file mode 100644 index 91b054e..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.parser; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.HashSet; -import java.util.Set; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; - -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.Detector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.XMPDM; -import org.apache.tika.mime.MediaType; -import org.apache.tika.sax.BodyContentHandler; -import org.gagravarr.tika.FlacParser; -import org.gagravarr.tika.OpusParser; -import org.gagravarr.tika.VorbisParser; -import org.junit.Test; -import org.xml.sax.ContentHandler; - -public class AutoDetectParserTest { - private TikaConfig tika = TikaConfig.getDefaultConfig(); - - // Easy to read constants for the MIME types: - private static final String RAW = "application/octet-stream"; - private static final String EXCEL = "application/vnd.ms-excel"; - private static final String HTML = "text/html; charset=ISO-8859-1"; - private static final String PDF = "application/pdf"; - private static final String POWERPOINT = "application/vnd.ms-powerpoint"; - private static final String KEYNOTE = "application/vnd.apple.keynote"; - private static final String PAGES = "application/vnd.apple.pages"; - private static final String NUMBERS = "application/vnd.apple.numbers"; - private static final String CHM = "application/vnd.ms-htmlhelp"; - private static final String RTF = "application/rtf"; - private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1"; - private static final String UTF8TEXT = 
"text/plain; charset=UTF-8"; - private static final String WORD = "application/msword"; - private static final String XML = "application/xml"; - private static final String RSS = "application/rss+xml"; - private static final String BMP = "image/x-ms-bmp"; - private static final String GIF = "image/gif"; - private static final String JPEG = "image/jpeg"; - private static final String PNG = "image/png"; - private static final String OGG_VORBIS = "audio/vorbis"; - private static final String OGG_OPUS = "audio/opus"; - private static final String OGG_FLAC = "audio/x-oggflac"; - private static final String FLAC_NATIVE= "audio/x-flac"; - private static final String OPENOFFICE - = "application/vnd.oasis.opendocument.text"; - - - /** - * This is where a single test is done. - * @param tp the parameters encapsulated in a TestParams instance - * @throws IOException - */ - private void assertAutoDetect(TestParams tp) throws Exception { - try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) { - if (input == null) { - fail("Could not open stream from specified resource: " - + tp.resourceRealName); - } - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName); - metadata.set(Metadata.CONTENT_TYPE, tp.statedType); - ContentHandler handler = new BodyContentHandler(); - new AutoDetectParser(tika).parse(input, handler, metadata); - - assertEquals("Bad content type: " + tp, - tp.realType, metadata.get(Metadata.CONTENT_TYPE)); - - if (tp.expectedContentFragment != null) { - assertTrue("Expected content not found: " + tp, - handler.toString().contains(tp.expectedContentFragment)); - } - } - } - - /** - * Convenience method -- its sole purpose of existence is to make the - * call to it more readable than it would be if a TestParams instance - * would need to be instantiated there. 
- * - * @param resourceRealName real name of resource - * @param resourceStatedName stated name -- will a bad name fool us? - * @param realType - the real MIME type - * @param statedType - stated MIME type - will a wrong one fool us? - * @param expectedContentFragment - something expected in the text - * @throws Exception - */ - private void assertAutoDetect(String resourceRealName, - String resourceStatedName, - String realType, - String statedType, - String expectedContentFragment) - throws Exception { - - assertAutoDetect(new TestParams(resourceRealName, resourceStatedName, - realType, statedType, expectedContentFragment)); - } - - private void assertAutoDetect( - String resource, String type, String content) throws Exception { - - resource = "/test-documents/" + resource; - - // TODO !!!! The disabled tests below should work! - // The correct MIME type should be determined regardless of the - // stated type (ContentType hint) and the stated URL name. - - - // Try different combinations of correct and incorrect arguments: - final String wrongMimeType = RAW; - assertAutoDetect(resource, resource, type, type, content); - assertAutoDetect(resource, resource, type, null, content); - assertAutoDetect(resource, resource, type, wrongMimeType, content); - - assertAutoDetect(resource, null, type, type, content); - assertAutoDetect(resource, null, type, null, content); - assertAutoDetect(resource, null, type, wrongMimeType, content); - - final String badResource = "a.xyz"; - assertAutoDetect(resource, badResource, type, type, content); - assertAutoDetect(resource, badResource, type, null, content); - assertAutoDetect(resource, badResource, type, wrongMimeType, content); - } - - @Test - public void testKeynote() throws Exception { - assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation"); - } - - @Test - public void testPages() throws Exception { - assertAutoDetect("testPages.pages", PAGES, "Sample pages document"); - } - - @Test - public void testNumbers() 
throws Exception { - assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668"); - } - - @Test - public void testChm() throws Exception { - assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used."); - } - - @Test - public void testEpub() throws Exception { - assertAutoDetect( - "testEPUB.epub", "application/epub+zip", - "The previous headings were subchapters"); - } - - @Test - public void testExcel() throws Exception { - assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet"); - } - - @Test - public void testHTML() throws Exception { - assertAutoDetect("testHTML.html", HTML, "Test Indexation Html"); - } - - @Test - public void testOpenOffice() throws Exception { - assertAutoDetect("testOpenOffice2.odt", OPENOFFICE, - "This is a sample Open Office document"); - } - - @Test - public void testPDF() throws Exception { - assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit"); - - } - - @Test - public void testPowerpoint() throws Exception { - assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide"); - } - - @Test - public void testRdfXml() throws Exception { - assertAutoDetect("testRDF.rdf", "application/rdf+xml", ""); - } - - @Test - public void testRTF() throws Exception { - assertAutoDetect("testRTF.rtf", RTF, "indexation Word"); - } - - @Test - public void testText() throws Exception { - assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt"); - } - - @Test - public void testTextNonASCIIUTF8() throws Exception { - assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog"); - } - - @Test - public void testWord() throws Exception { - assertAutoDetect("testWORD.doc", WORD, "Sample Word Document"); - } - - @Test - public void testXML() throws Exception { - assertAutoDetect("testXML.xml", XML, "Lius"); - } - - @Test - public void testRss() throws Exception { - 
assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test"); - } - - @Test - public void testImages() throws Exception { - assertAutoDetect("testBMP.bmp", BMP, null); - assertAutoDetect("testGIF.gif", GIF, null); - assertAutoDetect("testJPEG.jpg", JPEG, null); - assertAutoDetect("testPNG.png", PNG, null); - } - - /** - * Make sure that zip bomb attacks are prevented. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a> - */ - @Test - public void testZipBombPrevention() throws Exception { - try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream( - "/test-documents/TIKA-216.tgz")) { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(-1); - new AutoDetectParser(tika).parse(tgz, handler, metadata); - fail("Zip bomb was not detected"); - } catch (TikaException e) { - // expected - } - } - - /** - * Make sure XML parse errors don't trigger ZIP bomb detection. - * - * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a> - */ - @Test - public void testNoBombDetectedForInvalidXml() throws Exception { - // create zip with ten empty / invalid XML files, 1.xml .. 
10.xml - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ZipOutputStream zos = new ZipOutputStream(baos); - for (int i = 1; i <= 10; i++) { - zos.putNextEntry(new ZipEntry(i + ".xml")); - zos.closeEntry(); - } - zos.finish(); - zos.close(); - new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1), - new Metadata()); - } - - /** - * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc) - * have been correctly included, and are available - */ - @SuppressWarnings("deprecation") - @Test - public void testOggFlacAudio() throws Exception { - // The three test files should all have similar test data - String[] testFiles = new String[] { - "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", - "testOPUS.opus" - }; - MediaType[] mediaTypes = new MediaType[] { - MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), - MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) - }; - - // Check we can load the parsers, and they claim to do the right things - VorbisParser vParser = new VorbisParser(); - assertNotNull("Parser not found for " + mediaTypes[0], - vParser.getSupportedTypes(new ParseContext())); - - FlacParser fParser = new FlacParser(); - assertNotNull("Parser not found for " + mediaTypes[1], - fParser.getSupportedTypes(new ParseContext())); - assertNotNull("Parser not found for " + mediaTypes[2], - fParser.getSupportedTypes(new ParseContext())); - - OpusParser oParser = new OpusParser(); - assertNotNull("Parser not found for " + mediaTypes[3], - oParser.getSupportedTypes(new ParseContext())); - - // Check we found the parser - CompositeParser parser = (CompositeParser)tika.getParser(); - for (MediaType mt : mediaTypes) { - assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) ); - } - - // Have each file parsed, and check - for (int i=0; i<testFiles.length; i++) { - String file = testFiles[i]; - try (InputStream input = AutoDetectParserTest.class.getResourceAsStream( - 
"/test-documents/" + file)) { - if (input == null) { - fail("Could not find test file " + file); - } - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new AutoDetectParser(tika).parse(input, handler, metadata); - - assertEquals("Incorrect content type for " + file, - mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE)); - - // Check some of the common metadata - // Old style metadata - assertEquals("Test Artist", metadata.get(Metadata.AUTHOR)); - assertEquals("Test Title", metadata.get(Metadata.TITLE)); - // New style metadata - assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE)); - - // Check some of the XMPDM metadata - if (!file.endsWith(".opus")) { - assertEquals("Test Album", metadata.get(XMPDM.ALBUM)); - } - assertEquals("Test Artist", metadata.get(XMPDM.ARTIST)); - assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); - assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE)); - - // Check some of the text - String content = handler.toString(); - assertTrue(content.contains("Test Title")); - assertTrue(content.contains("Test Artist")); - } - } - } - - /** - * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit - * list of supported parsers. 
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a> - */ - @Test - public void testSpecificParserList() throws Exception { - AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser()); - - InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8)); - Metadata metadata = new Metadata(); - parser.parse(is, new BodyContentHandler(), metadata, new ParseContext()); - - assertEquals("value", metadata.get("MyParser")); - } - - private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser"); - - /** - * A test detector which always returns the type supported - * by the test parser - */ - @SuppressWarnings("serial") - private static class MyDetector implements Detector { - public MediaType detect(InputStream input, Metadata metadata) throws IOException { - return MY_MEDIA_TYPE; - } - } - - @SuppressWarnings("serial") - private static class MyParser extends AbstractParser { - public Set<MediaType> getSupportedTypes(ParseContext context) { - Set<MediaType> supportedTypes = new HashSet<MediaType>(); - supportedTypes.add(MY_MEDIA_TYPE); - return supportedTypes; - } - - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) { - metadata.add("MyParser", "value"); - } - - } - - /** - * Minimal class to encapsulate all parameters -- the main reason for - * its existence is to aid in debugging via its toString() method. - * - * Getters and setters intentionally not provided. 
- */ - private static class TestParams { - - public String resourceRealName; - public String resourceStatedName; - public String realType; - public String statedType; - public String expectedContentFragment; - - - private TestParams(String resourceRealName, - String resourceStatedName, - String realType, - String statedType, - String expectedContentFragment) { - this.resourceRealName = resourceRealName; - this.resourceStatedName = resourceStatedName; - this.realType = realType; - this.statedType = statedType; - this.expectedContentFragment = expectedContentFragment; - } - - - /** - * Produces a string like the following: - * - * <pre> - * Test parameters: - * resourceRealName = /test-documents/testEXCEL.xls - * resourceStatedName = null - * realType = application/vnd.ms-excel - * statedType = null - * expectedContentFragment = Sample Excel Worksheet - * </pre> - */ - public String toString() { - return "Test parameters:\n" - + " resourceRealName = " + resourceRealName + "\n" - + " resourceStatedName = " + resourceStatedName + "\n" - + " realType = " + realType + "\n" - + " statedType = " + statedType + "\n" - + " expectedContentFragment = " + expectedContentFragment + "\n"; - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java deleted file mode 100644 index 68edfc2..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; - -import java.io.InputStream; -import java.util.HashMap; -import java.util.Map; - -import org.apache.tika.TikaTest; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.utils.CommonsDigester; -import org.junit.Test; - - -public class DigestingParserTest extends TikaTest { - - private final static String P = TikaCoreProperties.TIKA_META_PREFIX+ - "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER; - - private final int UNLIMITED = 1000000;//well, not really, but longer than input file - private final Parser p = new AutoDetectParser(); - - @Test - public void testBasic() throws Exception { - Map<CommonsDigester.DigestAlgorithm, String> expected = - new HashMap<CommonsDigester.DigestAlgorithm, String>(); - - expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f"); - expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772"); - expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6"); - expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" + 
- "82bc53764a0f1430d134ae3b70c32654"); - expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+ - "8b8a6923fdf251ddab72c6e4b5d54160" + - "9db917ba4260d1767995a844d8d654df"); - expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+ - "da4c21f36b54d7acd06fcf68e974663b"+ - "fed1d256875be58d22beacf178154cc3"+ - "a1178cb73443deaa53aa0840324708bb"); - - //test each one - for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) { - Metadata m = new Metadata(); - XMLResult xml = getXML("test_recursive_embedded.docx", - new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m); - assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString())); - } - - - //test comma separated - CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512"); - Metadata m = new Metadata(); - XMLResult xml = getXML("test_recursive_embedded.docx", - new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m); - for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{ - CommonsDigester.DigestAlgorithm.MD5, - CommonsDigester.DigestAlgorithm.SHA256, - CommonsDigester.DigestAlgorithm.SHA384, - CommonsDigester.DigestAlgorithm.SHA512}) { - assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString())); - } - - assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString())); - assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString())); - - } - - @Test - public void testLimitedRead() throws Exception { - CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5; - int limit = 100; - byte[] bytes = new byte[limit]; - InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx"); - is.read(bytes, 0, limit); - is.close(); - Metadata m = new Metadata(); - try { - XMLResult xml = getXML(TikaInputStream.get(bytes), - new DigestingParser(p, new CommonsDigester(100, 
algo)), m); - } catch (TikaException e) { - //thrown because this is just a file fragment - assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser", - e.getMessage()); - } - String expectedMD5 = m.get(P+"MD5"); - - m = new Metadata(); - XMLResult xml = getXML("test_recursive_embedded.docx", - new DigestingParser(p, new CommonsDigester(100, algo)), m); - assertEquals(expectedMD5, m.get(P+"MD5")); - } - - @Test - public void testReset() throws Exception { - String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0"; - Metadata m = new Metadata(); - XMLResult xml = getXML("test_recursive_embedded.docx", - new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m); - assertEquals(expectedMD5, m.get(P+"MD5")); - } - - @Test - public void testNegativeMaxMarkLength() throws Exception { - Metadata m = new Metadata(); - boolean ex = false; - try { - XMLResult xml = getXML("test_recursive_embedded.docx", - new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m); - } catch (IllegalArgumentException e) { - ex = true; - } - assertTrue("Exception not thrown", ex); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java deleted file mode 100644 index 2fcd1c3..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.Test;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

/**
 * Tests for {@link ParsingReader}: the characters it yields for plain-text,
 * XML and Excel input, and what the source stream reports once the reader
 * has been closed.
 */
public class ParsingReaderTest {

    /**
     * Plain text is expected to be read back unchanged, followed by a single
     * newline and then end-of-stream.
     */
    @Test
    public void testPlainText() throws Exception {
        String data = "test content";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        // try-with-resources so the reader is closed even if an assertion fails
        try (Reader reader = new ParsingReader(stream, "test.txt")) {
            assertReads("test content\n", reader);
            assertEquals(-1, reader.read());
        }
        // The reader is closed at this point; the source stream must report end-of-stream.
        assertEquals(-1, stream.read());
    }

    /**
     * XML markup is expected to be dropped, leaving the element text separated
     * by whitespace and terminated by a newline.
     */
    @Test
    public void testXML() throws Exception {
        String data = "<p>test <span>content</span></p>";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        // try-with-resources so the reader is closed even if an assertion fails
        try (Reader reader = new ParsingReader(stream, "test.xml")) {
            // leading space, "test", two spaces, "content", newline
            assertReads(" test  content\n", reader);
            assertEquals(-1, reader.read());
        }
        // The reader is closed at this point; the source stream must report end-of-stream.
        assertEquals(-1, stream.read());
    }

    /**
     * Test case for TIKA-203
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
     */
    @Test
    public void testMetadata() throws Exception {
        Metadata metadata = new Metadata();
        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
                "/test-documents/testEXCEL.xls");
        // Fail with a clear message instead of handing a null stream to the parser.
        assertNotNull("Missing test resource /test-documents/testEXCEL.xls", stream);
        try (Reader reader = new ParsingReader(
                new AutoDetectParser(), stream, metadata, new ParseContext())) {
            // Metadata should already be available
            assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
            // Check that the internal buffering isn't broken
            // NOTE(review): "Feuil1" is presumably the first sheet's name - confirm against the fixture
            assertReads("Feuil1", reader);
        }
    }

    /**
     * Reads {@code expected.length()} characters from the reader one at a time
     * and asserts that each equals the character at the same offset in
     * {@code expected}. Does not check for end-of-stream afterwards.
     *
     * @param expected the characters the reader must produce next, in order
     * @param reader   the reader under test
     * @throws IOException if reading from {@code reader} fails
     */
    private static void assertReads(String expected, Reader reader) throws IOException {
        for (int offset = 0; offset < expected.length(); offset++) {
            assertEquals("character at offset " + offset,
                    expected.charAt(offset), reader.read());
        }
    }

}
