[11/13] tika git commit: TIKA-1855 -- first pass. Need to turn back on the forbidden-apis testCheck. More clean up remains.

tallison Mon, 21 Mar 2016 18:19:48 -0700

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
new file mode 100644
index 0000000..b852de0
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -0,0 +1,1044 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// Junit imports
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * 
+ * Test Suite for the {@link MimeTypes} repository.
+ * 
+ */
+public class TestMimeTypes extends TikaTest {
+
+    private Tika tika;
+
+    private MimeTypes repo;
+
+    private URL u;
+
+    private static final File f = new File("/a/b/c/x.pdf");
+
+    /**
+     * Builds the fixtures shared by the tests: the MIME repository of the
+     * default Tika configuration, a Tika facade on that same configuration,
+     * and a sample URL whose path ends in ".pdf" and carries a query string.
+     */
+    @Before
+    public void setUp() throws Exception {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        repo = config.getMimeRepository();
+        tika = new Tika(config);
+        u = new URL("http://mydomain.com/x.pdf?x=y");
+    }
+
+    /**
+     * Name-based detection must not depend on the case of the extension:
+     * every casing of ".pdf" has to yield the same non-null type.
+     */
+    @Test
+    public void testCaseSensitivity() {
+        final String upperCaseResult = tika.detect("test.PDF");
+        assertNotNull(upperCaseResult);
+        for (String variant : new String[] {"test.pdf", "test.PdF", "test.pdF"}) {
+            assertEquals(upperCaseResult, tika.detect(variant));
+        }
+    }
+
+    /**
+     * The repository must resolve these type names to a non-null entry.
+     */
+    @Test
+    public void testLoadMimeTypes() throws MimeTypeException {
+        for (String typeName : new String[] {"application/octet-stream", "text/x-tex"}) {
+            assertNotNull(repo.forName(typeName));
+        }
+    }
+
+    /**
+     * Tests MIME type determination based solely on the extension of a name:
+     * bare file names, a file path, and a URL that carries a query string.
+     */
+    @Test
+    public void testGuessMimeTypes() throws Exception {
+        assertTypeByName("application/pdf", "x.pdf");
+        assertEquals("application/pdf", tika.detect(u.toExternalForm()));
+        assertEquals("application/pdf", tika.detect(f.getPath()));
+        assertTypeByName("text/plain", "x.txt");
+        assertTypeByName("text/html", "x.htm");
+        assertTypeByName("text/html", "x.html");
+        assertTypeByName("application/xhtml+xml", "x.xhtml");
+        assertTypeByName("application/xml", "x.xml");
+        assertTypeByName("application/zip", "x.zip");
+        assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
+        assertTypeByName("application/octet-stream", "x.unknown");
+
+        // Test for the MS Office media types and file extensions listed in
+        // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
+        assertTypeByName("application/msword", "x.doc");
+        assertTypeByName("application/msword", "x.dot");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
+        assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
+        assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
+        assertTypeByName("application/vnd.ms-excel", "x.xls");
+        assertTypeByName("application/vnd.ms-excel", "x.xlt");
+        assertTypeByName("application/vnd.ms-excel", "x.xla");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
+        assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
+        assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
+        assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
+        assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
+        assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
+        assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
+        assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
+        assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
+        assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
+    }
+
+    /**
+     * Note - detecting container formats by mime magic is very very
+     *  iffy, as we can't be sure where things will end up.
+     * People really ought to use the container aware detection...
+     */
+    @Test
+    public void testOLE2Detection() throws Exception {
+        // These have the properties block near the start, so our mime
+        //  magic will spot them
+        assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
+
+        // This one quite legitimately doesn't have its properties block
+        //  as one of the first couple of entries
+        // As such, our mime magic can't figure it out...
+        assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
+        assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
+
+
+        // By name + data:
+
+        // Those we got right to start with are fine
+        assertTypeByNameAndData("application/vnd.ms-excel", "testEXCEL.xls");
+
+        // And the name lets us specialise the generic OLE2
+        //  (application/x-tika-msoffice) ones to their actual type
+        assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
+        assertTypeByNameAndData("application/msword", "testWORD.doc");
+    }
+    
+    /**
+     * Files generated by the Works 7.0 Spreadsheet application use the OLE2
+     * structure and resemble Excel files (they contain a "Workbook"), but
+     * they are not Excel. They are distinguished from Excel files by an
+     * additional top-level entry below the root of the POI filesystem.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testWorksSpreadsheetDetection() throws Exception {
+        final String worksSheet = "application/x-tika-msworks-spreadsheet";
+        assertTypeDetection(
+                "testWORKSSpreadsheet7.0.xlr",
+                // name only: everything should be all right
+                worksSheet,
+                // data only: possible because MimeTypes guesses the type from
+                // the WksSSWorkBook near the beginning of the file
+                worksSheet,
+                // name and data: the magic-based result is already right, so
+                // the name-based detection has nothing to refine
+                worksSheet);
+    }
+    
+    @Test
+    public void testStarOfficeDetection() throws Exception {
+        assertTypeDetection("testVORCalcTemplate.vor",
+                "application/x-staroffice-template",
+                "application/vnd.stardivision.calc",
+                "application/vnd.stardivision.calc");
+        assertTypeDetection("testVORDrawTemplate.vor",
+                "application/x-staroffice-template",
+                "application/vnd.stardivision.draw",
+                "application/vnd.stardivision.draw");
+        assertTypeDetection("testVORImpressTemplate.vor",
+                "application/x-staroffice-template",
+                "application/vnd.stardivision.impress",
+                "application/vnd.stardivision.impress");
+        assertTypeDetection("testVORWriterTemplate.vor",
+                "application/x-staroffice-template",
+                "application/vnd.stardivision.writer",
+                "application/vnd.stardivision.writer");
+        
+        assertTypeDetection("testStarOffice-5.2-calc.sdc",
+                "application/vnd.stardivision.calc",
+                "application/vnd.stardivision.calc",
+                "application/vnd.stardivision.calc");
+        assertTypeDetection("testStarOffice-5.2-draw.sda",
+                "application/vnd.stardivision.draw",
+                "application/vnd.stardivision.draw",
+                "application/vnd.stardivision.draw");
+        assertTypeDetection("testStarOffice-5.2-impress.sdd",
+                "application/vnd.stardivision.impress",
+                "application/vnd.stardivision.impress",
+                "application/vnd.stardivision.impress");
+        assertTypeDetection("testStarOffice-5.2-writer.sdw",
+                "application/vnd.stardivision.writer",
+                "application/vnd.stardivision.writer",
+                "application/vnd.stardivision.writer");
+    }
+    
+    /**
+     * Files generated by Works Word Processor versions 3.0 and 4.0 use the
+     * OLE2 structure. They don't resemble Word though.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testOldWorksWordProcessorDetection() throws Exception {
+        final String works = "application/vnd.ms-works";
+        // files in version 4.0 are no different from those in version 3.0
+        final String[] samples = {
+            "testWORKSWordProcessor3.0.wps",
+            "testWORKSWordProcessor4.0.wps",
+        };
+        for (String sample : samples) {
+            assertTypeDetection(
+                    sample,
+                    // .wps is just like any other works extension
+                    works,
+                    // this is due to MatOST substring
+                    works,
+                    // magic-based detection works, no need to refine it
+                    works);
+        }
+    }
+    
+    /**
+     * Files from Excel 2 through 4 are based on the BIFF record
+     *  structure, but without a wrapping OLE2 structure.
+     * Excel 5 and Excel 95+ work on OLE2
+     */
+    @Test
+    public void testOldExcel() throws Exception {
+        // With just a name, we'll think everything's a new Excel file
+        assertTypeByName("application/vnd.ms-excel", "testEXCEL_4.xls");
+        assertTypeByName("application/vnd.ms-excel", "testEXCEL_5.xls");
+        assertTypeByName("application/vnd.ms-excel", "testEXCEL_95.xls");
+
+        // With data, we can work out if it's old or new style
+        assertTypeByData("application/vnd.ms-excel.sheet.4", "testEXCEL_4.xls");
+        assertTypeByData("application/x-tika-msoffice", "testEXCEL_5.xls");
+        assertTypeByData("application/x-tika-msoffice", "testEXCEL_95.xls");
+
+        // With name and data, the old-style file keeps its specific type and
+        //  the OLE2 ones are specialised to Excel
+        assertTypeByNameAndData("application/vnd.ms-excel.sheet.4", "testEXCEL_4.xls");
+        assertTypeByNameAndData("application/vnd.ms-excel", "testEXCEL_5.xls");
+        assertTypeByNameAndData("application/vnd.ms-excel", "testEXCEL_95.xls");
+    }
+    
+    /**
+     * Note - detecting container formats by mime magic is very very
+     *  iffy, as we can't be sure where things will end up.
+     * People really ought to use the container aware detection...
+     */
+    @Test
+    public void testOoxmlDetection() throws Exception {
+        // These two do luckily have [Content_Types].xml near the start,
+        //  so our mime magic will spot them
+        assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
+        assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+
+        // This one quite legitimately doesn't have its [Content_Types].xml
+        //  file as one of the first couple of entries
+        // As such, our mime magic can't figure it out...
+        assertTypeByData("application/zip", "testWORD.docx");
+
+        // If we give the filename as well as the data, we can
+        //  specialise the ooxml generic one to the correct type
+        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
+        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
+        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
+
+        // Test a few of the less usual ones
+        assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12", "testEXCEL.xlsb");
+        assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
+        assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
+        assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
+    }
+    
+    /**
+     * Note - container based formats, needs container detection
+     *  to be properly correct
+     */
+    @Test
+    public void testVisioDetection() throws Exception {
+        // By Name, should get it right
+        assertTypeByName("application/vnd.visio", "testVISIO.vsd");
+        assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+        assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+        assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+        assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+        assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+        assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+        // By Name and Data, should get it right
+        assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+        assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+        assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+        assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+        assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+        // By Data only, will get the container parent
+        assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
+        assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
+    }
+
+    /**
+     * Note - detecting container formats by mime magic is very very
+     *  iffy, as we can't be sure where things will end up.
+     * People really ought to use the container aware detection...
+     */
+    @Test
+    public void testIWorkDetection() throws Exception {
+        // By name is easy
+        assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
+        assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
+        assertTypeByName("application/vnd.apple.pages", "testPages.pages");
+
+        // We can't do it by data, as we'd need to unpack
+        //  the zip file to check the XML
+        assertTypeByData("application/zip", "testKeynote.key");
+
+        // With the name as well as the data, the specific type is found
+        assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
+        assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
+        assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
+    }
+    
+    /**
+     * Archive formats: first by file name, then by mime magic on sample files.
+     */
+    @Test
+    public void testArchiveDetection() throws Exception {
+        assertTypeByName("application/x-archive", "test.ar");
+        assertTypeByName("application/zip", "test.zip");
+        assertTypeByName("application/x-tar", "test.tar");
+        // Sees GZIP, not the tar contents of it
+        assertTypeByName("application/gzip", "test.tgz");
+        assertTypeByName("application/x-cpio", "test.cpio");
+
+        // TODO Add an example .deb and .udeb, then check these
+
+        // Check the mime magic patterns for them work too
+        assertTypeByData("application/x-archive", "testARofText.ar");
+        assertTypeByData("application/x-archive", "testARofSND.ar");
+        assertTypeByData("application/zip", "test-documents.zip");
+        // GNU TAR
+        assertTypeByData("application/x-gtar", "test-documents.tar");
+        // Sees GZIP, not the tar contents of it
+        assertTypeByData("application/gzip", "test-documents.tgz");
+        assertTypeByData("application/x-cpio", "test-documents.cpio");
+
+        // For spanned zip files, the .zip file doesn't have the header,
+        //  it's the other parts
+        assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
+        assertTypeByData("application/zip", "test-documents-spanned.z01");
+    }
+    
+    /**
+     * RSS and Atom feeds: the sample files and their names must both resolve
+     * to the dedicated feed types.
+     */
+    @Test
+    public void testFeedsDetection() throws Exception {
+        final String rss = "application/rss+xml";
+        final String atom = "application/atom+xml";
+        final String rssFile = "rsstest.rss";
+        final String atomFile = "testATOM.atom";
+
+        assertType(rss, rssFile);
+        assertType(atom, atomFile);
+
+        assertTypeByData(rss, rssFile);
+        assertTypeByName(rss, rssFile);
+
+        assertTypeByData(atom, atomFile);
+        assertTypeByName(atom, atomFile);
+    }
+    
+    /**
+     * FITS detection on a sample image that was created using imagemagick
+     * convert of testJPEG.jpg.
+     */
+    @Test
+    public void testFitsDetection() throws Exception {
+        final String fits = "application/fits";
+        final String sample = "testFITS.fits";
+        assertType(fits, sample);
+        assertTypeByData(fits, sample);
+        assertTypeByName(fits, sample);
+    }
+
+    /**
+     * JPEG (with its many extensions, in either case) and JPEG 2000.
+     */
+    @Test
+    public void testJpegDetection() throws Exception {
+        final String jpeg = "image/jpeg";
+        assertType(jpeg, "testJPEG.jpg");
+        assertTypeByData(jpeg, "testJPEG.jpg");
+        final String[] jpegNames = {
+            "x.jpg", "x.JPG", "x.jpeg", "x.JPEG", "x.jpe", "x.jif", "x.jfif", "x.jfi",
+        };
+        for (String name : jpegNames) {
+            assertTypeByName(jpeg, name);
+        }
+
+        final String jp2 = "image/jp2";
+        assertType(jp2, "testJPEG.jp2");
+        assertTypeByData(jp2, "testJPEG.jp2");
+        assertTypeByName(jp2, "x.jp2");
+    }
+
+    /**
+     * BPG images: two sample files by data, plus the .bpg extension by name.
+     */
+    @Test
+    public void testBpgDetection() throws Exception {
+        final String bpg = "image/x-bpg";
+        assertType(bpg, "testBPG.bpg");
+        for (String sample : new String[] {"testBPG.bpg", "testBPG_commented.bpg"}) {
+            assertTypeByData(bpg, sample);
+        }
+        assertTypeByName(bpg, "x.bpg");
+    }
+    
+    /**
+     * TIFF images: the sample file, and the .tiff/.tif extensions by name.
+     */
+    @Test
+    public void testTiffDetection() throws Exception {
+        final String tiff = "image/tiff";
+        assertType(tiff, "testTIFF.tif");
+        assertTypeByData(tiff, "testTIFF.tif");
+        for (String name : new String[] {"x.tiff", "x.tif", "x.TIF"}) {
+            assertTypeByName(tiff, name);
+        }
+    }
+
+    /**
+     * GIF images: the sample file, and the extension in lower and upper case.
+     */
+    @Test
+    public void testGifDetection() throws Exception {
+        final String gif = "image/gif";
+        assertType(gif, "testGIF.gif");
+        assertTypeByData(gif, "testGIF.gif");
+        for (String name : new String[] {"x.gif", "x.GIF"}) {
+            assertTypeByName(gif, name);
+        }
+    }
+
+    /**
+     * PNG images: the sample file, and the extension in lower and upper case.
+     */
+    @Test
+    public void testPngDetection() throws Exception {
+        final String png = "image/png";
+        assertType(png, "testPNG.png");
+        assertTypeByData(png, "testPNG.png");
+        for (String name : new String[] {"x.png", "x.PNG"}) {
+            assertTypeByName(png, name);
+        }
+    }
+
+    /**
+     * WebP images: the sample file, and the extension in lower and upper case.
+     */
+    @Test
+    public void testWEBPDetection() throws Exception {
+        final String webp = "image/webp";
+        assertType(webp, "testWEBP.webp");
+        assertTypeByData(webp, "testWEBP.webp");
+        for (String name : new String[] {"x.webp", "x.WEBP"}) {
+            assertTypeByName(webp, name);
+        }
+    }
+
+    /**
+     * BMP images (.bmp and .dib), plus a guard against a false positive.
+     */
+    @Test
+    public void testBmpDetection() throws Exception {
+        final String bmp = "image/x-ms-bmp";
+        assertType(bmp, "testBMP.bmp");
+        assertTypeByData(bmp, "testBMP.bmp");
+        for (String name : new String[] {"x.bmp", "x.BMP", "x.dib", "x.DIB"}) {
+            assertTypeByName(bmp, name);
+        }
+        // false positive check -- this text file contains part of the BMP signature
+        assertType("text/plain", "testBMPfp.txt");
+    }
+
+    @Test
+    public void testPnmDetection() throws Exception {
+        assertType("image/x-portable-bitmap", "testPBM.pbm");
+        assertType("image/x-portable-graymap", "testPGM.pgm");
+        assertType("image/x-portable-pixmap", "testPPM.ppm");
+        assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
+        assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
+        assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
+        assertTypeByName("image/x-portable-anymap", "x.pnm");
+        assertTypeByName("image/x-portable-anymap", "x.PNM");
+        assertTypeByName("image/x-portable-bitmap", "x.pbm");
+        assertTypeByName("image/x-portable-bitmap", "x.PBM");
+        assertTypeByName("image/x-portable-graymap", "x.pgm");
+        assertTypeByName("image/x-portable-graymap", "x.PGM");
+        assertTypeByName("image/x-portable-pixmap", "x.ppm");
+        assertTypeByName("image/x-portable-pixmap", "x.PPM");
+    }
+
+    /**
+     * PICT images: the sample file, and the .pic / .PCT extensions by name.
+     */
+    @Test
+    public void testPictDetection() throws Exception {
+        final String pict = "image/x-pict";
+        assertType(pict, "testPICT.pct");
+        assertTypeByData(pict, "testPICT.pct");
+        for (String name : new String[] {"x.pic", "x.PCT"}) {
+            assertTypeByName(pict, name);
+        }
+    }
+
+    /**
+     * CGM images, by name only.
+     */
+    @Test
+    public void testCgmDetection() throws Exception {
+        // TODO: Need a test image file
+        for (String name : new String[] {"x.cgm", "x.CGM"}) {
+            assertTypeByName("image/cgm", name);
+        }
+    }
+
+    /**
+     * Both the .rdf and the .owl extension map to RDF/XML.
+     */
+    @Test
+    public void testRdfXmlDetection() throws Exception {
+        for (String name : new String[] {"x.rdf", "x.owl"}) {
+            assertTypeByName("application/rdf+xml", name);
+        }
+    }
+
+    /**
+     * SVG, plain and gzip-compressed. For .svgz the expectations differ: the
+     * sample file is expected to be reported as gzip, the bare name as SVG.
+     */
+    @Test
+    public void testSvgDetection() throws Exception {
+        final String svg = "image/svg+xml";
+        final String gzip = "application/gzip";
+
+        assertType(svg, "testSVG.svg");
+        assertTypeByData(svg, "testSVG.svg");
+        for (String name : new String[] {"x.svg", "x.SVG"}) {
+            assertTypeByName(svg, name);
+        }
+
+        // Should *.svgz be svg or gzip
+        assertType(gzip, "testSVG.svgz");
+        assertTypeByData(gzip, "testSVG.svgz");
+        for (String name : new String[] {"x.svgz", "x.SVGZ"}) {
+            assertTypeByName(svg, name);
+        }
+    }
+
+    /**
+     * PDF by extension, and PDF samples with and without a byte order mark.
+     */
+    @Test
+    public void testPdfDetection() throws Exception {
+        final String pdf = "application/pdf";
+
+        // PDF extension by name is enough
+        for (String name : new String[] {"x.pdf", "x.PDF"}) {
+            assertTypeByName(pdf, name);
+        }
+
+        // A normal PDF first, then one with a BoM: both must work both ways
+        for (String sample : new String[] {"testPDF.pdf", "testPDF_bom.pdf"}) {
+            assertType(pdf, sample);
+            assertTypeByData(pdf, sample);
+        }
+    }
+
+    /**
+     * Shockwave Flash, by name only.
+     */
+    @Test
+    public void testSwfDetection() throws Exception {
+        final String[] names = {"x.swf", "x.SWF", "test1.swf", "test2.swf", "test3.swf"};
+        for (String name : names) {
+            assertTypeByName("application/x-shockwave-flash", name);
+        }
+    }
+
+    /**
+     * DWG drawings: the extension by name, and three sample files by data.
+     */
+    @Test
+    public void testDwgDetection() throws Exception {
+        final String dwg = "image/vnd.dwg";
+        assertTypeByName(dwg, "x.dwg");
+        final String[] samples = {"testDWG2004.dwg", "testDWG2007.dwg", "testDWG2010.dwg"};
+        for (String sample : samples) {
+            assertTypeByData(dwg, sample);
+        }
+    }
+
+    /**
+     * PRT files: the extension by name, and the CADKEY sample by data.
+     */
+    @Test
+    public void testprtDetection() throws Exception {
+        final String prt = "application/x-prt";
+        assertTypeByName(prt, "x.prt");
+        assertTypeByData(prt, "testCADKEY.prt");
+    }
+    
+    /**
+     * Formats which are based on plain text
+     */
+    @Test
+    public void testTextBasedFormatsDetection() throws Exception {
+       assertTypeByName("text/plain", "testTXT.txt");
+       assertType(      "text/plain", "testTXT.txt");
+       
+       assertTypeByName("text/css", "testCSS.css");
+       assertType(      "text/css", "testCSS.css");
+       
+       assertTypeByName("text/csv", "testCSV.csv");
+       assertType(      "text/csv", "testCSV.csv");
+       
+       assertTypeByName("text/html", "testHTML.html");
+       assertType(      "text/html", "testHTML.html");
+       
+       assertTypeByName("application/javascript", "testJS.js");
+       assertType(      "application/javascript", "testJS.js");
+    }
+    
+    /**
+     * Java related formats. Only the OSX native library sample is checked;
+     * the .class check is disabled (see the TODO below).
+     */
+    @Test
+    public void testJavaDetection() throws Exception {
+        // TODO Classloader doesn't seem to find the .class file in test-documents
+        //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
+
+        // OSX Native Extension
+        assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
+    }
+
+    /**
+     * Detection from in-memory bytes only: XML with a declaration (UTF-8, and
+     * UTF-16 in both byte orders with a leading U+FEFF), XML that starts with
+     * a comment, and HTML with and without a leading comment.
+     */
+    @Test
+    public void testXmlAndHtmlDetection() throws Exception {
+        assertTypeByData("application/xml",
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
+                        .getBytes(UTF_8));
+        assertTypeByData("application/xml",
+                "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+                        .getBytes(UTF_16LE));
+        assertTypeByData("application/xml",
+                "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+                        .getBytes(UTF_16BE));
+        assertTypeByData("application/xml",
+                "<!-- XML without processing instructions --><records><record/></records>"
+                        .getBytes(UTF_8));
+        assertTypeByData("text/html",
+                "<html><body>HTML</body></html>"
+                        .getBytes(UTF_8));
+        assertTypeByData("text/html",
+                "<!-- HTML comment --><html><body>HTML</body></html>"
+                        .getBytes(UTF_8));
+    }
+
+    /**
+     * Windows metafiles (WMF / EMF) and their compressed variants. The
+     * compressed variants are checked by name only.
+     */
+    @Test
+    public void testWmfDetection() throws Exception {
+        final String wmf = "application/x-msmetafile";
+        assertTypeByName(wmf, "x.wmf");
+        assertTypeByData(wmf, "testWMF.wmf");
+        assertTypeByName(wmf, "x.WMF");
+
+        final String emf = "application/x-emf";
+        assertTypeByName(emf, "x.emf");
+        assertTypeByData(emf, "testEMF.emf");
+        assertTypeByName(emf, "x.EMF");
+
+        // TODO: Need a test wmz file
+        for (String name : new String[] {"x.wmz", "x.WMZ"}) {
+            assertTypeByName("application/x-ms-wmz", name);
+        }
+        // TODO: Need a test emz file
+        for (String name : new String[] {"x.emz", "x.EMZ"}) {
+            assertTypeByName("application/gzip", name);
+        }
+    }
+
+    /**
+     * PostScript and its encapsulated variants, by name only.
+     */
+    @Test
+    public void testPsDetection() throws Exception {
+        // TODO: Need a test postscript file
+        final String[] names = {"x.ps", "x.PS", "x.eps", "x.epsf", "x.epsi"};
+        for (String name : names) {
+            assertTypeByName("application/postscript", name);
+        }
+    }
+    
+    @Test
+    public void testMicrosoftMultiMediaDetection() throws Exception {
+       assertTypeByName("video/x-ms-asf", "x.asf");
+       assertTypeByName("video/x-ms-wmv", "x.wmv");
+       assertTypeByName("audio/x-ms-wma", "x.wma");
+       
+       assertTypeByData("video/x-ms-asf", "testASF.asf");
+       assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
+       assertTypeByData("audio/x-ms-wma", "testWMA.wma");
+    }
+    
+    /**
+     * All 3 DITA types are in theory handled by the same mimetype,
+     *  but we specialise them
+     */
+    @Test
+    public void testDITADetection() throws Exception {
+        // By name, only the extension is available to pick the format
+        assertTypeByName("application/dita+xml; format=topic", "test.dita");
+        assertTypeByName("application/dita+xml; format=map", "test.ditamap");
+        assertTypeByName("application/dita+xml; format=val", "test.ditaval");
+
+        // By data, the sample .dita files are told apart as task and concept
+        assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
+        assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
+        assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
+
+        assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
+        assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
+        assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
+
+        // These are all children of the official type
+        assertEquals("application/dita+xml",
+                repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
+        assertEquals("application/dita+xml",
+                repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
+        // Concept inherits from topic
+        assertEquals("application/dita+xml; format=topic",
+                repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
+    }
+
+    /**
+     * Registers the same pattern text for two custom types, once with the
+     * third addPattern argument set to true and once set to false (presumably
+     * the "is a regular expression" flag -- confirm against MimeTypes), and
+     * checks that only the first registration decides the detected type.
+     *
+     * @since TIKA-194
+     */
+    @Test
+    public void testJavaRegex() throws Exception {
+        MimeType testType = new MimeType(MediaType.parse("foo/bar"));
+        this.repo.add(testType);
+        assertNotNull(repo.forName("foo/bar"));
+        String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
+        this.repo.addPattern(testType, pattern, true);
+        String testFileName = "rtg_sst_grb_0.5.12345678";
+        assertEquals("foo/bar", tika.detect(testFileName));
+
+        MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
+        this.repo.add(testType2);
+        assertNotNull(repo.forName("foo/bar2"));
+        this.repo.addPattern(testType2, pattern, false);
+        // Compare the values: assertNotSame only checks object identity, so
+        // it passes for two distinct String objects even when they are equal.
+        assertFalse("foo/bar2".equals(tika.detect(testFileName)));
+    }
+    
+    @Test
+    public void testRawDetection() throws Exception {
+        assertTypeByName("image/x-raw-adobe", "x.dng");
+        assertTypeByName("image/x-raw-adobe", "x.DNG");
+        assertTypeByName("image/x-raw-hasselblad", "x.3fr");
+        assertTypeByName("image/x-raw-fuji", "x.raf");
+        assertTypeByName("image/x-raw-canon", "x.crw");
+        assertTypeByName("image/x-raw-canon", "x.cr2");
+        assertTypeByName("image/x-raw-kodak", "x.k25");
+        assertTypeByName("image/x-raw-kodak", "x.kdc");
+        assertTypeByName("image/x-raw-kodak", "x.dcs");
+        assertTypeByName("image/x-raw-kodak", "x.drf");
+        assertTypeByName("image/x-raw-minolta", "x.mrw");
+        assertTypeByName("image/x-raw-nikon", "x.nef");
+        assertTypeByName("image/x-raw-nikon", "x.nrw");
+        assertTypeByName("image/x-raw-olympus", "x.orf");
+        assertTypeByName("image/x-raw-pentax", "x.ptx");
+        assertTypeByName("image/x-raw-pentax", "x.pef");
+        assertTypeByName("image/x-raw-sony", "x.arw");
+        assertTypeByName("image/x-raw-sony", "x.srf");
+        assertTypeByName("image/x-raw-sony", "x.sr2");
+        assertTypeByName("image/x-raw-sigma", "x.x3f");
+        assertTypeByName("image/x-raw-epson", "x.erf");
+        assertTypeByName("image/x-raw-mamiya", "x.mef");
+        assertTypeByName("image/x-raw-leaf", "x.mos");
+        assertTypeByName("image/x-raw-panasonic", "x.raw");
+        assertTypeByName("image/x-raw-panasonic", "x.rw2");
+        assertTypeByName("image/x-raw-phaseone", "x.iiq");
+        assertTypeByName("image/x-raw-red", "x.r3d");
+        assertTypeByName("image/x-raw-imacon", "x.fff");
+        assertTypeByName("image/x-raw-logitech", "x.pxn");
+        assertTypeByName("image/x-raw-casio", "x.bay");
+        assertTypeByName("image/x-raw-rawzor", "x.rwz");
+    }
+    
+    /**
+     * Tests that we correctly detect the font types: Adobe font metrics
+     * (.afm), printer font metrics (.pfm) and Type 1 fonts (.pfa and .pfb).
+     * Only the .afm case has a sample document; the other formats are checked
+     * by name and against a hand-written byte prefix.
+     *
+     * NOTE(review): <code>256-0xb1</code> below evaluates to 79 (0x4f), whereas
+     * the signed byte for 0xb1 would be <code>0xb1-256</code>. Confirm which
+     * value was intended.
+     */
+    @Test
+    public void testFontDetection() throws Exception {
+       assertTypeByName("application/x-font-adobe-metric", "x.afm");
+       assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
+       
+       assertTypeByName("application/x-font-printer-metric", "x.pfm");
+       // TODO Get a sample .pfm file
+       assertTypeByData(
+             "application/x-font-printer-metric", 
+             new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,  
+                         0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
+       );
+       
+       assertTypeByName("application/x-font-type1", "x.pfa");
+       // TODO Get a sample .pfa file
+       assertTypeByData(
+             "application/x-font-type1", 
+             new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
+                         0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
+                         0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
+       );
+       
+       assertTypeByName("application/x-font-type1", "x.pfb");
+       // TODO Get a sample .pfb file
+       assertTypeByData(
+             "application/x-font-type1", 
+             new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
+                          0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
+                          0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
+       );
+    }
+
+    /**
+     * Tests MimeTypes.getMimeType(URL), which examines both the byte header
+     * and, if necessary, the URL's extension.
+     */
+    @Test
+    public void testMimeDeterminationForTestDocuments() throws Exception {
+        assertType("text/html", "testHTML.html");
+        assertType("application/zip", "test-documents.zip");
+
+        assertType("text/html", "testHTML_utf8.html");
+        assertType(
+                "application/vnd.oasis.opendocument.text",
+                "testOpenOffice2.odt");
+        assertType("application/pdf", "testPDF.pdf");
+        assertType("application/rtf", "testRTF.rtf");
+        assertType("text/plain", "testTXT.txt");
+        assertType("application/xml", "testXML.xml");
+        assertType("audio/basic", "testAU.au");
+        assertType("audio/x-aiff", "testAIFF.aif");
+        assertType("audio/x-wav", "testWAV.wav");
+        assertType("audio/midi", "testMID.mid");
+        assertType("application/x-msaccess", "testACCESS.mdb");
+        assertType("application/x-font-ttf", "testTrueType3.ttf");
+    }
+    
+    /**
+     * 7z archives are recognised by name, by data, and by both together.
+     */
+    @Test
+    public void test7ZipDetection() throws Exception {
+        final String sevenZip = "application/x-7z-compressed";
+        final String sample = "test-documents.7z";
+        assertTypeByName(sevenZip, sample);
+        assertTypeByData(sevenZip, sample);
+        assertTypeByNameAndData(sevenZip, sample);
+    }
+
+    @Test
+    public void testWebArchiveDetection() throws Exception {
+        assertTypeByName("application/x-webarchive","x.webarchive");
+        assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
+        assertTypeByNameAndData("application/x-webarchive", 
"testWEBARCHIVE.webarchive");
+    }
+
+    /**
+     * KML, and KMZ (zipped KML)
+     */
+    @Test
+    public void testKMLZDetection() throws Exception {
+       assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
+       assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
+       assertTypeByNameAndData("application/vnd.google-earth.kml+xml", 
"testKML.kml");
+       
+       assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
+       assertTypeByNameAndData("application/vnd.google-earth.kmz", 
"testKMZ.kmz");
+       
+       // By data only, mimetype magic only gets us to a .zip
+       // We need to use the Zip Aware detector to get the full type
+       assertTypeByData("application/zip","testKMZ.kmz");
+   }
+
+    @Test
+    public void testCreativeSuite() throws IOException {
+        assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
+        assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
+    }
+    
+    /**
+     * AMR and its wide-band subtype use the same file extension, so the
+     * subtype only shows up once the data is inspected.
+     */
+    @Test
+    public void testAMR() throws IOException {
+        // Plain AMR: same answer by name, by data, and by both
+        final String amr = "audio/amr";
+        assertTypeDetection("testAMR.amr", amr, amr, amr);
+
+        // AMR-WB: the name alone only yields the generic AMR type
+        final String amrWideBand = "audio/amr-wb";
+        assertTypeDetection("testAMR-WB.amr", amr, amrWideBand, amrWideBand);
+
+        // TODO Same check for the AMR-WB+ subtype once a sample file exists:
+        //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
+    }
+    
+    @Test
+    public void testEmail() throws IOException {
+        // EMLX
+        assertTypeDetection("testEMLX.emlx", "message/x-emlx");
+        
+        // Groupwise
+        assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
+        
+        // Lotus
+        assertTypeDetection("testLotusEml.eml", "message/rfc822");
+        
+        // Thunderbird - doesn't currently work by name
+        assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
+    }
+    
+    @Test
+    public void testAxCrypt() throws Exception {
+        // test-TXT.txt encrypted with a key of "tika"
+        assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
+    }
+    
+    @Test
+    public void testWindowsEXE() throws Exception {
+        assertTypeByName("application/x-msdownload", "x.dll");
+        assertTypeByName("application/x-ms-installer", "x.msi");
+        assertTypeByName("application/x-dosexec", "x.exe");
+        
+        assertTypeByData("application/x-msdownload; format=pe", 
"testTinyPE.exe");
+        assertTypeByNameAndData("application/x-msdownload; format=pe", 
"testTinyPE.exe");
+        
+        // A jar file with part of a PE header, but not a full one
+        //  should still be detected as a zip or jar (without/with name)
+        assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
+        assertTypeByNameAndData("application/java-archive", 
"testJAR_with_PEHDR.jar");
+    }
+    
+    @Test
+    public void testMatroskaDetection() throws Exception {
+        assertType("video/x-matroska", "testMKV.mkv");
+        // TODO: Need custom detector data detection, see TIKA-1180
+        assertTypeByData("application/x-matroska", "testMKV.mkv");
+        assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
+        assertTypeByName("video/x-matroska", "x.mkv");
+        assertTypeByName("video/x-matroska", "x.MKV");
+        assertTypeByName("audio/x-matroska", "x.mka");
+        assertTypeByName("audio/x-matroska", "x.MKA");
+    }
+    
+    @Test
+    public void testWebMDetection() throws Exception {
+        assertType("video/webm", "testWEBM.webm");
+        // TODO: Need custom detector data detection, see TIKA-1180
+        assertTypeByData("application/x-matroska", "testWEBM.webm");
+        assertTypeByNameAndData("video/webm", "testWEBM.webm");
+        assertTypeByName("video/webm", "x.webm");
+        assertTypeByName("video/webm", "x.WEBM");
+    }
+
+    /**
+     * Tests detection on raw byte prefixes: Unicode byte order marks and
+     * ordinary text bytes must be reported as text/plain, while prefixes
+     * containing disallowed control bytes fall back to
+     * application/octet-stream.
+     */
+    @Test
+    public void testGetMimeType_byteArray() throws IOException {
+        // Plain text detection
+        // UTF-16LE byte order mark
+        assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+        // UTF-16BE byte order mark (previously the LE mark was tested twice
+        // and the BE mark not at all)
+        assertText(new byte[] { (byte) 0xFE, (byte) 0xFF });
+        // UTF-8 byte order mark
+        assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
+        assertText(new byte[] { 'a', 'b', 'c' });
+        // Tab, CR, LF, form feed (0x0C) and escape (0x1B) are accepted
+        assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+        // 0x0E and 0x1C are not
+        assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+    }
+    
+    @Test
+    public void testBerkeleyDB() throws IOException {
+        assertTypeByData(
+                "application/x-berkeley-db; format=btree; version=2", 
+                "testBDB_btree_2.db");
+        assertTypeByData(
+                "application/x-berkeley-db; format=btree; version=3", 
+                "testBDB_btree_3.db");
+        assertTypeByData(
+                "application/x-berkeley-db; format=btree; version=4", 
+                "testBDB_btree_4.db");
+        // V4 and V5 share the same btree format
+        assertTypeByData(
+                "application/x-berkeley-db; format=btree; version=4", 
+                "testBDB_btree_5.db");
+        
+        assertTypeByData(
+                "application/x-berkeley-db; format=hash; version=2", 
+                "testBDB_hash_2.db");
+        assertTypeByData(
+                "application/x-berkeley-db; format=hash; version=3", 
+                "testBDB_hash_3.db");
+        assertTypeByData(
+                "application/x-berkeley-db; format=hash; version=4", 
+                "testBDB_hash_4.db");
+        assertTypeByData(
+                "application/x-berkeley-db; format=hash; version=5", 
+                "testBDB_hash_5.db");
+    }
+    
+    /**
+     * CBOR typically contains HTML
+     */
+    @Test
+    public void testCBOR() throws IOException {
+        assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
+        assertTypeByData("application/cbor", "NUTCH-1997.cbor");
+    }
+    
+    @Test
+    public void testZLIB() throws IOException {
+        // ZLIB encoded versions of testTXT.txt
+        assertTypeByData("application/zlib", "testTXT.zlib");
+        assertTypeByData("application/zlib", "testTXT.zlib0");
+        assertTypeByData("application/zlib", "testTXT.zlib5");
+        assertTypeByData("application/zlib", "testTXT.zlib9");
+    }
+    
+    @Test
+    public void testTextFormats() throws Exception {
+        assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
+        assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
+    }
+    
+    @Test
+    public void testCodeFormats() throws Exception {
+        assertType("text/x-csrc", "testC.c");
+        assertType("text/x-chdr", "testH.h");
+        assertTypeByData("text/x-csrc", "testC.c");
+        assertTypeByData("text/x-chdr", "testH.h");
+        
+        assertTypeByName("text/x-java-source", "testJAVA.java");
+        assertType("text/x-java-properties", "testJAVAPROPS.properties");
+        
+        assertType("text/x-matlab", "testMATLAB.m");
+        assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
+        assertType("text/x-matlab", "testMATLAB_barcast.m");
+        assertTypeByData("text/x-matlab", "testMATLAB.m");
+        assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
+        assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
+    }
+
+    @Test
+    public void testWebVTT() throws Exception {
+        assertType("text/vtt", "testWebVTT.vtt");
+        assertTypeByData("text/vtt", "testWebVTT.vtt");
+    }
+    
+    private void assertText(byte[] prefix) throws IOException {
+        assertMagic("text/plain", prefix);
+    }
+
+    private void assertNotText(byte[] prefix) throws IOException {
+        assertMagic("application/octet-stream", prefix);
+    }
+
+    /**
+     * Runs detection on the given bytes with empty metadata and checks the
+     * resulting media type.
+     *
+     * @param expected expected media type, in string form
+     * @param prefix   bytes handed to the detector
+     * @throws IOException if detection fails to read the bytes
+     */
+    private void assertMagic(String expected, byte[] prefix)
+            throws IOException {
+        InputStream bytes = new ByteArrayInputStream(prefix);
+        MediaType detected = repo.detect(bytes, new Metadata());
+        assertNotNull(detected);
+        assertEquals(expected, detected.toString());
+    }
+
+    /**
+     * Asserts the type detected for a test document when the detector gets
+     * both the document's stream and its file name.
+     *
+     * @param expected expected media type, in string form
+     * @param filename name of the test document to load
+     * @throws Exception if the document cannot be read
+     */
+    private void assertType(String expected, String filename) throws Exception {
+        try (InputStream stream = getTestDocumentAsStream(filename)) {
+            // A null stream makes detect() fall back to the name alone (that
+            // is how assertTypeByName calls it), so a missing resource could
+            // otherwise pass unnoticed.
+            assertNotNull("Test document not found: " + filename, stream);
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            assertEquals(expected, repo.detect(stream, metadata).toString());
+        }
+    }
+
+    /**
+     * Asserts the type detected from a file name alone; no stream is passed
+     * to the detector.
+     *
+     * @param expected expected media type, in string form
+     * @param filename file name placed in the metadata
+     * @throws IOException if detection fails
+     */
+    private void assertTypeByName(String expected, String filename)
+            throws IOException {
+        Metadata nameOnly = new Metadata();
+        nameOnly.set(Metadata.RESOURCE_NAME_KEY, filename);
+        MediaType detected = repo.detect(null, nameOnly);
+        assertEquals(expected, detected.toString());
+    }
+
+    /**
+     * Asserts the type detected from a test document's content alone; the
+     * metadata passed to the detector is left empty.
+     *
+     * @param expected expected media type, in string form
+     * @param filename name of the test document to load
+     * @throws IOException if the document cannot be read
+     */
+    private void assertTypeByData(String expected, String filename)
+            throws IOException {
+        try (InputStream stream = getTestDocumentAsStream(filename)) {
+            // Fail with a clear message when the sample is missing, instead
+            // of reporting whatever type a null stream produces.
+            assertNotNull("Test document not found: " + filename, stream);
+            Metadata metadata = new Metadata();
+            assertEquals(expected, repo.detect(stream, metadata).toString());
+        }
+    }
+
+    /**
+     * Asserts the type detected from an in-memory byte array, with empty
+     * metadata.
+     *
+     * @param expected expected media type, in string form
+     * @param data     bytes handed to the detector
+     * @throws IOException if detection fails to read the bytes
+     */
+    private void assertTypeByData(String expected, byte[] data)
+            throws IOException {
+        try (InputStream bytes = new ByteArrayInputStream(data)) {
+            MediaType detected = repo.detect(bytes, new Metadata());
+            assertEquals(expected, detected.toString());
+        }
+    }
+
+    private void assertTypeDetection(String filename, String type)
+            throws IOException {
+        assertTypeDetection(filename, type, type, type);
+    }
+
+    /**
+     * Checks one test document three ways, each with its own expected type:
+     * by name, by data, and by name plus data, in that order.
+     *
+     * @param filename      name of the test document
+     * @param byName        expected type when only the name is used
+     * @param byData        expected type when only the content is used
+     * @param byNameAndData expected type when both are used
+     * @throws IOException if the document cannot be read
+     */
+    private void assertTypeDetection(
+            String filename, String byName, String byData, String byNameAndData)
+            throws IOException {
+        assertTypeByName(byName, filename);
+        assertTypeByData(byData, filename);
+        assertTypeByNameAndData(byNameAndData, filename);
+    }
+
+    private void assertTypeByNameAndData(String expected, String filename)
+        throws IOException {
+       assertEquals(expected, getTypeByNameAndData(filename).toString());
+    }
+
+    /**
+     * Detects the type of a test document from its content together with its
+     * file name.
+     *
+     * @param filename name of the test document to load
+     * @return the media type reported by the repository
+     * @throws IOException if the document cannot be read
+     */
+    private MediaType getTypeByNameAndData(String filename)
+            throws IOException {
+        try (InputStream in = getTestDocumentAsStream(filename)) {
+            assertNotNull("Test document not found: " + filename, in);
+            Metadata named = new Metadata();
+            named.set(Metadata.RESOURCE_NAME_KEY, filename);
+            return repo.detect(in, named);
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java 
b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
new file mode 100644
index 0000000..91b054e
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
+import org.gagravarr.tika.FlacParser;
+import org.gagravarr.tika.OpusParser;
+import org.gagravarr.tika.VorbisParser;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class AutoDetectParserTest {
+    private TikaConfig tika = TikaConfig.getDefaultConfig();
+
+    // Easy to read constants for the MIME types:
+    private static final String RAW        = "application/octet-stream";
+    private static final String EXCEL      = "application/vnd.ms-excel";
+    private static final String HTML       = "text/html; charset=ISO-8859-1";
+    private static final String PDF        = "application/pdf";
+    private static final String POWERPOINT = "application/vnd.ms-powerpoint";
+    private static final String KEYNOTE    = "application/vnd.apple.keynote";
+    private static final String PAGES      = "application/vnd.apple.pages";
+    private static final String NUMBERS    = "application/vnd.apple.numbers";
+    private static final String CHM        = "application/vnd.ms-htmlhelp";
+    private static final String RTF        = "application/rtf";
+    private static final String PLAINTEXT  = "text/plain; charset=ISO-8859-1";
+    private static final String UTF8TEXT   = "text/plain; charset=UTF-8";
+    private static final String WORD       = "application/msword";
+    private static final String XML        = "application/xml";
+    private static final String RSS        = "application/rss+xml";
+    private static final String BMP        = "image/x-ms-bmp";
+    private static final String GIF        = "image/gif";
+    private static final String JPEG       = "image/jpeg";
+    private static final String PNG        = "image/png";
+    private static final String OGG_VORBIS = "audio/vorbis";
+    private static final String OGG_OPUS   = "audio/opus";
+    private static final String OGG_FLAC   = "audio/x-oggflac"; 
+    private static final String FLAC_NATIVE= "audio/x-flac";
+    private static final String OPENOFFICE
+            = "application/vnd.oasis.opendocument.text";
+
+
+    /**
+     * Runs a single detection scenario: parses the real resource while giving
+     * the parser the stated name and stated type as hints, then checks the
+     * content type left in the metadata and, if requested, the extracted text.
+     *
+     * @param tp the parameters encapsulated in a TestParams instance
+     * @throws Exception if the resource cannot be read or parsed
+     */
+    private void assertAutoDetect(TestParams tp) throws Exception {
+        try (InputStream input =
+                AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
+            assertNotNull("Could not open stream from specified resource: "
+                    + tp.resourceRealName, input);
+
+            Metadata hints = new Metadata();
+            hints.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
+            hints.set(Metadata.CONTENT_TYPE, tp.statedType);
+            ContentHandler body = new BodyContentHandler();
+            new AutoDetectParser(tika).parse(input, body, hints);
+
+            String detectedType = hints.get(Metadata.CONTENT_TYPE);
+            assertEquals("Bad content type: " + tp, tp.realType, detectedType);
+
+            // The text check is optional
+            if (tp.expectedContentFragment == null) {
+                return;
+            }
+            assertTrue("Expected content not found: " + tp,
+                    body.toString().contains(tp.expectedContentFragment));
+        }
+    }
+
+    /**
+     * Convenience method -- its sole purpose of existence is to make the
+     * call to it more readable than it would be if a TestParams instance
+     * would need to be instantiated there.
+     *
+     * @param resourceRealName real name of resource
+     * @param resourceStatedName stated name -- will a bad name fool us?
+     * @param realType - the real MIME type
+     * @param statedType - stated MIME type - will a wrong one fool us?
+     * @param expectedContentFragment - something expected in the text
+     * @throws Exception
+     */
+    private void assertAutoDetect(String resourceRealName,
+                                  String resourceStatedName,
+                                  String realType,
+                                  String statedType,
+                                  String expectedContentFragment)
+            throws Exception {
+
+        assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
+                realType, statedType, expectedContentFragment));
+    }
+
+    /**
+     * Checks a document under /test-documents/ against every combination of
+     * stated name (correct, absent, misleading) and stated type (correct,
+     * absent, wrong). The real type must be detected in all nine cases.
+     *
+     * @param resource file name below /test-documents/
+     * @param type     the real MIME type that must be detected
+     * @param content  text expected in the output, or null to skip that check
+     * @throws Exception if any scenario fails to read or parse
+     */
+    private void assertAutoDetect(
+            String resource, String type, String content) throws Exception {
+        final String path = "/test-documents/" + resource;
+
+        // TODO (carried over from the original note): the correct MIME type
+        // should be determined regardless of the stated type (ContentType
+        // hint) and the stated URL name.
+        final String[] statedNames = { path, null, "a.xyz" };
+        final String[] statedTypes = { type, null, RAW };
+
+        // Outer loop is the name, inner loop the type
+        for (String statedName : statedNames) {
+            for (String statedType : statedTypes) {
+                assertAutoDetect(path, statedName, type, statedType, content);
+            }
+        }
+    }
+
+    @Test
+    public void testKeynote() throws Exception {
+        assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
+    }
+
+    @Test
+    public void testPages() throws Exception {
+        assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
+    }
+
+    @Test
+    public void testNumbers() throws Exception {
+        assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 
300545668");
+    }
+
+    @Test
+    public void testChm() throws Exception {
+        assertAutoDetect("testChm.chm", CHM, "If you do not specify a window 
type or a window name, the main window is used.");
+    }
+
+    @Test
+    public void testEpub() throws Exception {
+        assertAutoDetect(
+                "testEPUB.epub", "application/epub+zip",
+                "The previous headings were subchapters");
+    }
+
+    @Test
+    public void testExcel() throws Exception {
+        assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
+    }
+
+    @Test
+    public void testHTML() throws Exception {
+        assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
+    }
+
+    @Test
+    public void testOpenOffice() throws Exception {
+        assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
+                "This is a sample Open Office document");
+    }
+
+    @Test
+    public void testPDF() throws Exception {
+        assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
+
+    }
+
+    @Test
+    public void testPowerpoint() throws Exception {
+        assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
+    }
+
+    @Test
+    public void testRdfXml() throws Exception {
+        assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
+    }
+
+    @Test
+    public void testRTF() throws Exception {
+        assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
+    }
+
+    @Test
+    public void testText() throws Exception {
+        assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
+    }
+    
+    @Test
+    public void testTextNonASCIIUTF8() throws Exception {
+        assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown 
fox jumps over the lazy dog");
+    }
+
+    @Test
+    public void testWord() throws Exception {
+        assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
+    }
+
+    @Test
+    public void testXML() throws Exception {
+        assertAutoDetect("testXML.xml", XML, "Lius");
+    }
+
+    @Test
+    public void testRss() throws Exception {
+        assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, 
"application/rss+xml", "Sample RSS File for Junit test");
+    }
+    
+    @Test
+    public void testImages() throws Exception {
+       assertAutoDetect("testBMP.bmp", BMP, null);
+       assertAutoDetect("testGIF.gif", GIF, null);
+       assertAutoDetect("testJPEG.jpg", JPEG, null);
+       assertAutoDetect("testPNG.png", PNG, null);
+   }
+
+    /**
+     * Make sure that zip bomb attacks are prevented: parsing the TIKA-216
+     * sample archive must end in a TikaException instead of completing.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
+     */
+    @Test
+    public void testZipBombPrevention() throws Exception {
+        try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
+                "/test-documents/TIKA-216.tgz")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler(-1);
+            new AutoDetectParser(tika).parse(tgz, handler, metadata);
+            fail("Zip bomb was not detected");
+        } catch (TikaException e) {
+            // expected
+        }
+    }
+
+    /**
+     * Make sure XML parse errors don't trigger ZIP bomb detection. The test
+     * passes if parsing the generated archive throws nothing.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
+     */
+    @Test
+    public void testNoBombDetectedForInvalidXml() throws Exception {
+        // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        // try-with-resources closes (and thereby finishes) the archive even
+        // if writing an entry fails
+        try (ZipOutputStream zos = new ZipOutputStream(baos)) {
+            for (int i = 1; i <= 10; i++) {
+                zos.putNextEntry(new ZipEntry(i + ".xml"));
+                zos.closeEntry();
+            }
+        }
+        // read the bytes only after the archive has been closed
+        new AutoDetectParser(tika).parse(
+                new ByteArrayInputStream(baos.toByteArray()),
+                new BodyContentHandler(-1),
+                new Metadata());
+    }
+
+    /**
+     * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
+     *  have been correctly included, and are available.
+     * Each sample file is then parsed through AutoDetectParser and its
+     *  content type, author/title metadata, XMPDM audio metadata and
+     *  extracted text are checked.
+     */
+    @SuppressWarnings("deprecation")
+    @Test
+    public void testOggFlacAudio() throws Exception {
+       // The four test files should all have similar test data
+       String[] testFiles = new String[] {
+             "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
+             "testOPUS.opus"
+       };
+       MediaType[] mediaTypes = new MediaType[] {
+               MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
+               MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
+       };
+       
+       // Check we can load the parsers (only asserts that each returns a non-null set of supported types)
+       VorbisParser vParser = new VorbisParser();
+       assertNotNull("Parser not found for " + mediaTypes[0], 
+                     vParser.getSupportedTypes(new ParseContext()));
+       
+       FlacParser fParser = new FlacParser();
+       assertNotNull("Parser not found for " + mediaTypes[1], 
+                     fParser.getSupportedTypes(new ParseContext()));
+       assertNotNull("Parser not found for " + mediaTypes[2], 
+                     fParser.getSupportedTypes(new ParseContext()));
+       
+       OpusParser oParser = new OpusParser();
+       assertNotNull("Parser not found for " + mediaTypes[3], 
+                     oParser.getSupportedTypes(new ParseContext()));
+       
+       // Check the configured composite parser has an entry for every media type
+       CompositeParser parser = (CompositeParser)tika.getParser();
+       for (MediaType mt : mediaTypes) {
+          assertNotNull("Parser not found for " + mt, 
parser.getParsers().get(mt) );
+       }
+       
+       // Have each file parsed, and check (testFiles[i] pairs with mediaTypes[i])
+       for (int i=0; i<testFiles.length; i++) {
+           String file = testFiles[i];
+           try (InputStream input = 
AutoDetectParserTest.class.getResourceAsStream(
+                   "/test-documents/" + file)) {
+               if (input == null) {
+                   fail("Could not find test file " + file);
+               }
+               Metadata metadata = new Metadata();
+               ContentHandler handler = new BodyContentHandler();
+               new AutoDetectParser(tika).parse(input, handler, metadata);
+
+               assertEquals("Incorrect content type for " + file,
+                       mediaTypes[i].toString(), 
metadata.get(Metadata.CONTENT_TYPE));
+
+               // Check some of the common metadata
+               // Old style metadata
+               assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+               assertEquals("Test Title", metadata.get(Metadata.TITLE));
+               // New style metadata
+               assertEquals("Test Artist", 
metadata.get(TikaCoreProperties.CREATOR));
+               assertEquals("Test Title", 
metadata.get(TikaCoreProperties.TITLE));
+
+               // Check some of the XMPDM metadata (the album check is skipped for the .opus sample)
+               if (!file.endsWith(".opus")) {
+                   assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+               }
+               assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+               assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+               assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
+
+               // Check some of the text
+               String content = handler.toString();
+               assertTrue(content.contains("Test Title"));
+               assertTrue(content.contains("Test Artist"));
+           }
+       }
+    }
+    
+    /**
+     * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
+     * list of supported parsers.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
+     */
+    @Test
+    public void testSpecificParserList() throws Exception {
+        AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new 
MyParser());
+        
+        InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8));
+        Metadata metadata = new Metadata();
+        parser.parse(is, new BodyContentHandler(), metadata, new 
ParseContext());
+        
+        assertEquals("value", metadata.get("MyParser"));
+    }
+
+    private static final MediaType MY_MEDIA_TYPE = new 
MediaType("application", "x-myparser");
+    
+    /**
+     * A test detector which always returns the type supported
+     *  by the test parser
+     */
+    @SuppressWarnings("serial")
+    private static class MyDetector implements Detector {
+        public MediaType detect(InputStream input, Metadata metadata) throws 
IOException {
+            return MY_MEDIA_TYPE;
+        }
+    }
+    
+    @SuppressWarnings("serial")
+    private static class MyParser extends AbstractParser {
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            Set<MediaType> supportedTypes = new HashSet<MediaType>();
+            supportedTypes.add(MY_MEDIA_TYPE);
+            return supportedTypes;
+        }
+
+        public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context) {
+            metadata.add("MyParser", "value");
+        }
+
+    }
+    
+    /**
+     * Minimal class to encapsulate all parameters -- the main reason for
+     * its existence is to aid in debugging via its toString() method.
+     *
+     * Getters and setters intentionally not provided.
+     */
+    private static class TestParams {
+
+        public String resourceRealName;
+        public String resourceStatedName;
+        public String realType;
+        public String statedType;
+        public String expectedContentFragment;
+
+
+        private TestParams(String resourceRealName,
+                           String resourceStatedName,
+                           String realType,
+                           String statedType,
+                           String expectedContentFragment) {
+            this.resourceRealName = resourceRealName;
+            this.resourceStatedName = resourceStatedName;
+            this.realType = realType;
+            this.statedType = statedType;
+            this.expectedContentFragment = expectedContentFragment;
+        }
+
+
+        /**
+         * Produces a string like the following:
+         *
+         * <pre>
+         * Test parameters:
+         *   resourceRealName        = /test-documents/testEXCEL.xls
+         *   resourceStatedName      = null
+         *   realType                = application/vnd.ms-excel
+         *   statedType              = null
+         *   expectedContentFragment = Sample Excel Worksheet
+         * </pre>
+         */
+        public String toString() {
+            return "Test parameters:\n"
+                + "  resourceRealName        = " + resourceRealName + "\n"
+                + "  resourceStatedName      = " + resourceStatedName + "\n"
+                + "  realType                = " + realType + "\n"
+                + "  statedType              = " + statedType + "\n"
+                + "  expectedContentFragment = " + expectedContentFragment + 
"\n";
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
new file mode 100644
index 0000000..66323d3
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.digesting.CommonsDigester;
+import org.junit.Test;
+
+
+public class DigestingParserTest extends TikaTest {
+
+    private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
+            "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+    private final int UNLIMITED = 1000000;//well, not really, but longer than 
input file
+    private final Parser p = new AutoDetectParser();
+
+    @Test
+    public void testBasic() throws Exception {
+        Map<CommonsDigester.DigestAlgorithm, String> expected =
+                new HashMap<CommonsDigester.DigestAlgorithm, String>();
+
+        
expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
+        
expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
+        
expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
+        
expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f"
 +
+                                                            
"82bc53764a0f1430d134ae3b70c32654");
+        
expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
+                                                            
"8b8a6923fdf251ddab72c6e4b5d54160" +
+                                                            
"9db917ba4260d1767995a844d8d654df");
+        
expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
+                                                            
"da4c21f36b54d7acd06fcf68e974663b"+
+                                                            
"fed1d256875be58d22beacf178154cc3"+
+                                                            
"a1178cb73443deaa53aa0840324708bb");
+
+        //test each one
+        for (CommonsDigester.DigestAlgorithm algo : 
CommonsDigester.DigestAlgorithm.values()) {
+            Metadata m = new Metadata();
+            XMLResult xml = getXML("test_recursive_embedded.docx",
+                    new DigestingParser(p, new CommonsDigester(UNLIMITED, 
algo)), m);
+            assertEquals(algo.toString(), expected.get(algo), m.get(P + 
algo.toString()));
+        }
+
+
+        //test comma separated
+        CommonsDigester.DigestAlgorithm[] algos = 
CommonsDigester.parse("md5,sha256,sha384,sha512");
+        Metadata m = new Metadata();
+        XMLResult xml = getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), 
m);
+        for (CommonsDigester.DigestAlgorithm algo : new 
CommonsDigester.DigestAlgorithm[]{
+                CommonsDigester.DigestAlgorithm.MD5,
+                CommonsDigester.DigestAlgorithm.SHA256,
+                CommonsDigester.DigestAlgorithm.SHA384,
+                CommonsDigester.DigestAlgorithm.SHA512}) {
+            assertEquals(algo.toString(), expected.get(algo), m.get(P + 
algo.toString()));
+        }
+
+        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
+        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+
+    }
+
+    @Test
+    public void testLimitedRead() throws Exception {
+        CommonsDigester.DigestAlgorithm algo = 
CommonsDigester.DigestAlgorithm.MD5;
+        int limit = 100;
+        byte[] bytes = new byte[limit];
+        InputStream is = 
getResourceAsStream("/test-documents/test_recursive_embedded.docx");
+        is.read(bytes, 0, limit);
+        is.close();
+        Metadata m = new Metadata();
+        try {
+            XMLResult xml = getXML(TikaInputStream.get(bytes),
+                    new DigestingParser(p, new CommonsDigester(100, algo)), m);
+        } catch (TikaException e) {
+            //thrown because this is just a file fragment
+            assertContains("Unexpected RuntimeException from 
org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
+                    e.getMessage());
+        }
+        String expectedMD5 = m.get(P+"MD5");
+
+        m = new Metadata();
+        XMLResult xml = getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(100, algo)), m);
+        assertEquals(expectedMD5, m.get(P+"MD5"));
+    }
+
+    @Test
+    public void testReset() throws Exception {
+        String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+        Metadata m = new Metadata();
+        XMLResult xml = getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(100, 
CommonsDigester.DigestAlgorithm.MD5)), m);
+        assertEquals(expectedMD5, m.get(P+"MD5"));
+    }
+
+    @Test
+    public void testNegativeMaxMarkLength() throws Exception {
+        Metadata m = new Metadata();
+        boolean ex = false;
+        try {
+            XMLResult xml = getXML("test_recursive_embedded.docx",
+                    new DigestingParser(p, new CommonsDigester(-1, 
CommonsDigester.DigestAlgorithm.MD5)), m);
+        } catch (IllegalArgumentException e) {
+            ex = true;
+        }
+        assertTrue("Exception not thrown", ex);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
new file mode 100644
index 0000000..71c07b7
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class ParsingReaderTest {
+
+    @Test
+    public void testPlainText() throws Exception {
+        String data = "test content";
+        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+        Reader reader = new ParsingReader(stream, "test.txt");
+        assertEquals('t', reader.read());
+        assertEquals('e', reader.read());
+        assertEquals('s', reader.read());
+        assertEquals('t', reader.read());
+        assertEquals(' ', reader.read());
+        assertEquals('c', reader.read());
+        assertEquals('o', reader.read());
+        assertEquals('n', reader.read());
+        assertEquals('t', reader.read());
+        assertEquals('e', reader.read());
+        assertEquals('n', reader.read());
+        assertEquals('t', reader.read());
+        assertEquals('\n', reader.read());
+        assertEquals(-1, reader.read());
+        reader.close();
+        assertEquals(-1, stream.read());
+    }
+
+    @Test
+    public void testXML() throws Exception {
+        String data = "<p>test <span>content</span></p>";
+        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+        Reader reader = new ParsingReader(stream, "test.xml");
+        assertEquals(' ', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('e', (char) reader.read());
+        assertEquals('s', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals(' ', (char) reader.read());
+        assertEquals(' ', (char) reader.read());
+        assertEquals('c', (char) reader.read());
+        assertEquals('o', (char) reader.read());
+        assertEquals('n', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('e', (char) reader.read());
+        assertEquals('n', (char) reader.read());
+        assertEquals('t', (char) reader.read());
+        assertEquals('\n', (char) reader.read());
+        assertEquals(-1, reader.read());
+        reader.close();
+        assertEquals(-1, stream.read());
+    }
+
+    /**
+     * Test case for TIKA-203
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
+     */
+    @Test
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream = ParsingReaderTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xls");
+        try (Reader reader = new ParsingReader(
+                new AutoDetectParser(), stream, metadata, new ParseContext())) 
{
+            // Metadata should already be available
+            assertEquals("Simple Excel document", 
metadata.get(TikaCoreProperties.TITLE));
+            // Check that the internal buffering isn't broken
+            assertEquals('F', (char) reader.read());
+            assertEquals('e', (char) reader.read());
+            assertEquals('u', (char) reader.read());
+            assertEquals('i', (char) reader.read());
+            assertEquals('l', (char) reader.read());
+            assertEquals('1', (char) reader.read());
+        }
+    }
+
+}

[11/13] tika git commit: TIKA-1855 -- first pass. Need to turn back on the forbidden-apis testCheck. More clean up remains.

Reply via email to