Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
 Fri May 29 14:36:21 2015
@@ -28,300 +28,299 @@ import org.junit.Test;
 
 /**
  * Tests that the various POI powered parsers are
- *  able to extract their embedded contents.
+ * able to extract their embedded contents.
  */
 public class POIContainerExtractionTest extends 
AbstractPOIContainerExtractionTest {
-   
+
     /**
      * For office files which don't have anything embedded in them
      */
     @Test
     public void testWithoutEmbedded() throws Exception {
-       ContainerExtractor extractor = new ParserContainerExtractor();
-       
-       String[] files = new String[] {
-             "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
-             "testVISIO.vsd", "test-outlook.msg"
-       };
-       for(String file : files) {
-          // Process it without recursing
-          TrackingHandler handler = process(file, extractor, false);
-          
-          // Won't have fired
-          assertEquals(0, handler.filenames.size());
-          assertEquals(0, handler.mediaTypes.size());
-          
-          // Ditto with recursing
-          handler = process(file, extractor, true);
-          assertEquals(0, handler.filenames.size());
-          assertEquals(0, handler.mediaTypes.size());
-       }
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        String[] files = new String[]{
+                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+                "testVISIO.vsd", "test-outlook.msg"
+        };
+        for (String file : files) {
+            // Process it without recursing
+            TrackingHandler handler = process(file, extractor, false);
+
+            // Won't have fired
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+
+            // Ditto with recursing
+            handler = process(file, extractor, true);
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+        }
     }
-    
+
     /**
      * Office files with embedded images, but no other
-     *  office files in them
+     * office files in them
      */
     @Test
     public void testEmbeddedImages() throws Exception {
-       ContainerExtractor extractor = new ParserContainerExtractor();
-       TrackingHandler handler;
-       
-       // Excel with 1 image
-       handler = process("testEXCEL_1img.xls", extractor, false);
-       assertEquals(1, handler.filenames.size());
-       assertEquals(1, handler.mediaTypes.size());
-       
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-       
-       // PowerPoint with 2 images + sound
-       // TODO
-       
-       
-       // Word with 1 image
-       handler = process("testWORD_1img.doc", extractor, false);
-       assertEquals(1, handler.filenames.size());
-       assertEquals(1, handler.mediaTypes.size());
-       
-       assertEquals("image1.png", handler.filenames.get(0));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-       
-       // Word with 3 images
-       handler = process("testWORD_3imgs.doc", extractor, false);
-       assertEquals(3, handler.filenames.size());
-       assertEquals(3, handler.mediaTypes.size());
-       
-       assertEquals("image1.png", handler.filenames.get(0));
-       assertEquals("image2.jpg", handler.filenames.get(1));
-       assertEquals("image3.png", handler.filenames.get(2));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xls", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.doc", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.doc", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.jpg", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
     }
-    
+
     /**
      * Office files which have other office files
-     *  embedded into them. The embedded office files
-     *  will sometimes have images in them.
-     *  
-     *  eg xls
-     *       -> word
-     *           -> image
-     *           -> image
-     *       -> powerpoint
-     *       -> excel
-     *           -> image
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p/>
+     * eg xls
+     * -> word
+     * -> image
+     * -> image
+     * -> powerpoint
+     * -> excel
+     * -> image
      */
     @Test
     public void testEmbeddedOfficeFiles() throws Exception {
-       ContainerExtractor extractor = new ParserContainerExtractor();
-       TrackingHandler handler;
-       
-       
-       // Excel with a word doc and a powerpoint doc, both of which have 
images in them
-       // Without recursion, should see both documents + the images
-       handler = process("testEXCEL_embeded.xls", extractor, false);
-       assertEquals(5, handler.filenames.size());
-       assertEquals(5, handler.mediaTypes.size());
-       
-       // We don't know their filenames
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
-       assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-       assertEquals("MBD00032A24.doc", handler.filenames.get(4));
-       // But we do know their types
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office 
doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office 
doc
-       
-       
-       // With recursion, should get the images embedded in the office files 
too
-       handler = process("testEXCEL_embeded.xls", extractor, true);
-       assertEquals(17, handler.filenames.size());
-       assertEquals(17, handler.mediaTypes.size());
-       
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
-       assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-       assertEquals("1", handler.filenames.get(4));
-       assertEquals(null, handler.filenames.get(5));
-       assertEquals("2", handler.filenames.get(6));
-       assertEquals("image1.png", handler.filenames.get(7));
-       assertEquals("image2.jpg", handler.filenames.get(8));
-       assertEquals("image3.png", handler.filenames.get(9));
-       assertEquals("image1.png", handler.filenames.get(16));
-
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded 
presentation
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
-
-       // Word with .docx, powerpoint and excel
-       handler = process("testWORD_embeded.doc", extractor, false);
-       assertEquals(9, handler.filenames.size());
-       assertEquals(9, handler.mediaTypes.size());
-       
-       // Filenames are a bit iffy...
-       // Should really be 3*embedded pictures then 3*icons then embedded docs
-       assertEquals("image1.emf", handler.filenames.get(0));
-       assertEquals("image4.png", handler.filenames.get(1));
-       assertEquals("image5.jpg", handler.filenames.get(2));
-       assertEquals("image6.png", handler.filenames.get(3));
-       assertEquals("image2.emf", handler.filenames.get(4));
-       assertEquals("image3.emf", handler.filenames.get(5));
-       assertEquals(null, handler.filenames.get(6));
-       assertEquals("_1345471035.ppt", handler.filenames.get(7));
-       assertEquals("_1345470949.xls", handler.filenames.get(8));
-       
-       // But we do know their types
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc?
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc?
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc?
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office 
doc
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office 
doc
-       
-       
-       // With recursion, should get their images too
-       handler = process("testWORD_embeded.doc", extractor, true);
-       assertEquals(16, handler.filenames.size());
-       assertEquals(16, handler.mediaTypes.size());
-       
-       // We don't know their filenames, except for doc images + docx
-       assertEquals("image1.emf", handler.filenames.get(0));
-       assertEquals("image4.png", handler.filenames.get(1));
-       assertEquals("image5.jpg", handler.filenames.get(2));
-       assertEquals("image6.png", handler.filenames.get(3));
-       assertEquals("image2.emf", handler.filenames.get(4));
-       assertEquals("image3.emf", handler.filenames.get(5));
-       assertEquals(null, handler.filenames.get(6));
-       assertEquals("image2.png", handler.filenames.get(7));
-       assertEquals("image3.jpeg", handler.filenames.get(8));
-       assertEquals("image4.png", handler.filenames.get(9));
-       for(int i=11; i<14; i++) {
-          assertNull(handler.filenames.get(i));
-       }
-       // But we do know their types
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside 
.docx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside 
.docx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside 
.docx
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office 
doc
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside 
.xls
-       
-       
-       // PowerPoint with excel and word
-       handler = process("testPPT_embeded.ppt", extractor, false);
-       assertEquals(7, handler.filenames.size());
-       assertEquals(7, handler.mediaTypes.size());
-       
-       // We don't get all that helpful filenames
-       assertEquals("1", handler.filenames.get(0));
-       assertEquals("2", handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
-       assertEquals(null, handler.filenames.get(3));
-       assertEquals(null, handler.filenames.get(4));
-       assertEquals(null, handler.filenames.get(5));
-       assertEquals(null, handler.filenames.get(6));
-       // But we do know their types
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office 
doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded 
office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
-       
-       // Run again on PowerPoint but with recursion
-       handler = process("testPPT_embeded.ppt", extractor, true);
-       assertEquals(11, handler.filenames.size());
-       assertEquals(11, handler.mediaTypes.size());
-       
-       assertEquals("1",  handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals("2",  handler.filenames.get(2));
-       assertEquals("image1.png", handler.filenames.get(3));
-       assertEquals("image2.jpg", handler.filenames.get(4));
-       assertEquals("image3.png", handler.filenames.get(5));
-       assertEquals(null, handler.filenames.get(6));
-       assertEquals(null, handler.filenames.get(7));
-       assertEquals(null, handler.filenames.get(8));
-       assertEquals(null, handler.filenames.get(9));
-       assertEquals(null, handler.filenames.get(10));
-       
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside .xls
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside 
.docx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside 
.docx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside 
.docx
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded 
office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
-       
-       
-       // Word, with a non-office file (PDF)
-       handler = process("testWORD_embedded_pdf.doc", extractor, true);
-       assertEquals(2, handler.filenames.size());
-       assertEquals(2, handler.mediaTypes.size());
-       
-       assertEquals("image1.emf", handler.filenames.get(0));
-       assertEquals("_1402837031.pdf", handler.filenames.get(1));
-
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
pdf
-       assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF 
itself
-       
-       
-       
-       // Outlook with a text file and a word document
-       handler = process("testMSG_att_doc.msg", extractor, true);
-       assertEquals(2, handler.filenames.size());
-       assertEquals(2, handler.mediaTypes.size());
-       
-       assertEquals("test-unicode.doc", handler.filenames.get(0));
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
-       
-       assertEquals("pj1.txt", handler.filenames.get(1));
-       assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
-       
-       
-       // Outlook with a pdf and another outlook message
-       handler = process("testMSG_att_msg.msg", extractor, true);
-       assertEquals(2, handler.filenames.size());
-       assertEquals(2, handler.mediaTypes.size());
-       
-       assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
-       assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
-       
-       assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
-       assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have 
images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xls", extractor, false);
+        assertEquals(5, handler.filenames.size());
+        assertEquals(5, handler.mediaTypes.size());
+
+        // We don't know their filenames
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office 
doc
+
+
+        // With recursion, should get the images embedded in the office files 
too
+        handler = process("testEXCEL_embeded.xls", extractor, true);
+        assertEquals(17, handler.filenames.size());
+        assertEquals(17, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("1", handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals("2", handler.filenames.get(6));
+        assertEquals("image1.png", handler.filenames.get(7));
+        assertEquals("image2.jpg", handler.filenames.get(8));
+        assertEquals("image3.png", handler.filenames.get(9));
+        assertEquals("image1.png", handler.filenames.get(16));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded 
presentation
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.doc", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // Filenames are a bit iffy...
+        // Should really be 3*embedded pictures then 3*icons then embedded docs
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("_1345471035.ppt", handler.filenames.get(7));
+        assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office 
doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office 
doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.doc", extractor, true);
+        assertEquals(16, handler.filenames.size());
+        assertEquals(16, handler.mediaTypes.size());
+
+        // We don't know their filenames, except for doc images + docx
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("image2.png", handler.filenames.get(7));
+        assertEquals("image3.jpeg", handler.filenames.get(8));
+        assertEquals("image4.png", handler.filenames.get(9));
+        for (int i = 11; i < 14; i++) {
+            assertNull(handler.filenames.get(i));
+        }
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside 
.docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside 
.docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside 
.docx
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office 
doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside 
.xls
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.ppt", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We don't get all that helpful filenames
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals("2", handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals(null, handler.filenames.get(3));
+        assertEquals(null, handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office 
doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+        // Run again on PowerPoint but with recursion
+        handler = process("testPPT_embeded.ppt", extractor, true);
+        assertEquals(11, handler.filenames.size());
+        assertEquals(11, handler.mediaTypes.size());
+
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals("2", handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals(null, handler.filenames.get(7));
+        assertEquals(null, handler.filenames.get(8));
+        assertEquals(null, handler.filenames.get(9));
+        assertEquals(null, handler.filenames.get(10));
+
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside 
.xls
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside 
.docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside 
.docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside 
.docx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+        // Word, with a non-office file (PDF)
+        handler = process("testWORD_embedded_pdf.doc", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
pdf
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF 
itself
+
+
+        // Outlook with a text file and a word document
+        handler = process("testMSG_att_doc.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("test-unicode.doc", handler.filenames.get(0));
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+        assertEquals("pj1.txt", handler.filenames.get(1));
+        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+        // Outlook with a pdf and another outlook message
+        handler = process("testMSG_att_msg.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
     }
 
     @Test

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +17,6 @@
 package org.apache.tika.parser.microsoft;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
 import java.util.Locale;
@@ -148,9 +147,9 @@ public class PowerPointParserTest extend
 
         // Make sure boilerplate text didn't come through:
         assertEquals(-1, content.indexOf("Click to edit Master"));
- 
-       //TIKA-1171
-       assertEquals(-1, content.indexOf("*"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
     }
 
     /**
@@ -207,54 +206,54 @@ public class PowerPointParserTest extend
      */
     @Test
     public void testCustomProperties() throws Exception {
-       InputStream input = PowerPointParserTest.class.getResourceAsStream(
-             "/test-documents/testPPT_custom_props.ppt");
-       Metadata metadata = new Metadata();
-       
-       try {
-          ContentHandler handler = new BodyContentHandler(-1);
-          ParseContext context = new ParseContext();
-          context.set(Locale.class, Locale.US);
-          new OfficeParser().parse(input, handler, metadata, context);
-       } finally {
-          input.close();
-       }
-       
-       assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("JOUVIN ETIENNE",       metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("EJ04325S",             metadata.get(TikaCoreProperties.MODIFIER));
-       assertEquals("EJ04325S",             metadata.get(Metadata.LAST_AUTHOR));
-       assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
-       assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
-       assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
-       assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
-       assertEquals("1",                    metadata.get(Office.SLIDE_COUNT));
-       assertEquals("3",                    metadata.get(Office.WORD_COUNT));
-       assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("true",                 metadata.get("custom:myCustomBoolean"));
-       assertEquals("3",                    metadata.get("custom:myCustomNumber"));
-       assertEquals("MyStringValue",        metadata.get("custom:MyCustomString"));
-       assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
-       assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+        InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_custom_props.ppt");
+        Metadata metadata = new Metadata();
+
+        try {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        } finally {
+            input.close();
+        }
+
+        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Office.WORD_COUNT));
+        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
     }
 
     // TIKA-1025
     @Test
     public void testEmbeddedPlacedholder() throws Exception {
-       XMLResult result = getXML("testPPT_embedded2.ppt");
-       assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
-       assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
+        XMLResult result = getXML("testPPT_embedded2.ppt");
+        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
     }
 
     // TIKA-817
     @Test
     public void testAutoDatePPT() throws Exception {
-       //decision was made in POI-52367 not to generate
-       //autodate automatically.  For pptx, where value is stored,
-       //value is extracted.  For ppt, however, no date is extracted.
-       XMLResult result = getXML("testPPT_autodate.ppt");
-       assertContains(
-               "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
-               result.xml);
+        //decision was made in POI-52367 not to generate
+        //autodate automatically.  For pptx, where value is stored,
+        //value is extracted.  For ppt, however, no date is extracted.
+        XMLResult result = getXML("testPPT_autodate.ppt");
+        assertContains(
+                "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
+                result.xml);
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,21 +31,21 @@ import org.xml.sax.ContentHandler;
 
 /**
  * Tests for Microsoft Project (MPP) Files.
- * 
+ *
  * Note - we don't currently have a dedicated Project
  *  Parser, all we have is the common office metadata
  */
 public class ProjectParserTest {
-  
+
     @Test
     public void testProject2003() throws Exception {
-       InputStream input = ProjectParserTest.class.getResourceAsStream(
-             "/test-documents/testPROJECT2003.mpp");
-       try {
-          doTestProject(input);
-       } finally {
-          input.close();
-       }
+        InputStream input = ProjectParserTest.class.getResourceAsStream(
+                "/test-documents/testPROJECT2003.mpp");
+        try {
+            doTestProject(input);
+        } finally {
+            input.close();
+        }
     }
 
     @Test
@@ -60,40 +60,40 @@ public class ProjectParserTest {
     }
 
     private void doTestProject(InputStream input) throws Exception {
-       Metadata metadata = new Metadata();
-       ContentHandler handler = new BodyContentHandler();
-       new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-       assertEquals(
-               "application/vnd.ms-project",
-               metadata.get(Metadata.CONTENT_TYPE));
-       
-       assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
-       assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
-       assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
-       assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS));
-       assertEquals("Comment Vulpes vulpes comment", metadata.get(TikaCoreProperties.COMMENTS));
-       
-       assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY));
-       assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER));
-       assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY));
-       
-       assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED));
-       assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE));
-       assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED));
-       assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE));
-       
-       // Custom Project metadata is present with prefix
-       assertEquals("0%", metadata.get("custom:% Complete"));
-       assertEquals("0%", metadata.get("custom:% Work Complete"));
-       assertEquals("\u00a3"+"0.00", metadata.get("custom:Cost"));
-       assertEquals("2d?", metadata.get("custom:Duration"));
-       assertEquals("16h", metadata.get("custom:Work"));
-       
-       // Currently, we don't do textual contents of the file
-       String content = handler.toString();
-       assertEquals("", content);
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+        assertEquals(
+                "application/vnd.ms-project",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+        assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Comment Vulpes vulpes comment", metadata.get(TikaCoreProperties.COMMENTS));
+
+        assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY));
+        assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER));
+        assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY));
+
+        assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE));
+
+        // Custom Project metadata is present with prefix
+        assertEquals("0%", metadata.get("custom:% Complete"));
+        assertEquals("0%", metadata.get("custom:% Work Complete"));
+        assertEquals("\u00a3" + "0.00", metadata.get("custom:Cost"));
+        assertEquals("2d?", metadata.get("custom:Duration"));
+        assertEquals("16h", metadata.get("custom:Work"));
+
+        // Currently, we don't do textual contents of the file
+        String content = handler.toString();
+        assertEquals("", content);
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Fri May 29 14:36:21 2015
@@ -36,66 +36,66 @@ import org.xml.sax.ContentHandler;
  * Tests for the TNEF (winmail.dat) parser
  */
 public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
-   private static final String file = "testWINMAIL.dat";
-   
-   @Test
-   public void testBasics() throws Exception {
-      TikaInputStream stream = getTestFile(file);
-      Detector detector = new DefaultDetector();
-      try {
-         assertEquals(
-                 MediaType.application("vnd.ms-tnef"),
-                 detector.detect(stream, new Metadata()));
-     } finally {
-         stream.close();
-     }
-   }
-   
-   @Test
-   public void testMetadata() throws Exception {
-      TikaInputStream stream = getTestFile(file);
-      
-      Metadata metadata = new Metadata();
-      ContentHandler handler = new BodyContentHandler();
-      
-      TNEFParser tnef = new TNEFParser();
-      tnef.parse(stream, handler, metadata, new ParseContext());
-      
-      assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
-      assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
-   }
-   
+    private static final String file = "testWINMAIL.dat";
+
+    @Test
+    public void testBasics() throws Exception {
+        TikaInputStream stream = getTestFile(file);
+        Detector detector = new DefaultDetector();
+        try {
+            assertEquals(
+                    MediaType.application("vnd.ms-tnef"),
+                    detector.detect(stream, new Metadata()));
+        } finally {
+            stream.close();
+        }
+    }
+
+    @Test
+    public void testMetadata() throws Exception {
+        TikaInputStream stream = getTestFile(file);
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        TNEFParser tnef = new TNEFParser();
+        tnef.parse(stream, handler, metadata, new ParseContext());
+
+        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+    }
+
     /**
      * Check the Rtf and Attachments are returned
-     *  as expected
+     * as expected
      */
-   @Test
+    @Test
     public void testBodyAndAttachments() throws Exception {
-       ContainerExtractor extractor = new ParserContainerExtractor();
-       
-       // Process it with recursing
-       // Will have the message body RTF and the attachments
-       TrackingHandler handler = process(file, extractor, true);
-       assertEquals(6, handler.filenames.size());
-       assertEquals(6, handler.mediaTypes.size());
-       
-       // We know the filenames for all of them
-       assertEquals("message.rtf", handler.filenames.get(0));
-       assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
-       
-       assertEquals("quick.doc", handler.filenames.get(1));
-       assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
-       
-       assertEquals("quick.html", handler.filenames.get(2));
-       assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
-       
-       assertEquals("quick.pdf", handler.filenames.get(3));
-       assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
-       
-       assertEquals("quick.txt", handler.filenames.get(4));
-       assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
-       
-       assertEquals("quick.xml", handler.filenames.get(5));
-       assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        // Process it with recursing
+        // Will have the message body RTF and the attachments
+        TrackingHandler handler = process(file, extractor, true);
+        assertEquals(6, handler.filenames.size());
+        assertEquals(6, handler.mediaTypes.size());
+
+        // We know the filenames for all of them
+        assertEquals("message.rtf", handler.filenames.get(0));
+        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+
+        assertEquals("quick.doc", handler.filenames.get(1));
+        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
+
+        assertEquals("quick.html", handler.filenames.get(2));
+        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+
+        assertEquals("quick.pdf", handler.filenames.get(3));
+        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+
+        assertEquals("quick.txt", handler.filenames.get(4));
+        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+
+        assertEquals("quick.xml", handler.filenames.get(5));
+        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Fri May 29 14:36:21 2015
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -89,8 +89,8 @@ public class WordParserTest extends Tika
         Metadata metadata = result.metadata;
 
         assertEquals(
-                     "application/msword",
-                     metadata.get(Metadata.CONTENT_TYPE));
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
         assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
         assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
@@ -117,9 +117,9 @@ public class WordParserTest extends Tika
         xml = getXML("testWORD_3imgs.doc").xml;
 
         // Images 1-3
-        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
-        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
-        assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
+        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));
 
         // Text too
         assertTrue(xml.contains("<p>The end!"));
@@ -131,7 +131,7 @@ public class WordParserTest extends Tika
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-        assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
 
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
@@ -140,7 +140,7 @@ public class WordParserTest extends Tika
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-        assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
     @Test
@@ -277,19 +277,19 @@ public class WordParserTest extends Tika
      */
     @Test
     public void testNoFormat() throws Exception {
-       ContentHandler handler = new BodyContentHandler();
-       Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
 
-       InputStream stream = WordParserTest.class.getResourceAsStream(
-               "/test-documents/testWORD_no_format.doc");
-       try {
-           new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-       } finally {
-           stream.close();
-       }
+        InputStream stream = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_no_format.doc");
+        try {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
 
-       String content = handler.toString();
-       assertContains("Will generate an exception", content);
+        String content = handler.toString();
+        assertContains("Will generate an exception", content);
     }
 
     /**
@@ -297,55 +297,55 @@ public class WordParserTest extends Tika
      */
     @Test
     public void testCustomProperties() throws Exception {
-       InputStream input = WordParserTest.class.getResourceAsStream(
-             "/test-documents/testWORD_custom_props.doc");
-       Metadata metadata = new Metadata();
-
-       try {
-          ContentHandler handler = new BodyContentHandler(-1);
-          ParseContext context = new ParseContext();
-          context.set(Locale.class, Locale.US);
-          new OfficeParser().parse(input, handler, metadata, context);
-       } finally {
-          input.close();
-       }
-
-       assertEquals("application/msword",   metadata.get(Metadata.CONTENT_TYPE));
-       assertEquals("EJ04325S",             metadata.get(TikaCoreProperties.CREATOR));
-       assertEquals("Etienne Jouvin",       metadata.get(TikaCoreProperties.MODIFIER));
-       assertEquals("Etienne Jouvin",       metadata.get(Metadata.LAST_AUTHOR));
-       assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
-       assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
-       assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
-       assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
-       assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION));
-       assertEquals("1",                    metadata.get(Office.PAGE_COUNT));
-       assertEquals("2",                    metadata.get(Office.WORD_COUNT));
-       assertEquals("My Title",             metadata.get(TikaCoreProperties.TITLE));
-       assertEquals("My Keyword",           metadata.get(TikaCoreProperties.KEYWORDS));
-       assertEquals("Normal.dotm",          metadata.get(OfficeOpenXMLExtended.TEMPLATE));
-       assertEquals("My Comments",          metadata.get(TikaCoreProperties.COMMENTS));
-       // TODO: Move to OO subject in Tika 2.0
-       assertEquals("My subject",           metadata.get(Metadata.SUBJECT));
-       assertEquals("My subject",           metadata.get(OfficeOpenXMLCore.SUBJECT));
-       assertEquals("EDF-DIT",              metadata.get(OfficeOpenXMLExtended.COMPANY));
-       assertEquals("MyStringValue",        metadata.get("custom:MyCustomString"));
-       assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+        InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_custom_props.doc");
+        Metadata metadata = new Metadata();
+
+        try {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        } finally {
+            input.close();
+        }
+
+        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+        assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("1", metadata.get(Office.PAGE_COUNT));
+        assertEquals("2", metadata.get(Office.WORD_COUNT));
+        assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+        assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
+        // TODO: Move to OO subject in Tika 2.0
+        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
     }
 
     @Test
     public void testExceptions1() throws Exception {
-      XMLResult xml;
-      Level logLevelStart = Logger.getRootLogger().getLevel();
-      Logger.getRootLogger().setLevel(Level.ERROR);
-      try {
-        xml = getXML("testException1.doc");
-        assertContains("total population", xml.xml);
-        xml = getXML("testException2.doc");
-        assertContains("electric charge", xml.xml);
-      } finally {
-        Logger.getRootLogger().setLevel(logLevelStart);
-      }
+        XMLResult xml;
+        Level logLevelStart = Logger.getRootLogger().getLevel();
+        Logger.getRootLogger().setLevel(Level.ERROR);
+        try {
+            xml = getXML("testException1.doc");
+            assertContains("total population", xml.xml);
+            xml = getXML("testException2.doc");
+            assertContains("electric charge", xml.xml);
+        } finally {
+            Logger.getRootLogger().setLevel(logLevelStart);
+        }
     }
 
     @Test
@@ -364,8 +364,8 @@ public class WordParserTest extends Tika
         Metadata metadata = result.metadata;
 
         assertEquals(
-                     "application/msword",
-                     metadata.get(Metadata.CONTENT_TYPE));
+                "application/msword",
+                metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
         assertContains("example.com", xml);
 
@@ -381,7 +381,7 @@ public class WordParserTest extends Tika
 
     @Test
     public void testControlCharacter() throws Exception {
-      assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+        assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
     }
 
     @Test

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java Fri May 29 14:36:21 2015
@@ -27,7 +27,7 @@ import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
 public class WriteProtectedParserTest {
-  
+
     @Test
     public void testWriteProtected() throws Exception {
         InputStream input = ExcelParserTest.class.getResourceAsStream(

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri May 29 14:36:21 2015
@@ -29,11 +29,11 @@ import org.junit.Test;
 
 /**
  * Tests that the various POI OOXML powered parsers are
- *  able to extract their embedded contents.
+ * able to extract their embedded contents.
  */
 public class OOXMLContainerExtractionTest extends 
AbstractPOIContainerExtractionTest {
     private ContainerExtractor extractor;
-    
+
     @Before
     public void setUp() {
         Tika tika = new Tika();
@@ -41,231 +41,231 @@ public class OOXMLContainerExtractionTes
                 tika.getParser(), tika.getDetector());
     }
 
-   /**
+    /**
      * For office files which don't have anything embedded in them
      */
     @Test
     public void testWithoutEmbedded() throws Exception {
-       assertEmbeddedFiles(0, "testEXCEL.xlsx" );
-       assertEmbeddedFiles(0, "testWORD.docx" );
-       assertEmbeddedFiles(1 /* thumbnail as default */, "testPPT.pptx" ); 
+        assertEmbeddedFiles(0, "testEXCEL.xlsx");
+        assertEmbeddedFiles(0, "testWORD.docx");
+        assertEmbeddedFiles(1 /* thumbnail as default */, "testPPT.pptx");
     }
 
-    private void assertEmbeddedFiles(int expectedNbFiles, String file ) throws 
Exception {
-    // Process it without recursing
-      TrackingHandler handler = process(file, extractor, false);
-      
-      // Won't have fired
-      assertEquals(expectedNbFiles, handler.filenames.size());
-      assertEquals(expectedNbFiles, handler.mediaTypes.size());
-      
-      // Ditto with recursing
-      handler = process(file, extractor, true);
-      assertEquals(expectedNbFiles, handler.filenames.size());
-      assertEquals(expectedNbFiles, handler.mediaTypes.size());
+    private void assertEmbeddedFiles(int expectedNbFiles, String file) throws 
Exception {
+        // Process it without recursing
+        TrackingHandler handler = process(file, extractor, false);
+
+        // Won't have fired
+        assertEquals(expectedNbFiles, handler.filenames.size());
+        assertEquals(expectedNbFiles, handler.mediaTypes.size());
+
+        // Ditto with recursing
+        handler = process(file, extractor, true);
+        assertEquals(expectedNbFiles, handler.filenames.size());
+        assertEquals(expectedNbFiles, handler.mediaTypes.size());
     }
-    
+
     /**
      * Office files with embedded images, but no other
-     *  office files in them
+     * office files in them
      */
     @Test
     public void testEmbeddedImages() throws Exception {
-       TrackingHandler handler;
-       
-       // Excel with 1 image
-       handler = process("testEXCEL_1img.xlsx", extractor, false);
-       assertEquals(1, handler.filenames.size());
-       assertEquals(1, handler.mediaTypes.size());
-       
-       assertEquals("image1.png", handler.filenames.get(0));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-       
-       // PowerPoint with 2 images + sound
-       // TODO Figure out why we can't find the sound anywhere...
-       handler = process("testPPT_2imgs.pptx", extractor, false);
-       assertEquals(3 + 1 /*thumbnail */, handler.filenames.size());
-       assertEquals(3 + 1 /*thumbnail */, handler.mediaTypes.size());
-       
-       assertEquals("image1.png", handler.filenames.get(0));
-       assertEquals("image2.gif", handler.filenames.get(1));
-       assertEquals("image3.png", handler.filenames.get(2));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-       assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
-       
-       
-       // Word with 1 image
-       handler = process("testWORD_1img.docx", extractor, false);
-       assertEquals(1, handler.filenames.size());
-       assertEquals(1, handler.mediaTypes.size());
-       
-       assertEquals("image1.png", handler.filenames.get(0));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-       
-       // Word with 3 images
-       handler = process("testWORD_3imgs.docx", extractor, false);
-       assertEquals(3, handler.filenames.size());
-       assertEquals(3, handler.mediaTypes.size());
-       
-       assertEquals("image2.png", handler.filenames.get(0));
-       assertEquals("image3.jpeg", handler.filenames.get(1));
-       assertEquals("image4.png", handler.filenames.get(2));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xlsx", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO Figure out why we can't find the sound anywhere...
+        handler = process("testPPT_2imgs.pptx", extractor, false);
+        assertEquals(3 + 1 /*thumbnail */, handler.filenames.size());
+        assertEquals(3 + 1 /*thumbnail */, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.gif", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.docx", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.docx", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image2.png", handler.filenames.get(0));
+        assertEquals("image3.jpeg", handler.filenames.get(1));
+        assertEquals("image4.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
     }
-    
+
     /**
      * Office files which have other office files
-     *  embedded into them. The embedded office files
-     *  will sometimes have images in them.
-     *  
-     *  eg xls
-     *       -> word
-     *           -> image
-     *           -> image
-     *       -> powerpoint
-     *       -> excel
-     *           -> image
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p/>
+     * eg xls
+     * -> word
+     * -> image
+     * -> image
+     * -> powerpoint
+     * -> excel
+     * -> image
      */
     @Test
     public void testEmbeddedOfficeFiles() throws Exception {
-       TrackingHandler handler;
-       
-       
-       // Excel with a word doc and a powerpoint doc, both of which have 
images in them
-       // Without recursion, should see both documents + the images
-       handler = process("testEXCEL_embeded.xlsx", extractor, false);
-       assertEquals(7, handler.filenames.size());
-       assertEquals(7, handler.mediaTypes.size());
-       
-       // We know the rough filenames
-       assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", 
handler.filenames.get(0));
-       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(1));
-       assertEquals("Microsoft_Office_Word_Document2.docx", 
handler.filenames.get(2));
-       assertEquals("image1.png", handler.filenames.get(3));
-       assertEquals("image2.emf", handler.filenames.get(4));
-       assertEquals("image3.emf", handler.filenames.get(5));
-       assertEquals("image4.emf", handler.filenames.get(6));
-       // But we do know their types
-       assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(1));  // Embedded office 
doc
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(2)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded 
office doc
-       
-       
-       // With recursion, should get the images embedded in the office files 
too
-       handler = process("testEXCEL_embeded.xlsx", extractor, true);
-       assertEquals(23 + 1 /*thumbnail */, handler.filenames.size());
-       assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size());
-       
-       assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside 
.pptx
-       assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   PNG inside 
.pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside 
.pptx
-       assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); //   .xlsx inside 
.pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //     PNG inside 
.xlsx inside .pptx
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); //   .docx inside 
.pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); //     PNG inside 
.docx inside .pptx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); //     JPG inside 
.docx inside .pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); //     PNG inside 
.docx inside .pptx
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); //   .doc inside 
.pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); //    PNG inside 
.doc inside .pptx
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); //   Icon of item 
inside .pptx
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); //   Icon of item 
inside .pptx
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); //   Icon of item 
inside .pptx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(15));  // Embedded 
thumbnail
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(16));  // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(17));  //   PNG inside 
.doc
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(18)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(19));  //   PNG inside 
.docx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(20)); // Embedded image
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(21)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(22)); // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(23)); // Icon of embedded 
office doc
-       
-       
-       // Word with .docx, powerpoint and excel
-       handler = process("testWORD_embeded.docx", extractor, false);
-       assertEquals(9, handler.filenames.size());
-       assertEquals(9, handler.mediaTypes.size());
-       
-       // We know their rough filenames
-       assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", 
handler.filenames.get(0));
-       assertEquals("image6.emf", handler.filenames.get(1));
-       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(2));
-       assertEquals("image1.png", handler.filenames.get(3));
-       assertEquals("image2.jpeg", handler.filenames.get(4));
-       assertEquals("image3.png", handler.filenames.get(5));
-       assertEquals("image4.emf", handler.filenames.get(6));
-       assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", 
handler.filenames.get(7));
-       assertEquals("image5.emf", handler.filenames.get(8));
-       // But we do know their types
-       assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(1));  // Icon of embedded 
office doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(2));  // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  // Embedded image
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  // Embedded image
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of embedded 
office doc 
-       assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // Embeded office 
doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of embedded 
office doc
-       
-       
-       // With recursion, should get their images too
-       handler = process("testWORD_embeded.docx", extractor, true);
-       assertEquals(14 + 1 /* thumbnail */, handler.filenames.size());
-       assertEquals(14 + 1 /* thumbnail */, handler.mediaTypes.size());
-       
-       // But we do know their types
-       assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside 
.pptx
-       assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   GIF inside 
.pptx
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside 
.pptx
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  // Embedded 
thumbnail
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(5));  // Icon of embedded 
office doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(6));  // Embedded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //   PNG inside .doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(8));  // Embedded image
-       assertEquals(TYPE_JPG, handler.mediaTypes.get(9));  // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(10));  // Embedded image
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(11));  // Icon of 
embedded office doc 
-       assertEquals(TYPE_XLSX, handler.mediaTypes.get(12)); // Embeded office 
doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(13));  //   PNG inside 
.xlsx
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of embedded 
office doc
-       
-       
-       // PowerPoint with excel and word
-       handler = process("testPPT_embeded.pptx", extractor, false);
-       assertEquals(9 + 1 /* thumbnail */, handler.filenames.size());
-       assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size());
-       
-       // We don't know their exact filenames
-       assertEquals("image4.png", handler.filenames.get(0));
-       assertEquals("image5.gif", handler.filenames.get(1));
-       assertEquals("image6.png", handler.filenames.get(2));
-       assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", 
handler.filenames.get(3));
-       assertEquals("Microsoft_Office_Word_Document2.docx", 
handler.filenames.get(4));
-       assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(5));
-       assertEquals("image1.emf", handler.filenames.get(6));
-       assertEquals("image2.emf", handler.filenames.get(7));
-       assertEquals("image3.emf", handler.filenames.get(8));
-       // But we do know their types
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));  // Embedded image
-       assertEquals(TYPE_GIF, handler.mediaTypes.get(1));  // Embedded image
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));  // Embedded image
-       assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office 
doc
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office 
doc
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(5));  // Embedded office 
doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(7));  // Icon of embedded 
office doc
-       assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of embedded 
office doc
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have 
images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xlsx", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We know the rough filenames
+        assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", 
handler.filenames.get(0));
+        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(1));
+        assertEquals("Microsoft_Office_Word_Document2.docx", 
handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals("image4.emf", handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1));  // Embedded office 
doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(2)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded 
office doc
+
+
+        // With recursion, should get the images embedded in the office files 
too
+        handler = process("testEXCEL_embeded.xlsx", extractor, true);
+        assertEquals(23 + 1 /*thumbnail */, handler.filenames.size());
+        assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size());
+
+        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside 
.pptx
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   GIF inside 
.pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside 
.pptx
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); //   .xlsx inside 
.pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //     PNG inside 
.xlsx inside .pptx
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); //   .docx inside 
.pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); //     PNG inside 
.docx inside .pptx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); //     JPG inside 
.docx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); //     PNG inside 
.docx inside .pptx
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); //   .doc inside 
.pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); //    PNG inside 
.doc inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); //   Icon of item 
inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); //   Icon of item 
inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); //   Icon of item 
inside .pptx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(15));  // Embedded 
thumbnail
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(16));  // Embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(17));  //   PNG inside 
.doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(18)); // Embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(19));  //   PNG inside 
.docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(20)); // Embedded image
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(21)); // Icon of 
embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(22)); // Icon of 
embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(23)); // Icon of 
embedded office doc
+
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.docx", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // We know their rough filenames
+        assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", 
handler.filenames.get(0));
+        assertEquals("image6.emf", handler.filenames.get(1));
+        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpeg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertEquals("image4.emf", handler.filenames.get(6));
+        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", 
handler.filenames.get(7));
+        assertEquals("image5.emf", handler.filenames.get(8));
+        // But we do know their types
+        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1));  // Icon of 
embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2));  // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  // Embedded image
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of 
embedded office doc
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // Embedded office 
doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of 
embedded office doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.docx", extractor, true);
+        assertEquals(14 + 1 /* thumbnail */, handler.filenames.size());
+        assertEquals(14 + 1 /* thumbnail */, handler.mediaTypes.size());
+
+        // But we do know their types
+        assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside 
.pptx
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   GIF inside 
.pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside 
.pptx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  // Embedded 
thumbnail
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5));  // Icon of 
embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6));  // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //   PNG inside 
.doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8));  // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(9));  // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10));  // Embedded image
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(11));  // Icon of 
embedded office doc
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(12)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(13));  //   PNG inside 
.xlsx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of 
embedded office doc
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.pptx", extractor, false);
+        assertEquals(9 + 1 /* thumbnail */, handler.filenames.size());
+        assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size());
+
+        // We don't know their exact filenames
+        assertEquals("image4.png", handler.filenames.get(0));
+        assertEquals("image5.gif", handler.filenames.get(1));
+        assertEquals("image6.png", handler.filenames.get(2));
+        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", 
handler.filenames.get(3));
+        assertEquals("Microsoft_Office_Word_Document2.docx", 
handler.filenames.get(4));
+        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", 
handler.filenames.get(5));
+        assertEquals("image1.emf", handler.filenames.get(6));
+        assertEquals("image2.emf", handler.filenames.get(7));
+        assertEquals("image3.emf", handler.filenames.get(8));
+        // But we do know their types
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));  // Embedded image
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(1));  // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));  // Embedded image
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office 
doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(5));  // Embedded office 
doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of 
embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7));  // Icon of 
embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of 
embedded office doc
     }
 
     @Test


Reply via email to