tika-advanced-parser-m...

bob Sat, 16 Jan 2016 10:24:11 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static 
org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.getTestFile;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the Old Excel (2-4) parser
+ */
+public class OldExcelParserTest extends TikaTest {
+    /** Name of the sample workbook that every test in this class reads. */
+    private static final String file = "testEXCEL_4.xls";
+
+    /**
+     * Checks that the default detector reports the sample workbook as
+     * <code>application/vnd.ms-excel.sheet.4</code>.
+     */
+    @Test
+    public void testDetection() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(file)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-excel.sheet.4"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    /**
+     * Checks the metadata produced by the parser: a content type is
+     * expected, a title and a subject are not.
+     */
+    // Disabled, until we can get the POI code to tell us the version
+    @Test
+    @Ignore
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the stream even if parsing throws, as the other tests do
+        try (TikaInputStream stream = getTestFile(file)) {
+            OldExcelParser parser = new OldExcelParser();
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // We can get the content type
+        assertEquals(
+                "application/vnd.ms-excel.sheet.4",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // But no other metadata
+        assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(null, metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check we can get the plain text properly
+     */
+    @Test
+    public void testPlainText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (TikaInputStream stream = getTestFile(file)) {
+            new OldExcelParser().parse(
+                    stream, handler, metadata, new ParseContext());
+        }
+
+        String text = handler.toString();
+
+        // Check we find a few words we expect in there
+        assertContains("Size", text);
+        assertContains("Returns", text);
+
+        // Check we find a few numbers we expect in there
+        assertContains("11", text);
+        assertContains("784", text);
+    }
+
+    /**
+     * Check the HTML version comes through correctly
+     */
+    @Test
+    public void testHTML() throws Exception {
+        XMLResult result = getXML(file);
+        String xml = result.xml;
+
+        // Sheet name not found - only 5+ have sheet names
+        assertNotContained("<p>Sheet 1</p>", xml);
+
+        // String cells
+        assertContains("<p>Table 10 -", xml);
+        assertContains("<p>Tax</p>", xml);
+        assertContains("<p>N/A</p>", xml);
+
+        // Number cells
+        assertContains("<p>(1)</p>", xml);
+        assertContains("<p>5.0</p>", xml);
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+    /**
+     * Parses a classpath resource with an {@link AutoDetectParser} and
+     * returns the resulting SAX events serialised as indented XML.
+     *
+     * @param resource classpath location of the message to parse
+     * @param metadata receives the metadata extracted during the parse
+     * @return the XML text written by the transformer handler
+     * @throws Exception if the handler cannot be created or parsing fails
+     */
+    private static String outlookToXml(String resource, Metadata metadata)
+            throws Exception {
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        Parser parser = new AutoDetectParser();
+        try (InputStream stream =
+                     OutlookParserTest.class.getResourceAsStream(resource)) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+        return sw.toString();
+    }
+
+    /**
+     * Checks the content type, the main metadata fields and the body
+     * text extracted from <code>test-outlook.msg</code>.
+     */
+    @Test
+    public void testOutlookParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Microsoft Outlook Express 6",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "Nouvel utilisateur de Outlook Express",
+                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(Metadata.AUTHOR));
+
+        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+        assertEquals(
+                "2007-04-05T16:26:06Z",
+                metadata.get(TikaCoreProperties.CREATED));
+
+        String content = handler.toString();
+        assertContains("Microsoft Outlook Express 6", content);
+        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+        assertContains("Nouvel utilisateur de Outlook Express", content);
+        assertContains("Messagerie et groupes de discussion", content);
+    }
+
+    /**
+     * Test case for TIKA-197
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+     */
+    @Test
+    public void testMultipleCopies() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // "From" must occur exactly once in the extracted text
+        String content = handler.toString();
+        Pattern pattern = Pattern.compile("From");
+        Matcher matcher = pattern.matcher(content);
+        assertTrue(matcher.find());
+        assertFalse(matcher.find());
+    }
+
+    /**
+     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+     */
+    @Test
+    public void testOutlookNew() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Welcome to Microsoft Office Outlook 2003",
+                metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("Outlook 2003", content);
+        assertContains("Streamlined Mail Experience", content);
+        assertContains("Navigation Pane", content);
+    }
+
+    /**
+     * Checks the XML output for <code>testMSG_chinese.msg</code>: expected
+     * markup, a single body element and the Chinese text.
+     */
+    @Test
+    public void testOutlookHTMLVersion() throws Exception {
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        String content = outlookToXml(
+                "/test-documents/testMSG_chinese.msg", metadata);
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        // NOTE(review): the address in the next literal looks like it was
+        //  masked by the mail archive - confirm against the repository copy
+        assertContains("<dd>[email protected]</dd>", content);
+        assertContains("<p>Alfresco MSG format testing", content);
+        assertContains("<li>1", content);
+        assertContains("<li>2", content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+
+        // Make sure that the Chinese actually came through
+        assertContains("\u5F35\u6BD3\u502B",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("\u9673\u60E0\u73CD", content);
+    }
+
+    /**
+     * Checks that <code>testMSG_forwarded.msg</code> yields a single
+     * body element in the XML output.
+     */
+    @Test
+    public void testOutlookForwarded() throws Exception {
+        // Check the HTML version
+        String content = outlookToXml(
+                "/test-documents/testMSG_forwarded.msg", new Metadata());
+
+        // Make sure we don't have nested docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+
+    /**
+     * Checks the XML output for <code>test-outlook2003.msg</code>:
+     * expected text, links and a single body element.
+     */
+    @Test
+    public void testOutlookHTMLfromRTF() throws Exception {
+        // Check the HTML version; whitespace after <p> is dropped so the
+        //  assertions below do not depend on the indentation of the output
+        String content = outlookToXml(
+                "/test-documents/test-outlook2003.msg", new Metadata())
+                .replaceAll("<p>\\s+", "<p>");
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        assertContains("<dd>New Outlook User</dd>", content);
+        assertContains("designed <i>to help you", content);
+        assertContains("<p><a href=\"http://r.office.microsoft.com/r/"
+                + "rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>",
+                content);
+
+        // Link - check text around it, and the link itself
+        assertContains("sign up for a free subscription", content);
+        assertContains("Office Newsletter", content);
+        assertContains("newsletter will be sent to you", content);
+        assertContains(
+                "http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033",
+                content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+/**
+ * Tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public class POIContainerExtractionTest extends 
AbstractPOIContainerExtractionTest {
+
+    /**
+     * For office files which don't have anything embedded in them
+     */
+    @Test
+    public void testWithoutEmbedded() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        String[] files = new String[]{
+                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+                "testVISIO.vsd", "test-outlook.msg"
+        };
+        for (String file : files) {
+            // Process it without recursing
+            TrackingHandler handler = process(file, extractor, false);
+
+            // Won't have fired
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+
+            // Ditto with recursing
+            handler = process(file, extractor, true);
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+        }
+    }
+
+    /**
+     * Office files with embedded images, but no other
+     * office files in them
+     */
+    @Test
+    public void testEmbeddedImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xls", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.doc", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.doc", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.jpg", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+    }
+
+    /**
+     * Office files which have other office files
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p/>
+     * eg xls
+     * -> word
+     * -> image
+     * -> image
+     * -> powerpoint
+     * -> excel
+     * -> image
+     */
+    @Test
+    public void testEmbeddedOfficeFiles() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have 
images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xls", extractor, false);
+        assertEquals(5, handler.filenames.size());
+        assertEquals(5, handler.mediaTypes.size());
+
+        // We don't know their filenames
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office 
doc
+
+
+        // With recursion, should get the images embedded in the office files 
too
+        handler = process("testEXCEL_embeded.xls", extractor, true);
+        assertEquals(17, handler.filenames.size());
+        assertEquals(17, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("1", handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals("2", handler.filenames.get(6));
+        assertEquals("image1.png", handler.filenames.get(7));
+        assertEquals("image2.jpg", handler.filenames.get(8));
+        assertEquals("image3.png", handler.filenames.get(9));
+        assertEquals("image1.png", handler.filenames.get(16));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded 
presentation
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.doc", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // Filenames are a bit iffy...
+        // Should really be 3*embedded pictures then 3*icons then embedded docs
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("_1345471035.ppt", handler.filenames.get(7));
+        assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc?
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office 
doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office 
doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.doc", extractor, true);
+        assertEquals(16, handler.filenames.size());
+        assertEquals(16, handler.mediaTypes.size());
+
+        // We don't know their filenames, except for doc images + docx
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("image2.png", handler.filenames.get(7));
+        assertEquals("image3.jpeg", handler.filenames.get(8));
+        assertEquals("image4.png", handler.filenames.get(9));
+        for (int i = 11; i < 14; i++) {
+            assertNull(handler.filenames.get(i));
+        }
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - 
logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - 
safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - 
try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded 
office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside 
.docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside 
.docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside 
.docx
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office 
doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside 
.xls
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.ppt", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We don't get all that helpful filenames
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals("2", handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals(null, handler.filenames.get(3));
+        assertEquals(null, handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office 
doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+        // Run again on PowerPoint but with recursion
+        handler = process("testPPT_embeded.ppt", extractor, true);
+        assertEquals(11, handler.filenames.size());
+        assertEquals(11, handler.mediaTypes.size());
+
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals("2", handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals(null, handler.filenames.get(7));
+        assertEquals(null, handler.filenames.get(8));
+        assertEquals(null, handler.filenames.get(9));
+        assertEquals(null, handler.filenames.get(10));
+
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside 
.xls
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office 
doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside 
.docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside 
.docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside 
.docx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded 
office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded 
office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+        // Word, with a non-office file (PDF)
+        handler = process("testWORD_embedded_pdf.doc", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded 
pdf
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF 
itself
+
+
+        // Outlook with a text file and a word document
+        handler = process("testMSG_att_doc.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("test-unicode.doc", handler.filenames.get(0));
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+        assertEquals("pj1.txt", handler.filenames.get(1));
+        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+        // Outlook with a pdf and another outlook message
+        handler = process("testMSG_att_msg.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+    }
+
+    @Test
+    public void testEmbeddedOfficeFilesXML() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("EmbeddedDocument.docx", extractor, false);
+        
assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
+        assertEquals(2, handler.filenames.size());
+    }
+
+    @Test
+    public void testPowerpointImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("pictures.ppt", extractor, false);
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", 
"jpeg")));
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests text and metadata extraction from legacy PowerPoint (.ppt) test
+ * documents, either by driving {@link OfficeParser} directly or through
+ * the inherited {@code getXML} helper.
+ */
+public class PowerPointParserTest extends TikaTest {
+
+    /** Placeholder text of an unedited master slide; must not be extracted. */
+    private static final String MASTER_BOILERPLATE = "Click to edit Master";
+
+    /**
+     * Parses a classpath resource with a fresh {@link OfficeParser} and a
+     * default {@link BodyContentHandler}, closing the stream afterwards.
+     *
+     * @param resource absolute classpath location of the test document
+     * @param metadata receives the metadata reported by the parser
+     * @return the text collected by the body content handler
+     * @throws Exception if the document cannot be read or parsed
+     */
+    private static String parseBody(String resource, Metadata metadata)
+            throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        try (InputStream stream =
+                     PowerPointParserTest.class.getResourceAsStream(resource)) {
+            new OfficeParser().parse(
+                    stream, handler, metadata, new ParseContext());
+        }
+        return handler.toString();
+    }
+
+    /**
+     * Asserts that neither the master-slide placeholder text nor an
+     * asterisk (TIKA-1171) occurs in the extracted content.
+     *
+     * @param content extracted body text to check
+     */
+    private static void assertNoMasterArtifacts(String content) {
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf(MASTER_BOILERPLATE));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    /** Checks content type, title, author and body text of testPPT.ppt. */
+    @Test
+    public void testPowerPointParser() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseBody("/test-documents/testPPT.ppt", metadata);
+
+        assertEquals(
+                "application/vnd.ms-powerpoint",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Powerpoint Slide",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertContains("Sample Powerpoint Slide", content);
+        assertContains("Powerpoint X for Mac", content);
+    }
+
+    /**
+     * Checks the XHTML produced for testPPT_various.ppt: footnotes,
+     * header/footer, text boxes, run formatting, tables, lists, keyword
+     * and subject metadata, and non-Latin text.
+     */
+    @Test
+    public void testVarious() throws Exception {
+        Metadata metadata = new Metadata();
+        String xml = getXML("testPPT_various.ppt", metadata).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        assertContains("<p>Bold ", xml);
+        assertContains("italic underline superscript subscript", xml);
+        assertContains("underline", xml);
+        assertContains("superscript", xml);
+        assertContains("subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p>This is a hyperlink", xml);
+        assertContains("<p>Here is a list:", xml);
+        // Only the bullet text is asserted, not the bullet character.
+        for (int row = 1; row <= 3; row++) {
+            assertContains("<p>Bullet " + row, xml);
+        }
+        assertContains("Here is a numbered list:", xml);
+        // TODO: OOXMLExtractor fails to number the bullets:
+        // only the item text is asserted, not the "N)" prefix.
+        for (int row = 1; row <= 3; row++) {
+            assertContains("<p>Number bullet " + row, xml);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, xml);
+            }
+        }
+        assertContains("Keyword1 Keyword2", xml);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", xml);
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", xml);
+        // Full-width form of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
+        // Twelve further characters of Japanese text
+        assertContains(
+                "\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                xml);
+
+        assertContains("And then some Gothic text:", xml);
+        // Six supplementary-plane characters, written as surrogate pairs
+        assertContains(
+                "\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+                xml);
+    }
+
+    /** The master footer text is extracted; master boilerplate is not. */
+    @Test
+    public void testMasterFooter() throws Exception {
+        String content = parseBody(
+                "/test-documents/testPPT_masterFooter.ppt", new Metadata());
+
+        assertContains("Master footer is here", content);
+        assertNoMasterArtifacts(content);
+    }
+
+    /**
+     * TIKA-712 Master Slide Text from PPT and PPTX files
+     *  should be extracted too
+     */
+    @Test
+    public void testMasterText() throws Exception {
+        String content = parseBody(
+                "/test-documents/testPPT_masterText.ppt", new Metadata());
+
+        assertContains("Text that I added to the master slide", content);
+        assertNoMasterArtifacts(content);
+    }
+
+    /** Same checks as {@link #testMasterText()} on a second sample file. */
+    @Test
+    public void testMasterText2() throws Exception {
+        String content = parseBody(
+                "/test-documents/testPPT_masterText2.ppt", new Metadata());
+
+        assertContains("Text that I added to the master slide", content);
+        assertNoMasterArtifacts(content);
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+
+        try (InputStream input =
+                     PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_custom_props.ppt")) {
+            // -1 is passed as the handler's constructor argument
+            // (presumably "no write limit" -- confirm).
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        }
+
+        assertEquals("application/vnd.ms-powerpoint",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("JOUVIN ETIENNE",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-08-22T13:32:58Z",
+                metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
+        assertEquals("2011-08-22T13:30:53Z",
+                metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-08-22T13:30:53Z",
+                metadata.get(Metadata.CREATION_DATE));
+        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Office.WORD_COUNT));
+        assertEquals("Test extraction properties pptx",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z",
+                metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z",
+                metadata.get("custom:myCustomSecondDate"));
+    }
+
+    // TIKA-1025
+    /** Placeholder divs with ids 1 and 14 appear in the XHTML output. */
+    @Test
+    public void testEmbeddedPlacedholder() throws Exception {
+        XMLResult result = getXML("testPPT_embedded2.ppt");
+        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
+    }
+
+    // TIKA-817
+    @Test
+    public void testAutoDatePPT() throws Exception {
+        //decision was made in POI-52367 not to generate
+        //autodate automatically.  For pptx, where value is stored,
+        //value is extracted.  For ppt, however, no date is extracted.
+        XMLResult result = getXML("testPPT_autodate.ppt");
+        assertContains(
+                "<div class=\"slide-content\"><p>Now</p>",
+                result.xml);
+    }
+
+    /** The comment author's name and initials appear in the slide comment. */
+    @Test
+    public void testCommentAuthorship() throws Exception {
+        XMLResult r = getXML("testPPT_comment.ppt");
+        assertContains(
+                "<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)",
+                r.xml);
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for Microsoft Project (MPP) files.
+ *
+ * Note - there is currently no dedicated Project parser;
+ *  only the common office metadata is available.
+ */
+public class ProjectParserTest {
+
+    /** Runs the shared checks against the Project 2003 sample. */
+    @Test
+    public void testProject2003() throws Exception {
+        try (InputStream mpp = ProjectParserTest.class.getResourceAsStream(
+                "/test-documents/testPROJECT2003.mpp")) {
+            doTestProject(mpp);
+        }
+    }
+
+    /** Runs the shared checks against the Project 2007 sample. */
+    @Test
+    public void testProject2007() throws Exception {
+        try (InputStream mpp = ProjectParserTest.class.getResourceAsStream(
+                "/test-documents/testPROJECT2007.mpp")) {
+            doTestProject(mpp);
+        }
+    }
+
+    /**
+     * Parses the given stream with {@link OfficeParser} and verifies the
+     * content type, the standard and custom metadata, and that no body
+     * text is produced.
+     *
+     * @param input open stream of an MPP test document; not closed here
+     */
+    private void doTestProject(InputStream input) throws Exception {
+        ContentHandler body = new BodyContentHandler();
+        Metadata meta = new Metadata();
+        new OfficeParser().parse(input, body, meta, new ParseContext());
+
+        // Values that are expected under more than one metadata key
+        final String subject = "Gym class featuring a brown fox and lazy dog";
+        final String created = "2011-11-24T10:58:00Z";
+        final String modified = "2011-11-24T11:31:00Z";
+
+        assertEquals(
+                "application/vnd.ms-project",
+                meta.get(Metadata.CONTENT_TYPE));
+
+        assertEquals("The quick brown fox jumps over the lazy dog",
+                meta.get(TikaCoreProperties.TITLE));
+        assertEquals(subject, meta.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals(subject, meta.get(Metadata.SUBJECT));
+        assertEquals("Nevin Nollop", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("", meta.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Pangram, fox, dog",
+                meta.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Comment Vulpes vulpes comment",
+                meta.get(TikaCoreProperties.COMMENTS));
+
+        assertEquals("Category1", meta.get(OfficeOpenXMLCore.CATEGORY));
+        assertEquals("Mr Burns", meta.get(OfficeOpenXMLExtended.MANAGER));
+        assertEquals("CompanyA", meta.get(OfficeOpenXMLExtended.COMPANY));
+
+        assertEquals(created, meta.get(TikaCoreProperties.CREATED));
+        assertEquals(created, meta.get(Metadata.CREATION_DATE));
+        assertEquals(modified, meta.get(TikaCoreProperties.MODIFIED));
+        assertEquals(modified, meta.get(Metadata.DATE));
+
+        // Custom Project metadata is present with prefix
+        assertEquals("0%", meta.get("custom:% Complete"));
+        assertEquals("0%", meta.get("custom:% Work Complete"));
+        assertEquals("\u00a3" + "0.00", meta.get("custom:Cost"));
+        assertEquals("2d?", meta.get("custom:Duration"));
+        assertEquals("16h", meta.get("custom:Work"));
+
+        // Currently, we don't do textual contents of the file
+        assertEquals("", body.toString());
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test for metadata and text extraction from a Microsoft Publisher
+ * document via {@link OfficeParser}.
+ */
+public class PublisherParserTest {
+
+    /** Checks content type, title, author and body text of testPUBLISHER.pub. */
+    @Test
+    public void testPublisherParser() throws Exception {
+        try (InputStream pub = PublisherParserTest.class.getResourceAsStream(
+                "/test-documents/testPUBLISHER.pub")) {
+            ContentHandler body = new BodyContentHandler();
+            Metadata meta = new Metadata();
+            new OfficeParser().parse(pub, body, meta, new ParseContext());
+
+            assertEquals(
+                    "application/x-mspublisher",
+                    meta.get(Metadata.CONTENT_TYPE));
+            // No title is expected for this document
+            assertEquals(null, meta.get(TikaCoreProperties.TITLE));
+            assertEquals("Nick Burch",
+                    meta.get(TikaCoreProperties.CREATOR));
+            assertEquals("Nick Burch", meta.get(Metadata.AUTHOR));
+
+            String text = body.toString();
+            assertContains("0123456789", text);
+            assertContains("abcdef", text);
+        }
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the TNEF (winmail.dat) parser: type detection, metadata
+ * extraction, and extraction of the message body and attachments.
+ */
+public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
+    private static final String file = "testWINMAIL.dat";
+
+    /** The default detector identifies the sample as application/vnd.ms-tnef. */
+    @Test
+    public void testBasics() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(file)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-tnef"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    /** The message subject is reported as both title and subject metadata. */
+    @Test
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the stream even when parsing throws, as testBasics does;
+        // previously it was opened and never closed.
+        try (TikaInputStream stream = getTestFile(file)) {
+            TNEFParser tnef = new TNEFParser();
+            tnef.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("This is a test message",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check the Rtf and Attachments are returned
+     * as expected
+     */
+    @Test
+    public void testBodyAndAttachments() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        // Process it with recursing
+        // Will have the message body RTF and the attachments
+        TrackingHandler handler = process(file, extractor, true);
+        assertEquals(6, handler.filenames.size());
+        assertEquals(6, handler.mediaTypes.size());
+
+        // We know the filenames for all of them
+        assertEquals("message.rtf", handler.filenames.get(0));
+        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+
+        assertEquals("quick.doc", handler.filenames.get(1));
+        assertEquals(MediaType.application("msword"),
+                handler.mediaTypes.get(1));
+
+        assertEquals("quick.html", handler.filenames.get(2));
+        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+
+        assertEquals("quick.pdf", handler.filenames.get(3));
+        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+
+        assertEquals("quick.txt", handler.filenames.get(4));
+        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+
+        assertEquals("quick.xml", handler.filenames.get(5));
+        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test for metadata and text extraction from a Microsoft Visio
+ * document via {@link OfficeParser}.
+ */
+public class VisioParserTest {
+
+    /** Checks content type, title, creator and body text of testVISIO.vsd. */
+    @Test
+    public void testVisioParser() throws Exception {
+        try (InputStream vsd = VisioParserTest.class.getResourceAsStream(
+                "/test-documents/testVISIO.vsd")) {
+            ContentHandler body = new BodyContentHandler();
+            Metadata meta = new Metadata();
+            new OfficeParser().parse(vsd, body, meta, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.visio",
+                    meta.get(Metadata.CONTENT_TYPE));
+            // The title is expected to be present but empty
+            assertEquals("", meta.get(TikaCoreProperties.TITLE));
+            assertEquals("Hogwarts", meta.get(TikaCoreProperties.CREATOR));
+
+            String text = body.toString();
+            assertContains("Some random text, on a page", text);
+        }
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,496 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class WordParserTest extends TikaTest {
+
+    /**
+     * Parses {@code testWORD.doc} with {@link OfficeParser} and verifies the
+     * content type, title and author metadata plus the extracted body text.
+     */
+    @Test
+    public void testWordParser() throws Exception {
+        final Metadata meta = new Metadata();
+        final ContentHandler body = new BodyContentHandler();
+
+        try (InputStream doc = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD.doc")) {
+            new OfficeParser().parse(doc, body, meta, new ParseContext());
+        }
+
+        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", meta.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));
+        assertContains("Sample Word Document", body.toString());
+    }
+
+    /**
+     * Parses {@code Doc1_ole.doc} and checks that the string
+     * {@code MSj00974840000[1].wav} appears in the extracted text.
+     */
+    @Test
+    public void testWordWithWAV() throws Exception {
+        final ContentHandler body = new BodyContentHandler();
+
+        try (InputStream doc = WordParserTest.class.getResourceAsStream(
+                "/test-documents/Doc1_ole.doc")) {
+            new OfficeParser().parse(doc, body, new Metadata(), new ParseContext());
+        }
+
+        assertContains("MSj00974840000[1].wav", body.toString());
+    }
+
+    /**
+     * Test that the word converter is able to generate the
+     *  correct HTML for the document.
+     * <p>
+     * Checks four sample files in turn: {@code testWORD.doc} (metadata,
+     * headings, bold/italic, table, link and styled-paragraph markup),
+     * {@code testWORD_3imgs.doc} (embedded image references) and the two
+     * TIKA-692 documents with several character runs inside bold text.
+     */
+    @Test
+    public void testWordHTML() throws Exception {
+
+        // Try with a document containing various tables and
+        // formattings
+        XMLResult result = getXML("testWORD.doc");
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertContains("Sample Word Document", xml);
+
+        // Check that custom headings came through
+        assertContains("<h1 class=\"title\">", xml);
+        // Regular headings
+        assertContains("<h1>Heading Level 1</h1>", xml);
+        assertContains("<h3>Heading Level 3</h3>", xml);
+        // Bold and italic
+        assertContains("<b>BOLD</b>", xml);
+        assertContains("<i>ITALIC</i>", xml);
+        // Table
+        assertContains("<table>", xml);
+        assertContains("<td>", xml);
+        // TODO - Check for the nested table
+        // Links
+        assertContains("<a href=\"http://tika.apache.org/\">Tika</a>", xml);
+        // Paragraphs with other styles
+        assertContains("<p class=\"signature\">This one", xml);
+
+        // Try with a document that contains images
+        xml = getXML("testWORD_3imgs.doc").xml;
+
+        // Images 1-3
+        assertTrue("Image not found in:\n" + xml,
+                xml.contains("src=\"embedded:image1.png\""));
+        assertTrue("Image not found in:\n" + xml,
+                xml.contains("src=\"embedded:image2.jpg\""));
+        assertTrue("Image not found in:\n" + xml,
+                xml.contains("src=\"embedded:image3.png\""));
+
+        // Text too
+        assertContains("<p>The end!", xml);
+
+        // TIKA-692: test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs.doc").xml;
+
+        // Make sure bold text arrived as single
+        // contiguous string even though Word parser
+        // handled this as 3 character runs
+        assertTrue("Bold text wasn't contiguous: " + xml,
+                xml.contains("F<b>oob</b>a<b>r</b>"));
+
+        // TIKA-692: second sample document with multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs2.doc").xml;
+
+        // Same expectation as for the first TIKA-692 document
+        assertTrue("Bold text wasn't contiguous: " + xml,
+                xml.contains("F<b>oob</b>a<b>r</b>"));
+    }
+
+    @Test
+    public void testEmbeddedNames() throws Exception {
+        String result = getXML("testWORD_embedded_pdf.doc").xml;
+
+        // Make sure the embedded div comes out after "Here
+        // is the pdf file" and before "Bye Bye":
+        int i = result.indexOf("Here is the pdf file:");
+        assertTrue(i != -1);
+        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" 
/>");
+        assertTrue(j != -1);
+        int k = result.indexOf("Bye Bye");
+        assertTrue(k != -1);
+
+        assertTrue(i < j);
+        assertTrue(j < k);
+    }
+
+    // TIKA-982
+    @Test
+    public void testEmbeddedRTF() throws Exception {
+        String result = getXML("testWORD_embedded_rtf.doc").xml;
+        assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" 
/>"));
+        assertTrue(result.contains("_1404039792.rtf"));
+    }
+
+    // TIKA-1019
+    @Test
+    public void testDocumentLink() throws Exception {
+        String result = getXML("testDocumentLink.doc").xml;
+        assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" 
/>"));
+        assertTrue(result.contains("_1327495610.unknown"));
+    }
+
+    /**
+     * Parses {@code testWORD6.doc} and verifies content type, title,
+     * subject and author metadata as well as the extracted body text.
+     */
+    @Test
+    public void testWord6Parser() throws Exception {
+        final Metadata meta = new Metadata();
+        final ContentHandler body = new BodyContentHandler();
+
+        try (InputStream doc = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD6.doc")) {
+            new OfficeParser().parse(doc, body, meta, new ParseContext());
+        }
+
+        final String pangram = "The quick brown fox jumps over the lazy dog";
+        final String subject = "Gym class featuring a brown fox and lazy dog";
+        final String author = "Nevin Nollop";
+
+        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals(pangram, meta.get(TikaCoreProperties.TITLE));
+        assertEquals(subject, meta.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals(subject, meta.get(Metadata.SUBJECT));
+        assertEquals(author, meta.get(TikaCoreProperties.CREATOR));
+        assertEquals(author, meta.get(Metadata.AUTHOR));
+        assertContains(pangram, body.toString());
+    }
+
+    /**
+     * Parses {@code testWORD_various.doc} and checks that text from many
+     * document features (footnote, header and footer, text box, tables,
+     * lists, hyperlink, Japanese and Gothic text) reaches the extracted
+     * content, and that keywords and subject are reported both in the text
+     * and as metadata.
+     */
+    @Test
+    public void testVarious() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_various.doc")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        // Whitespace-collapsed copy, used where the expected text spans
+        // several table cells
+        String collapsed = content.replaceAll("\\s+", " ");
+
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+        assertContains(
+                "Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3",
+                collapsed);
+        assertContains(
+                "Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2",
+                collapsed);
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: WordExtractor fails to number the bullets:
+            assertContains("Number bullet " + row, content);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        // TODO: Move to OO subject in Tika 2.0
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // A further run of 12 characters
+        assertContains(
+                "\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                content);
+
+        assertContains("And then some Gothic text:", content);
+        assertContains(
+                "\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+                content);
+    }
+
+    /**
+     * TIKA-1044 - documents where parts of the text carry no
+     *  formatting or styles must still be handled; here the text of
+     *  {@code testWORD_no_format.doc} has to come through.
+     */
+    @Test
+    public void testNoFormat() throws Exception {
+        final ContentHandler body = new BodyContentHandler();
+
+        try (InputStream doc = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_no_format.doc")) {
+            new OfficeParser().parse(doc, body, new Metadata(), new ParseContext());
+        }
+
+        assertContains("Will generate an exception", body.toString());
+    }
+
+    /**
+     * Verifies that custom OLE2 (HPSF) properties are extracted, together
+     * with the standard document metadata of
+     * {@code testWORD_custom_props.doc}. The parse runs with
+     * {@link Locale#US} set on the {@link ParseContext}.
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        final Metadata meta = new Metadata();
+        final ParseContext usContext = new ParseContext();
+        usContext.set(Locale.class, Locale.US);
+
+        try (InputStream doc = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD_custom_props.doc")) {
+            new OfficeParser().parse(doc, new BodyContentHandler(-1), meta, usContext);
+        }
+
+        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
+
+        // People
+        assertEquals("EJ04325S", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Etienne Jouvin", meta.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Etienne Jouvin", meta.get(Metadata.LAST_AUTHOR));
+
+        // Dates
+        assertEquals("2012-01-03T22:14:00Z", meta.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2012-01-03T22:14:00Z", meta.get(Metadata.DATE));
+        assertEquals("2010-10-05T09:03:00Z", meta.get(TikaCoreProperties.CREATED));
+        assertEquals("2010-10-05T09:03:00Z", meta.get(Metadata.CREATION_DATE));
+
+        // Application and statistics
+        assertEquals("Microsoft Office Word", meta.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("1", meta.get(Office.PAGE_COUNT));
+        assertEquals("2", meta.get(Office.WORD_COUNT));
+
+        // Descriptive properties
+        assertEquals("My Title", meta.get(TikaCoreProperties.TITLE));
+        assertEquals("My Keyword", meta.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", meta.get(OfficeOpenXMLExtended.TEMPLATE));
+        assertEquals("My Comments", meta.get(TikaCoreProperties.COMMENTS));
+        // TODO: Move to OO subject in Tika 2.0
+        assertEquals("My subject", meta.get(Metadata.SUBJECT));
+        assertEquals("My subject", meta.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("EDF-DIT", meta.get(OfficeOpenXMLExtended.COMPANY));
+
+        // Custom properties
+        assertEquals("MyStringValue", meta.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T23:00:00Z", meta.get("custom:MyCustomDate"));
+    }
+
+    @Test
+    public void testExceptions1() throws Exception {
+        XMLResult xml;
+        Level logLevelStart = Logger.getRootLogger().getLevel();
+        Logger.getRootLogger().setLevel(Level.ERROR);
+        try {
+            xml = getXML("testException1.doc");
+            assertContains("total population", xml.xml);
+            xml = getXML("testException2.doc");
+            assertContains("electric charge", xml.xml);
+        } finally {
+            Logger.getRootLogger().setLevel(logLevelStart);
+        }
+    }
+
+    /**
+     * Checks that, after collapsing whitespace runs to single spaces, the
+     * XML for {@code testWORD_tabular_symbol.doc} contains "one two".
+     */
+    @Test
+    public void testTabularSymbol() throws Exception {
+        final String xml = getXML("testWORD_tabular_symbol.doc").xml;
+        final String collapsed = xml.replaceAll("\\s+", " ");
+        assertContains("one two", collapsed);
+    }
+
+    /**
+     * TIKA-1229 Hyperlinks in Headers should be output as such,
+     *  not plain text with control characters.
+     * <p>
+     * Uses {@code testWORD_header_hyperlink.doc} and expects both a web
+     * link and a mailto link as {@code <a href>} elements, with no literal
+     * {@code HYPERLINK} field text left in the output.
+     */
+    @Test
+    public void testHeaderHyperlinks() throws Exception {
+        XMLResult result = getXML("testWORD_header_hyperlink.doc");
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+
+        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("example.com", xml);
+
+        // Check we don't have the special text HYPERLINK
+        assertFalse(xml.contains("HYPERLINK"));
+
+        // Check we do have the link
+        assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
+
+        // Check we do have the email
+        assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
+    }
+
+    /**
+     * Checks the markup around the "1. Introduzione" text in the
+     * whitespace-collapsed XML for {@code testControlCharacters.doc}.
+     */
+    @Test
+    public void testControlCharacter() throws Exception {
+        final String xml = getXML("testControlCharacters.doc").xml;
+        final String collapsed = xml.replaceAll("\\s+", " ");
+        assertContains("1. Introduzione<b> </a></b> </p>", collapsed);
+    }
+
+    /**
+     * Checks the content type of {@code test_TIKA-1251.doc} and that the
+     * "1. Organisering av vakten:" text is emitted as its own paragraph.
+     */
+    @Test
+    public void testParagraphsAfterTables() throws Exception {
+        final XMLResult parsed = getXML("test_TIKA-1251.doc");
+
+        assertEquals("application/msword", parsed.metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("<p>1. Organisering av vakten:</p>", parsed.xml);
+    }
+
+    /**
+     * TIKA-1512, one cause: the closing double quote of the hyperlink is a
+     * smart quote. Checks that the link target still reaches the XML for
+     * {@code testWORD_closingSmartQInHyperLink.doc}.
+     */
+    @Test
+    public void testHyperlinkStringIOOBESmartQuote() throws Exception {
+        //test file contributed by user
+        XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
+        assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
+    }
+
+    /**
+     * TIKA-1512, one cause: no closing quote on a really long hyperlink
+     * string. Currently ignored because the test document may not be
+     * includable.
+     */
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongNoCloseQuote() throws Exception {
+        //test file derived from govdocs1 012152.doc
+        XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
+        assertContains("href=\"http://www.lexis.com", result.xml);
+    }
+
+    /**
+     * TIKA-1512, one cause: hyperlink string with no closing quote but a
+     * carriage return. Currently ignored because the test document may not
+     * be includable.
+     */
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongCarriageReturn() throws Exception {
+        //test file derived from govdocs1 040044.doc
+        XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
+        assertContains("href=\"http://www.nib.org", result.xml);
+    }
+
+    @Test
+    public void testDOCParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_numbered_list.doc").xml;
+        assertContains("1) This", xml);
+        assertContains("a) Is", xml);
+        assertContains("i) A multi", xml);
+        assertContains("ii) Level", xml);
+        assertContains("1. Within cell 1", xml);
+        assertContains("b. Cell b", xml);
+        assertContains("iii) List", xml);
+        assertContains("2) foo", xml);
+        assertContains("ii) baz", xml);
+        assertContains("ii) foo", xml);
+        assertContains("II. bar", xml);
+        assertContains("6. six", xml);
+        assertContains("7. seven", xml);
+        assertContains("a. seven a", xml);
+        assertContains("e. seven e", xml);
+        assertContains("2. A ii 2", xml);
+        assertContains("3. page break list 3", xml);
+        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - 
alpha", xml);
+        assertContains("1.1.1. 1.1.1", xml);
+        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
+
+        assertContains("add a list here", xml);
+        //TODO: not currently pulling numbers out of comments
+        assertContains(">comment list 1", xml);
+
+    }
+
+    /**
+     * Checks the numbering labels produced for
+     * {@code testWORD_override_list_numbering.doc}. Each expected snippet
+     * is looked up in the XML output, in the order listed.
+     */
+    @Test
+    public void testDOCOverrideParagraphNumbering() throws Exception {
+        final String xml = getXML("testWORD_override_list_numbering.doc").xml;
+
+        final String[] expectedSnippets = {
+                //Test 1
+                "1.1.1.1...1 1.1.1.1...1",
+                "1st.2.3someText 1st.2.3someText",
+                "1st.2.2someOtherText.1 1st.2.2someOtherText.1",
+                "5th 5th",
+
+                //Test 2
+                "1.a.I 1.a.I",
+                //test no reset because level 2 is not sufficient to reset
+                "1.b.III 1.b.III",
+                //test restarted because of level 0's increment to 2
+                "2.a.I 2.a.I",
+                //test handling of skipped level
+                "2.b 2.b",
+
+                //Test 3
+                "(1)) (1))",
+                //tests start level 1 at 17 and
+                "2.17 2.17",
+                //tests that isLegal turns everything into decimal
+                "2.18.2.1 2.18.2.1",
+                ">2 2",
+
+                //Test4
+                ">1 1",
+                ">A A",
+                ">B B",
+                ">C C",
+                ">4 4",
+
+                //Test5
+                ">00 00",
+                ">01 01",
+                ">01. 01.",
+                ">01..1 01..1",
+                ">02 02",
+        };
+
+        for (String snippet : expectedSnippets) {
+            assertContains(snippet, xml);
+        }
+    }
+
+    /**
+     * Checks that {@code testWORD_multi_authors.doc} yields three creator
+     * values and two manager values in its metadata.
+     */
+    @Test
+    public void testMultiAuthorsManagers() throws Exception {
+        final Metadata meta = getXML("testWORD_multi_authors.doc").metadata;
+
+        final String[] creators = meta.getValues(TikaCoreProperties.CREATOR);
+        assertEquals(3, creators.length);
+        assertEquals("author2", creators[1]);
+
+        final String[] managerNames = meta.getValues(OfficeOpenXMLExtended.MANAGER);
+        assertEquals(2, managerNames.length);
+        assertEquals("manager1", managerNames[0]);
+        assertEquals("manager2", managerNames[1]);
+    }
+}
+

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs {@link OfficeParser} over the sample workbook {@code protect.xlsx}
+ * from the test-documents resources.
+ */
+public class WriteProtectedParserTest {
+
+    /**
+     * Parses {@code protect.xlsx} and checks that the extracted text
+     * contains "Office".
+     */
+    @Test
+    public void testWriteProtected() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+
+        // Look the resource up through this class and close the stream when
+        // done, as the other parser tests in this package do.
+        try (InputStream input = WriteProtectedParserTest.class.getResourceAsStream(
+                "/test-documents/protect.xlsx")) {
+            new OfficeParser().parse(input, handler, new Metadata(), new ParseContext());
+        }
+
+        assertContains("Office", handler.toString());
+    }
+}

svn commit: r1725014 [15/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-m...

Reply via email to