Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.getTestFile;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the Old Excel (2-4) parser, all run against the same test workbook
+ */
+public class OldExcelParserTest extends TikaTest {
+    /** Name of the test document shared by every test in this class. */
+    private static final String TEST_FILE = "testEXCEL_4.xls";
+
+    /**
+     * Check the default detector reports the Excel 4 sheet media type
+     */
+    @Test
+    public void testDetection() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(TEST_FILE)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-excel.sheet.4"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    /**
+     * Check the parser reports the content type, and no title or subject.
+     * Disabled, until we can get the POI code to tell us the version
+     */
+    @Test
+    @Ignore
+    public void testMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the stream once parsing is done, as the other tests here do
+        try (TikaInputStream stream = getTestFile(TEST_FILE)) {
+            new OldExcelParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // We can get the content type
+        assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
+
+        // But no other metadata
+        assertNull(metadata.get(TikaCoreProperties.TITLE));
+        assertNull(metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check we can get the plain text properly
+     */
+    @Test
+    public void testPlainText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (TikaInputStream stream = getTestFile(TEST_FILE)) {
+            new OldExcelParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String text = handler.toString();
+
+        // Check we find a few words we expect in there
+        assertContains("Size", text);
+        assertContains("Returns", text);
+
+        // Check we find a few numbers we expect in there
+        assertContains("11", text);
+        assertContains("784", text);
+    }
+
+    /**
+     * Check the HTML version comes through correctly
+     */
+    @Test
+    public void testHTML() throws Exception {
+        XMLResult result = getXML(TEST_FILE);
+        String xml = result.xml;
+
+        // Sheet name not found - only 5+ have sheet names
+        assertNotContained("<p>Sheet 1</p>", xml);
+
+        // String cells
+        assertContains("<p>Table 10 -", xml);
+        assertContains("<p>Tax</p>", xml);
+        assertContains("<p>N/A</p>", xml);
+
+        // Number cells
+        assertContains("<p>(1)</p>", xml);
+        assertContains("<p>5.0</p>", xml);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+    /**
+     * Parses the named file from /test-documents/ with the auto-detecting
+     * parser, closing the stream once the parse is done.
+     *
+     * @param name     file name, relative to /test-documents/
+     * @param handler  receives the parsed content
+     * @param metadata receives the parsed metadata
+     */
+    private static void parseTestDocument(String name, ContentHandler handler,
+                                          Metadata metadata) throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/" + name)) {
+            // Fail clearly here, rather than inside the parser, if the file is missing
+            assertNotNull("Test document not found: " + name, stream);
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+    }
+
+    /**
+     * Parses the named test file through an indenting XML serializer.
+     *
+     * @param name     file name, relative to /test-documents/
+     * @param metadata receives the parsed metadata
+     * @return the serialized XML output
+     */
+    private static String parseTestDocumentToXML(String name, Metadata metadata)
+            throws Exception {
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        parseTestDocument(name, handler, metadata);
+        return sw.toString();
+    }
+
+    /**
+     * Make sure we don't have nested html docs, ie that the output splits
+     * into exactly two pieces around both the opening and closing body tags
+     */
+    private static void assertSingleBody(String xml) {
+        assertEquals(2, xml.split("<body>").length);
+        assertEquals(2, xml.split("<\\/body>").length);
+    }
+
+    @Test
+    public void testOutlookParsing() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parseTestDocument("test-outlook.msg", handler, metadata);
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Microsoft Outlook Express 6",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "Nouvel utilisateur de Outlook Express",
+                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(Metadata.AUTHOR));
+
+        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+        assertEquals(
+                "2007-04-05T16:26:06Z",
+                metadata.get(TikaCoreProperties.CREATED));
+
+        String content = handler.toString();
+        assertContains("Microsoft Outlook Express 6", content);
+        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+        assertContains("Nouvel utilisateur de Outlook Express", content);
+        assertContains("Messagerie et groupes de discussion", content);
+    }
+
+    /**
+     * Test case for TIKA-197
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+     */
+    @Test
+    public void testMultipleCopies() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parseTestDocument("testMSG.msg", handler, metadata);
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // "From" should turn up in the text once, and only once
+        String content = handler.toString();
+        Pattern pattern = Pattern.compile("From");
+        Matcher matcher = pattern.matcher(content);
+        assertTrue(matcher.find());
+        assertFalse(matcher.find());
+    }
+
+    /**
+     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+     */
+    @Test
+    public void testOutlookNew() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parseTestDocument("test-outlook2003.msg", handler, metadata);
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Welcome to Microsoft Office Outlook 2003",
+                metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("Outlook 2003", content);
+        assertContains("Streamlined Mail Experience", content);
+        assertContains("Navigation Pane", content);
+    }
+
+    @Test
+    public void testOutlookHTMLVersion() throws Exception {
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        String content = parseTestDocumentToXML("testMSG_chinese.msg", metadata);
+
+        // As the HTML version should have been processed, ensure
+        // we got some of the links
+        // NOTE(review): "[email protected]" looks like a mail-archive redaction of
+        // the real address - confirm the literal against the repository copy
+        assertContains("<dd>[email protected]</dd>", content);
+        assertContains("<p>Alfresco MSG format testing", content);
+        assertContains("<li>1", content);
+        assertContains("<li>2", content);
+
+        // Make sure we don't have nested html docs
+        assertSingleBody(content);
+
+        // Make sure that the Chinese actually came through
+        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("\u9673\u60E0\u73CD", content);
+    }
+
+    @Test
+    public void testOutlookForwarded() throws Exception {
+        // Check the HTML version
+        String content = parseTestDocumentToXML("testMSG_forwarded.msg", new Metadata());
+
+        // Make sure we don't have nested docs
+        assertSingleBody(content);
+    }
+
+    @Test
+    public void testOutlookHTMLfromRTF() throws Exception {
+        // Check the HTML version
+        String content = parseTestDocumentToXML("test-outlook2003.msg", new Metadata())
+                .replaceAll("<p>\\s+", "<p>");
+
+        // As the HTML version should have been processed, ensure
+        // we got some of the links
+        assertContains("<dd>New Outlook User</dd>", content);
+        assertContains("designed <i>to help you", content);
+        assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
+
+        // Link - check text around it, and the link itself
+        assertContains("sign up for a free subscription", content);
+        assertContains("Office Newsletter", content);
+        assertContains("newsletter will be sent to you", content);
+        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
+
+        // Make sure we don't have nested html docs
+        assertSingleBody(content);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+/**
+ * Tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
+
+    /**
+     * For office files which don't have anything embedded in them
+     */
+    @Test
+    public void testWithoutEmbedded() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        String[] files = new String[]{
+                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+                "testVISIO.vsd", "test-outlook.msg"
+        };
+        for (String file : files) {
+            // Process it without recursing
+            TrackingHandler handler = process(file, extractor, false);
+
+            // Won't have fired
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+
+            // Ditto with recursing
+            handler = process(file, extractor, true);
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+        }
+    }
+
+    /**
+     * Office files with embedded images, but no other
+     * office files in them
+     */
+    @Test
+    public void testEmbeddedImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xls", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertNull(handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.doc", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.doc", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.jpg", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+    }
+
+    /**
+     * Office files which have other office files
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p>
+     * eg xls
+     *     {@literal ->} word
+     *         {@literal ->} image
+     *         {@literal ->} image
+     *     {@literal ->} powerpoint
+     *     {@literal ->} excel
+     *         {@literal ->} image
+     */
+    @Test
+    public void testEmbeddedOfficeFiles() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xls", extractor, false);
+        assertEquals(5, handler.filenames.size());
+        assertEquals(5, handler.mediaTypes.size());
+
+        // We don't know their filenames
+        assertNull(handler.filenames.get(0));
+        assertNull(handler.filenames.get(1));
+        assertNull(handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
+
+
+        // With recursion, should get the images embedded in the office files too
+        handler = process("testEXCEL_embeded.xls", extractor, true);
+        assertEquals(17, handler.filenames.size());
+        assertEquals(17, handler.mediaTypes.size());
+
+        assertNull(handler.filenames.get(0));
+        assertNull(handler.filenames.get(1));
+        assertNull(handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("1", handler.filenames.get(4));
+        assertNull(handler.filenames.get(5));
+        assertEquals("2", handler.filenames.get(6));
+        assertEquals("image1.png", handler.filenames.get(7));
+        assertEquals("image2.jpg", handler.filenames.get(8));
+        assertEquals("image3.png", handler.filenames.get(9));
+        assertEquals("image1.png", handler.filenames.get(16));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.doc", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // Filenames are a bit iffy...
+        // Should really be 3*embedded pictures then 3*icons then embedded docs
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertNull(handler.filenames.get(6));
+        assertEquals("_1345471035.ppt", handler.filenames.get(7));
+        assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.doc", extractor, true);
+        assertEquals(16, handler.filenames.size());
+        assertEquals(16, handler.mediaTypes.size());
+
+        // We don't know their filenames, except for doc images + docx
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertNull(handler.filenames.get(6));
+        assertEquals("image2.png", handler.filenames.get(7));
+        assertEquals("image3.jpeg", handler.filenames.get(8));
+        assertEquals("image4.png", handler.filenames.get(9));
+        for (int i = 11; i < 14; i++) {
+            assertNull(handler.filenames.get(i));
+        }
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.ppt", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We don't get all that helpful filenames
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals("2", handler.filenames.get(1));
+        assertNull(handler.filenames.get(2));
+        assertNull(handler.filenames.get(3));
+        assertNull(handler.filenames.get(4));
+        assertNull(handler.filenames.get(5));
+        assertNull(handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+        // Run again on PowerPoint but with recursion
+        handler = process("testPPT_embeded.ppt", extractor, true);
+        assertEquals(11, handler.filenames.size());
+        assertEquals(11, handler.mediaTypes.size());
+
+        assertEquals("1", handler.filenames.get(0));
+        assertNull(handler.filenames.get(1));
+        assertEquals("2", handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertNull(handler.filenames.get(6));
+        assertNull(handler.filenames.get(7));
+        assertNull(handler.filenames.get(8));
+        assertNull(handler.filenames.get(9));
+        assertNull(handler.filenames.get(10));
+
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .xls
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .docx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+        // Word, with a non-office file (PDF)
+        handler = process("testWORD_embedded_pdf.doc", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
+
+
+        // Outlook with a text file and a word document
+        handler = process("testMSG_att_doc.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("test-unicode.doc", handler.filenames.get(0));
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+        assertEquals("pj1.txt", handler.filenames.get(1));
+        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+        // Outlook with a pdf and another outlook message
+        handler = process("testMSG_att_msg.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+    }
+
+    /** For a .docx container: expects two embedded files, including the named worksheet .bin */
+    @Test
+    public void testEmbeddedOfficeFilesXML() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        TrackingHandler handler = process("EmbeddedDocument.docx", extractor, false);
+        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
+        assertEquals(2, handler.filenames.size());
+    }
+
+    /** For a PowerPoint file with pictures: expects both a JPEG and a PNG to be reported */
+    @Test
+    public void testPowerpointImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        TrackingHandler handler = process("pictures.ppt", extractor, false);
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; +import java.util.Locale; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** Tests for PowerPoint (.ppt) documents: some tests run OfficeParser directly and inspect the body text and metadata, others inspect the XML returned by getXML. */ public class PowerPointParserTest extends TikaTest { + + /** Parses testPPT.ppt with OfficeParser and checks the content type, title, creator/author and two phrases of the extracted text. */ @Test + public void testPowerPointParser() throws Exception { + try (InputStream input = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT.ppt")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/vnd.ms-powerpoint", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + String content = handler.toString(); +
assertContains("Sample Powerpoint Slide", content); + assertContains("Powerpoint X for Mac", content); + } + } + + /** Checks the XML returned by getXML for testPPT_various.ppt (paragraphs, table cells, list items, Japanese and Gothic text) together with the keywords and subject metadata. */ @Test + public void testVarious() throws Exception { + Metadata metadata = new Metadata(); + String xml = getXML("testPPT_various.ppt", metadata).xml; + assertContains("<p>Footnote appears here", xml); + assertContains("<p>[1] This is a footnote.", xml); + assertContains("<p>This is the header text.</p>", xml); + assertContains("<p>This is the footer text.</p>", xml); + assertContains("<p>Here is a text box</p>", xml); + assertContains("<p>Bold ", xml); + assertContains("italic underline superscript subscript", xml); + assertContains("underline", xml); + assertContains("superscript", xml); + assertContains("subscript", xml); + assertContains("<p>Here is a citation:", xml); + assertContains("Figure 1 This is a caption for Figure 1", xml); + assertContains("(Kramer)", xml); + assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml); + assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml); + assertContains("<p>Row 1 column 1</p>", xml); + assertContains("<p>Row 2 column 2</p>", xml); + assertContains("<p>This is a hyperlink", xml); + assertContains("<p>Here is a list:", xml); + for(int row=1;row<=3;row++) { + //assertContains("·\tBullet " + row, content); + //assertContains("\u00b7\tBullet " + row, content); + assertContains("<p>Bullet " + row, xml); + } + assertContains("Here is a numbered list:", xml); + for(int row=1;row<=3;row++) { + //assertContains(row + ")\tNumber bullet " + row, content); + //assertContains(row + ") Number bullet " + row, content); + // TODO: OOXMLExtractor fails to number the bullets: + assertContains("<p>Number bullet " + row, xml); + } + + for(int row=1;row<=2;row++) { + for(int col=1;col<=3;col++) { + assertContains("Row " + row + " Col " + col, xml); + } + } + assertContains("Keyword1 Keyword2", xml); + assertEquals("Keyword1 Keyword2", + metadata.get(TikaCoreProperties.KEYWORDS)); + + assertContains("Subject
is here", xml); + assertEquals("Subject is here", + metadata.get(OfficeOpenXMLCore.SUBJECT)); + // TODO: Remove subject in Tika 2.0 + assertEquals("Subject is here", + metadata.get(Metadata.SUBJECT)); + + assertContains("Suddenly some Japanese text:", xml); + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", xml); + // 6 other characters + assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", + xml); + + assertContains("And then some Gothic text:", xml); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", + xml); + } + + /** Expects the master footer text in the output, and neither the "Click to edit Master" boilerplate nor any asterisk. */ @Test + public void testMasterFooter() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT_masterFooter.ppt")) { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } + + String content = handler.toString(); + assertContains("Master footer is here", content); + + // Make sure boilerplate text didn't come through: + assertEquals(-1, content.indexOf("Click to edit Master")); + + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); + } + + /** + * TIKA-712 Master Slide Text from PPT and PPTX files + * should be extracted too + */ + @Test + public void testMasterText() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT_masterText.ppt")) { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } + + String content = handler.toString(); + assertContains("Text that I added to the master slide", content); + + // Make sure boilerplate text didn't come through: + assertEquals(-1, content.indexOf("Click to edit Master")); + + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); + } + +
@Test + /** Same checks as testMasterText, run against testPPT_masterText2.ppt. */ public void testMasterText2() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT_masterText2.ppt")) { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } + + String content = handler.toString(); + assertContains("Text that I added to the master slide", content); + + // Make sure boilerplate text didn't come through: + assertEquals(-1, content.indexOf("Click to edit Master")); + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); + } + + /** + * Ensures that custom OLE2 (HPSF) properties are extracted + */ + @Test + public void testCustomProperties() throws Exception { + Metadata metadata = new Metadata(); + + try (InputStream input = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT_custom_props.ppt")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } + + assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("1", metadata.get(Office.SLIDE_COUNT)); + assertEquals("3", metadata.get(Office.WORD_COUNT)); + assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("true",
metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + } + + // TIKA-1025 + /** Expects empty "embedded" placeholder divs with ids 1 and 14 in the XML for testPPT_embedded2.ppt. */ @Test + public void testEmbeddedPlacedholder() throws Exception { + XMLResult result = getXML("testPPT_embedded2.ppt"); + assertContains("<div class=\"embedded\" id=\"1\" />", result.xml); + assertContains("<div class=\"embedded\" id=\"14\" />", result.xml); + } + + // TIKA-817 + @Test + public void testAutoDatePPT() throws Exception { + //decision was made in POI-52367 not to generate + //autodate automatically. For pptx, where value is stored, + //value is extracted. For ppt, however, no date is extracted. + XMLResult result = getXML("testPPT_autodate.ppt"); + assertContains( + "<div class=\"slide-content\"><p>Now</p>", + result.xml); + } + + /** Expects a slide-comment paragraph that opens with the bold text "Allison, Timothy B. (ATB)" - presumably the comment author. */ @Test + public void testCommentAuthorship() throws Exception { + XMLResult r = getXML("testPPT_comment.ppt"); + assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B.
(ATB)", r.xml); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Tests for Microsoft Project (MPP) Files. + * + * Note - we don't currently have a dedicated Project + * Parser, all we have is the common office metadata + */ +public class ProjectParserTest { + + /** Runs the shared checks in doTestProject against testPROJECT2003.mpp. */ @Test + public void testProject2003() throws Exception { + try (InputStream input = ProjectParserTest.class.getResourceAsStream( + "/test-documents/testPROJECT2003.mpp")) { + doTestProject(input); + } + } + + /** Runs the shared checks in doTestProject against testPROJECT2007.mpp. */ @Test + public void testProject2007() throws Exception { + try (InputStream input = ProjectParserTest.class.getResourceAsStream( + "/test-documents/testPROJECT2007.mpp")) { + doTestProject(input); + } + } + + /** Parses the given stream with OfficeParser and asserts the content type, the standard and "custom:" metadata values, and that no body text is produced. The caller keeps ownership of the stream. */ private void doTestProject(InputStream input) throws Exception { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/vnd.ms-project", + metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT)); + assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Comment Vulpes vulpes
comment", metadata.get(TikaCoreProperties.COMMENTS)); + + assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY)); + assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER)); + assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY)); + + assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE)); + + // Custom Project metadata is present with prefix + assertEquals("0%", metadata.get("custom:% Complete")); + assertEquals("0%", metadata.get("custom:% Work Complete")); + assertEquals("\u00a3" + "0.00", metadata.get("custom:Cost")); + assertEquals("2d?", metadata.get("custom:Duration")); + assertEquals("16h", metadata.get("custom:Work")); + + // Currently, we don't do textual contents of the file + String content = handler.toString(); + assertEquals("", content); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** Test for Publisher (.pub) files parsed with OfficeParser. */ public class PublisherParserTest { + + /** Parses testPUBLISHER.pub and checks the content type, a null title, creator/author and two strings in the extracted text. */ @Test + public void testPublisherParser() throws Exception { + try (InputStream input = PublisherParserTest.class.getResourceAsStream( + "/test-documents/testPUBLISHER.pub")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/x-mspublisher", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR)); + String content = handler.toString(); + assertContains("0123456789", content); + assertContains("abcdef", content); + } + } + +} Added:
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; + +import org.apache.tika.TikaTest.TrackingHandler; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Tests for the TNEF (winmail.dat) parser + */ +public class TNEFParserTest extends AbstractPOIContainerExtractionTest { + private static final String file = "testWINMAIL.dat"; + + /** Detection check: DefaultDetector must report application/vnd.ms-tnef for the test file. */ @Test + public void testBasics() throws Exception { + Detector detector = new DefaultDetector(); + try (TikaInputStream stream = getTestFile(file)) { + assertEquals( + MediaType.application("vnd.ms-tnef"), + detector.detect(stream, new Metadata())); + } + } + + /** Parses the test file with TNEFParser and checks title and subject. NOTE(review): this method does not close the stream it opens, whereas testBasics uses try-with-resources - consider doing the same here. */ @Test + public void testMetadata() throws Exception { + TikaInputStream stream = getTestFile(file); + + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + + TNEFParser tnef = new TNEFParser(); + tnef.parse(stream, handler, metadata, new ParseContext()); + + assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("This is a test message", metadata.get(Metadata.SUBJECT)); + } + + /** + * Check the Rtf and Attachments are returned + * as expected + */ + @Test + public void testBodyAndAttachments() throws Exception { + ContainerExtractor extractor = new ParserContainerExtractor(); + + // Process it with recursing + // Will have the message body RTF and the attachments + TrackingHandler handler = process(file, extractor, true); + assertEquals(6, handler.filenames.size()); +
assertEquals(6, handler.mediaTypes.size()); + + // We know the filenames for all of them + assertEquals("message.rtf", handler.filenames.get(0)); + assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0)); + + assertEquals("quick.doc", handler.filenames.get(1)); + assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1)); + + assertEquals("quick.html", handler.filenames.get(2)); + assertEquals(MediaType.text("html"), handler.mediaTypes.get(2)); + + assertEquals("quick.pdf", handler.filenames.get(3)); + assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3)); + + assertEquals("quick.txt", handler.filenames.get(4)); + assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4)); + + assertEquals("quick.xml", handler.filenames.get(5)); + assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5)); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** Test for Visio (.vsd) files parsed with OfficeParser. */ public class VisioParserTest { + + /** Parses testVISIO.vsd and checks the content type, an empty title, the creator and one phrase of the extracted text. */ @Test + public void testVisioParser() throws Exception { + try (InputStream input = VisioParserTest.class.getResourceAsStream( + "/test-documents/testVISIO.vsd")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/vnd.visio", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR)); + String content = handler.toString(); + assertContains("Some random text, on a page", content); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1725014&view=auto ============================================================================== ---
tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,496 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; +import java.util.Locale; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class WordParserTest extends TikaTest { + + @Test + public void testWordParser() throws Exception { + try (InputStream input = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD.doc")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + assertContains("Sample Word Document", handler.toString()); + } + } + + @Test + public void testWordWithWAV() throws Exception { + try (InputStream input = WordParserTest.class.getResourceAsStream( + "/test-documents/Doc1_ole.doc")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertContains("MSj00974840000[1].wav", handler.toString()); + } + } + + /** + * Test that the word converter is able to generate the + * 
correct HTML for the document + */ + @Test + public void testWordHTML() throws Exception { + + // Try with a document containing various tables and + // formattings + XMLResult result = getXML("testWORD.doc"); + String xml = result.xml; + Metadata metadata = result.metadata; + + assertEquals( + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + assertTrue(xml.contains("Sample Word Document")); + + // Check that custom headings came through + assertTrue(xml.contains("<h1 class=\"title\">")); + // Regular headings + assertTrue(xml.contains("<h1>Heading Level 1</h1>")); + assertTrue(xml.contains("<h3>Heading Level 3</h3>")); + // Bold and italic + assertTrue(xml.contains("<b>BOLD</b>")); + assertTrue(xml.contains("<i>ITALIC</i>")); + // Table + assertTrue(xml.contains("<table>")); + assertTrue(xml.contains("<td>")); + // TODO - Check for the nested table + // Links + assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); + // Paragraphs with other styles + assertTrue(xml.contains("<p class=\"signature\">This one")); + + // Try with a document that contains images + xml = getXML("testWORD_3imgs.doc").xml; + + // Images 1-3 + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\"")); + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\"")); + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\"")); + + // Text too + assertTrue(xml.contains("<p>The end!")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs.doc").xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs + 
assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs2.doc").xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs + assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); + } + + @Test + public void testEmbeddedNames() throws Exception { + String result = getXML("testWORD_embedded_pdf.doc").xml; + + // Make sure the embedded div comes out after "Here + // is the pdf file" and before "Bye Bye": + int i = result.indexOf("Here is the pdf file:"); + assertTrue(i != -1); + int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />"); + assertTrue(j != -1); + int k = result.indexOf("Bye Bye"); + assertTrue(k != -1); + + assertTrue(i < j); + assertTrue(j < k); + } + + // TIKA-982 + @Test + public void testEmbeddedRTF() throws Exception { + String result = getXML("testWORD_embedded_rtf.doc").xml; + assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />")); + assertTrue(result.contains("_1404039792.rtf")); + } + + // TIKA-1019 + @Test + public void testDocumentLink() throws Exception { + String result = getXML("testDocumentLink.doc").xml; + assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />")); + assertTrue(result.contains("_1327495610.unknown")); + } + + @Test + public void testWord6Parser() throws Exception { + try (InputStream input = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD6.doc")) { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("The quick brown fox jumps over the lazy dog", 
metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT)); + assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR)); + assertContains("The quick brown fox jumps over the lazy dog", handler.toString()); + } + } + + @Test + public void testVarious() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_various.doc")) { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } + + String content = handler.toString(); + //content = content.replaceAll("\\s+"," "); + assertContains("Footnote appears here", content); + assertContains("This is a footnote.", content); + assertContains("This is the header text.", content); + assertContains("This is the footer text.", content); + assertContains("Here is a text box", content); + assertContains("Bold", content); + assertContains("italic", content); + assertContains("underline", content); + assertContains("superscript", content); + assertContains("subscript", content); + assertContains("Here is a citation:", content); + assertContains("Figure 1 This is a caption for Figure 1", content); + assertContains("(Kramer)", content); + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," ")); + assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," ")); + assertContains("This is a hyperlink", content); + assertContains("Here is a list:", content); + for(int row=1;row<=3;row++) { + //assertContains("·\tBullet " + row, content); + //assertContains("\u00b7\tBullet " + row, content); + 
assertContains("Bullet " + row, content); + } + assertContains("Here is a numbered list:", content); + for(int row=1;row<=3;row++) { + //assertContains(row + ")\tNumber bullet " + row, content); + //assertContains(row + ") Number bullet " + row, content); + // TODO: WordExtractor fails to number the bullets: + assertContains("Number bullet " + row, content); + } + + for(int row=1;row<=2;row++) { + for(int col=1;col<=3;col++) { + assertContains("Row " + row + " Col " + col, content); + } + } + + assertContains("Keyword1 Keyword2", content); + assertEquals("Keyword1 Keyword2", + metadata.get(TikaCoreProperties.KEYWORDS)); + + assertContains("Subject is here", content); + // TODO: Move to OO subject in Tika 2.0 + assertEquals("Subject is here", + metadata.get(Metadata.SUBJECT)); + assertEquals("Subject is here", + metadata.get(OfficeOpenXMLCore.SUBJECT)); + + assertContains("Suddenly some Japanese text:", content); + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", content); + // 6 other characters + assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content); + + assertContains("And then some Gothic text:", content); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + } + + /** + * TIKA-1044 - Handle documents where parts of the + * text have no formatting or styles applied to them + */ + @Test + public void testNoFormat() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_no_format.doc")) { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } + + String content = handler.toString(); + assertContains("Will generate an exception", content); + } + + /** + * Ensures that custom OLE2 (HPSF) properties are extracted + */ + @Test + public void testCustomProperties() 
throws Exception { + Metadata metadata = new Metadata(); + + try (InputStream input = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_custom_props.doc")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } + + assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); + assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("1", metadata.get(Office.PAGE_COUNT)); + assertEquals("2", metadata.get(Office.WORD_COUNT)); + assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); + assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS)); + // TODO: Move to OO subject in Tika 2.0 + assertEquals("My subject", metadata.get(Metadata.SUBJECT)); + assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY)); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); + } + + @Test + public void testExceptions1() throws Exception { + XMLResult xml; + Level logLevelStart = 
Logger.getRootLogger().getLevel(); + Logger.getRootLogger().setLevel(Level.ERROR); + try { + xml = getXML("testException1.doc"); + assertContains("total population", xml.xml); + xml = getXML("testException2.doc"); + assertContains("electric charge", xml.xml); + } finally { + Logger.getRootLogger().setLevel(logLevelStart); + } + } + + @Test + public void testTabularSymbol() throws Exception { + assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " ")); + } + + /** + * TIKA-1229 Hyperlinks in Headers should be output as such, + * not plain text with control characters + */ + @Test + public void testHeaderHyperlinks() throws Exception { + XMLResult result = getXML("testWORD_header_hyperlink.doc"); + String xml = result.xml; + Metadata metadata = result.metadata; + + assertEquals( + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR)); + assertContains("example.com", xml); + + // Check we don't have the special text HYPERLINK + assertFalse(xml.contains("HYPERLINK")); + + // Check we do have the link + assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml); + + // Check we do have the email + assertContains("<a href=\"mailto:[email protected]\">ab@", xml); + } + + @Test + public void testControlCharacter() throws Exception { + assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " ")); + } + + @Test + public void testParagraphsAfterTables() throws Exception { + XMLResult result = getXML("test_TIKA-1251.doc"); + + String xml = result.xml; + Metadata metadata = result.metadata; + + assertEquals( + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); + + assertContains("<p>1. 
Organisering av vakten:</p>", xml); + + } + + @Test + public void testHyperlinkStringIOOBESmartQuote() throws Exception { + //TIKA-1512, one cause: closing double quote is a smart quote + //test file contributed by user + XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc"); + assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml); + } + + @Test + @Ignore //until we determine whether we can include test docs or not + public void testHyperlinkStringLongNoCloseQuote() throws Exception { + //TIKA-1512, one cause: no closing quote on really long string + //test file derived from govdocs1 012152.doc + XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc"); + assertContains("href=\"http://www.lexis.com", result.xml); + } + + @Test + @Ignore //until we determine whether we can include test docs or not + public void testHyperlinkStringLongCarriageReturn() throws Exception { + //TIKA-1512, one cause: no closing quote, but carriage return + //test file derived from govdocs1 040044.doc + XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc"); + assertContains("href=\"http://www.nib.org", result.xml); + } + + @Test + public void testDOCParagraphNumbering() throws Exception { + String xml = getXML("testWORD_numbered_list.doc").xml; + assertContains("1) This", xml); + assertContains("a) Is", xml); + assertContains("i) A multi", xml); + assertContains("ii) Level", xml); + assertContains("1. Within cell 1", xml); + assertContains("b. Cell b", xml); + assertContains("iii) List", xml); + assertContains("2) foo", xml); + assertContains("ii) baz", xml); + assertContains("ii) foo", xml); + assertContains("II. bar", xml); + assertContains("6. six", xml); + assertContains("7. seven", xml); + assertContains("a. seven a", xml); + assertContains("e. seven e", xml); + assertContains("2. A ii 2", xml); + assertContains("3. 
page break list 3", xml); + assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml); + assertContains("1.1.1. 1.1.1", xml); + assertContains("1.1. 1.2->1.1 //set the value", xml); + + assertContains("add a list here", xml); + //TODO: not currently pulling numbers out of comments + assertContains(">comment list 1", xml); + + } + + @Test + public void testDOCOverrideParagraphNumbering() throws Exception { + String xml = getXML("testWORD_override_list_numbering.doc").xml; + + //Test 1 + assertContains("1.1.1.1...1 1.1.1.1...1", xml); + assertContains("1st.2.3someText 1st.2.3someText", xml); + assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml); + assertContains("5th 5th", xml); + + + //Test 2 + assertContains("1.a.I 1.a.I", xml); + //test no reset because level 2 is not sufficient to reset + assertContains("1.b.III 1.b.III", xml); + //test restarted because of level 0's increment to 2 + assertContains("2.a.I 2.a.I", xml); + //test handling of skipped level + assertContains("2.b 2.b", xml); + + //Test 3 + assertContains("(1)) (1))", xml); + //tests start level 1 at 17 and + assertContains("2.17 2.17", xml); + //tests that isLegal turns everything into decimal + assertContains("2.18.2.1 2.18.2.1", xml); + assertContains(">2 2", xml); + + //Test4 + assertContains(">1 1", xml); + assertContains(">A A", xml); + assertContains(">B B", xml); + assertContains(">C C", xml); + assertContains(">4 4", xml); + + //Test5 + assertContains(">00 00", xml); + assertContains(">01 01", xml); + assertContains(">01. 
01.", xml); + assertContains(">01..1 01..1", xml); + assertContains(">02 02", xml); + } + + @Test + public void testMultiAuthorsManagers() throws Exception { + XMLResult r = getXML("testWORD_multi_authors.doc"); + String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR); + assertEquals(3, authors.length); + assertEquals("author2", authors[1]); + + String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER); + assertEquals(2, managers.length); + assertEquals("manager1", managers[0]); + assertEquals("manager2", managers[1]); + } +} + Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.apache.tika.TikaTest.assertContains;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Test that parses the protect.xlsx test document with {@link OfficeParser}.
 */
public class WriteProtectedParserTest {

    /**
     * Parses /test-documents/protect.xlsx and checks that the extracted
     * plain text contains "Office".
     *
     * @throws Exception if the document cannot be read or parsed
     */
    @Test
    public void testWriteProtected() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();

        // try-with-resources so the stream is closed even when parse() throws
        try (InputStream input = ExcelParserTest.class.getResourceAsStream(
                "/test-documents/protect.xlsx")) {
            new OfficeParser().parse(input, handler, metadata, new ParseContext());
        }

        String content = handler.toString();
        assertContains("Office", content);
    }
}
