Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri May 29 14:36:21 2015 @@ -16,11 +16,13 @@ */ package org.apache.tika.parser.microsoft.ooxml; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import javax.xml.transform.OutputKeys; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.io.PrintStream; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class OOXMLParserTest extends TikaTest { private Parser parser = new AutoDetectParser(); @@ -63,7 +62,7 @@ public class OOXMLParserTest extends Tik @Test public void testExcel() throws Exception { - Metadata metadata = new Metadata(); + Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); @@ -78,7 +77,7 @@ public class OOXMLParserTest extends Tik assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - + String content = handler.toString(); assertContains("Sample Excel Worksheet", content); assertContains("Numbers and their Squares", content); @@ -125,7 +124,7 @@ public class OOXMLParserTest extends Tik // Percentage assertContains("2.50%", content); // Excel rounds up to 3%, but that requires Java 1.6 or later - if(System.getProperty("java.version").startsWith("1.5")) { + if (System.getProperty("java.version").startsWith("1.5")) { assertContains("2%", content); } else { assertContains("3%", content); @@ -144,24 +143,24 @@ public class OOXMLParserTest extends Tik // Fraction (2.5): # ?/? assertContains("2 1/2", content); - + // Below assertions represent outstanding formatting issues to be addressed // they are included to allow the issues to be progressed with the Apache POI // team - See TIKA-103. /************************************************************************* - // Date Format: m/d/yy - assertContains("03/10/2009", content); + // Date Format: m/d/yy + assertContains("03/10/2009", content); - // Date/Time Format - assertContains("19/01/2008 04:35", content); + // Date/Time Format + assertContains("19/01/2008 04:35", content); - // Custom Number (0 "dollars and" .00 "cents") - assertContains("19 dollars and .99 cents", content); + // Custom Number (0 "dollars and" .00 "cents") + assertContains("19 dollars and .99 cents", content); - // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) - assertContains("At 4:20 AM on Thursday May 17, 2007", content); - **************************************************************************/ + // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) + assertContains("At 4:20 AM on Thursday May 17, 2007", content); + **************************************************************************/ } finally { input.close(); } @@ -170,7 +169,7 @@ public class OOXMLParserTest extends Tik @Test @Ignore("OOXML-Strict not currently supported by POI, see #57699") public void testExcelStrict() throws Exception { - Metadata metadata = new Metadata(); + Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); @@ -185,7 +184,7 @@ public class OOXMLParserTest extends Tik assertEquals("Sample Spreadsheet", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Spreadsheet for testing", metadata.get(TikaCoreProperties.DESCRIPTION)); - + String content = handler.toString(); assertContains("Test spreadsheet", content); assertContains("This one is red", content); @@ -201,17 +200,17 @@ public class OOXMLParserTest extends Tik /** * We have a number of different powerpoint files, - * such as presentation, macro-enabled etc + * such as presentation, macro-enabled etc */ @Test public void testPowerPoint() throws Exception { - String[] extensions = new String[] { - "pptx", "pptm", "ppsm", "ppsx", "potm" - //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 - //"xps" // TIKA-418: Not yet supported by POI - }; + String[] extensions = new String[]{ + "pptx", "pptm", "ppsm", "ppsx", "potm" + //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 + //"xps" // TIKA-418: Not yet supported by POI + }; - String[] mimeTypes = new String[] { + String[] mimeTypes = new String[]{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", @@ -219,7 +218,7 @@ public class OOXMLParserTest extends Tik "application/vnd.ms-powerpoint.template.macroenabled.12" }; - for (int i=0; i<extensions.length; i++) { + for (int i = 0; i < extensions.length; i++) { String extension = extensions[i]; String filename = "testPPT." + extension; @@ -227,11 +226,11 @@ public class OOXMLParserTest extends Tik Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); - + InputStream input = getTestDocument(filename); try { parser.parse(input, handler, metadata, context); - + assertEquals( "Mime-type checking for " + filename, mimeTypes[i], @@ -239,31 +238,31 @@ public class OOXMLParserTest extends Tik assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Rajiv", metadata.get(Metadata.AUTHOR)); - + String content = handler.toString(); // Theme files don't have the text in them - if(extension.equals("thmx")) { + if (extension.equals("thmx")) { assertEquals("", content); } else { assertTrue( - "Text missing for " + filename + "\n" + content, - content.contains("Attachment Test") + "Text missing for " + filename + "\n" + content, + content.contains("Attachment Test") ); assertTrue( - "Text missing for " + filename + "\n" + content, - content.contains("This is a test file data with the same content") + "Text missing for " + filename + "\n" + content, + content.contains("This is a test file data with the same content") ); assertTrue( - "Text missing for " + filename + "\n" + content, - content.contains("content parsing") + "Text missing for " + filename + "\n" + content, + content.contains("content parsing") ); assertTrue( - "Text missing for " + filename + "\n" + content, - content.contains("Different words to test against") + "Text missing for " + filename + "\n" + content, + content.contains("Different words to test against") ); assertTrue( - "Text missing for " + filename + "\n" + content, - content.contains("Mystery") + "Text missing for " + filename + "\n" + content, + content.contains("Mystery") ); } } finally { @@ -271,20 +270,20 @@ public class OOXMLParserTest extends Tik } } } - + /** * Test that the metadata is already extracted when the body is processed. * See TIKA-1109 */ @Test public void testPowerPointMetadataEarly() throws Exception { - String[] extensions = new String[] { - "pptx", "pptm", "ppsm", "ppsx", "potm" - //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 - //"xps" // TIKA-418: Not yet supported by POI - }; + String[] extensions = new String[]{ + "pptx", "pptm", "ppsm", "ppsx", "potm" + //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 + //"xps" // TIKA-418: Not yet supported by POI + }; - final String[] mimeTypes = new String[] { + final String[] mimeTypes = new String[]{ "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", @@ -292,32 +291,30 @@ public class OOXMLParserTest extends Tik "application/vnd.ms-powerpoint.template.macroenabled.12" }; - for (int i=0; i<extensions.length; i++) { + for (int i = 0; i < extensions.length; i++) { String extension = extensions[i]; final String filename = "testPPT." + extension; Parser parser = new AutoDetectParser(); final Metadata metadata = new Metadata(); - // Allow the value to be access from the inner class - final int currentI = i; - ContentHandler handler = new BodyContentHandler() - { - public void startDocument () - { - assertEquals( - "Mime-type checking for " + filename, - mimeTypes[currentI], - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Rajiv", metadata.get(Metadata.AUTHOR)); + // Allow the value to be access from the inner class + final int currentI = i; + ContentHandler handler = new BodyContentHandler() { + public void startDocument() { + assertEquals( + "Mime-type checking for " + filename, + mimeTypes[currentI], + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Rajiv", metadata.get(Metadata.AUTHOR)); - } + } - }; + }; ParseContext context = new ParseContext(); - + InputStream input = getTestDocument(filename); try { parser.parse(input, handler, metadata, context); @@ -326,48 +323,49 @@ public class OOXMLParserTest extends Tik } } } - + /** * For the PowerPoint formats we don't currently support, ensure that - * we don't break either + * we don't break either */ @Test public void testUnsupportedPowerPoint() throws Exception { - String[] extensions = new String[] { "xps", "thmx" }; - String[] mimeTypes = new String[] { - "application/vnd.ms-xpsdocument", - "application/vnd.openxmlformats-officedocument" // Is this right? - }; - - for (int i=0; i<extensions.length; i++) { - String extension = extensions[i]; - String filename = "testPPT." + extension; - - Parser parser = new AutoDetectParser(); - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, filename); - ContentHandler handler = new BodyContentHandler(); - ParseContext context = new ParseContext(); - - InputStream input = getTestDocument(filename); - try { - parser.parse(input, handler, metadata, context); - - // Should get the metadata - assertEquals( - "Mime-type checking for " + filename, - mimeTypes[i], - metadata.get(Metadata.CONTENT_TYPE)); + String[] extensions = new String[]{"xps", "thmx"}; + String[] mimeTypes = new String[]{ + "application/vnd.ms-xpsdocument", + "application/vnd.openxmlformats-officedocument" // Is this right? + }; + + for (int i = 0; i < extensions.length; i++) { + String extension = extensions[i]; + String filename = "testPPT." + extension; + + Parser parser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + + InputStream input = getTestDocument(filename); + try { + parser.parse(input, handler, metadata, context); - // But that's about it - } finally { - input.close(); - } - } + // Should get the metadata + assertEquals( + "Mime-type checking for " + filename, + mimeTypes[i], + metadata.get(Metadata.CONTENT_TYPE)); + + // But that's about it + } finally { + input.close(); + } + } } - + /** * Test the plain text output of the Word converter + * * @throws Exception */ @Test @@ -393,6 +391,7 @@ public class OOXMLParserTest extends Tik /** * Test the plain text output of the Word converter + * * @throws Exception */ @Test @@ -415,69 +414,69 @@ public class OOXMLParserTest extends Tik /** * Test that the word converter is able to generate the - * correct HTML for the document + * correct HTML for the document */ @Test public void testWordHTML() throws Exception { - XMLResult result = getXML("testWORD.docx"); - String xml = result.xml; - Metadata metadata = result.metadata; - assertEquals( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - assertTrue(xml.contains("Sample Word Document")); - - // Check that custom headings came through - assertTrue(xml.contains("<h1 class=\"title\">")); - // Regular headings - assertTrue(xml.contains("<h1>Heading Level 1</h1>")); - assertTrue(xml.contains("<h2>Heading Level 2</h2>")); - // Headings with anchor tags in them - assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>")); - // Bold and italic - assertTrue(xml.contains("<b>BOLD</b>")); - assertTrue(xml.contains("<i>ITALIC</i>")); - // Table - assertTrue(xml.contains("<table>")); - assertTrue(xml.contains("<td>")); - // Links - assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); - // Anchor links - assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>")); - // Paragraphs with other styles - assertTrue(xml.contains("<p class=\"signature\">This one")); - - result = getXML("testWORD_3imgs.docx"); - xml = result.xml; - - // Images 2-4 (there is no 1!) - assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />")); - assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />")); - assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />")); - - // Text too - assertTrue(xml.contains("<p>The end!</p>")); - - // TIKA-692: test document containing multiple - // character runs within a bold tag: - xml = getXML("testWORD_bold_character_runs.docx").xml; - - // Make sure bold text arrived as single - // contiguous string even though Word parser - // handled this as 3 character runs - assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>")); - - // TIKA-692: test document containing multiple - // character runs within a bold tag: - xml = getXML("testWORD_bold_character_runs2.docx").xml; - - // Make sure bold text arrived as single - // contiguous string even though Word parser - // handled this as 3 character runs - assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>")); + XMLResult result = getXML("testWORD.docx"); + String xml = result.xml; + Metadata metadata = result.metadata; + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + assertTrue(xml.contains("Sample Word Document")); + + // Check that custom headings came through + assertTrue(xml.contains("<h1 class=\"title\">")); + // Regular headings + assertTrue(xml.contains("<h1>Heading Level 1</h1>")); + assertTrue(xml.contains("<h2>Heading Level 2</h2>")); + // Headings with anchor tags in them + assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>")); + // Bold and italic + assertTrue(xml.contains("<b>BOLD</b>")); + assertTrue(xml.contains("<i>ITALIC</i>")); + // Table + assertTrue(xml.contains("<table>")); + assertTrue(xml.contains("<td>")); + // Links + assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); + // Anchor links + assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>")); + // Paragraphs with other styles + assertTrue(xml.contains("<p class=\"signature\">This one")); + + result = getXML("testWORD_3imgs.docx"); + xml = result.xml; + + // Images 2-4 (there is no 1!) + assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />")); + assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />")); + assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />")); + + // Text too + assertTrue(xml.contains("<p>The end!</p>")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs.docx").xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs + assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs2.docx").xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs + assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); } /** @@ -490,7 +489,7 @@ public class OOXMLParserTest extends Tik StringWriter sw = new StringWriter(); SAXTransformerFactory factory = (SAXTransformerFactory) - SAXTransformerFactory.newInstance(); + SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); @@ -512,7 +511,7 @@ public class OOXMLParserTest extends Tik } /** - * Documents with some sheets are protected, but not all. + * Documents with some sheets are protected, but not all. * See TIKA-364. */ @Test @@ -539,7 +538,7 @@ public class OOXMLParserTest extends Tik } /** - * An excel document which is password protected. + * An excel document which is password protected. * See TIKA-437. */ @Test @@ -559,7 +558,7 @@ public class OOXMLParserTest extends Tik metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED)); - + String content = handler.toString(); assertContains("Office", content); } finally { @@ -750,7 +749,7 @@ public class OOXMLParserTest extends Tik /** * TIKA-712 Master Slide Text from PPT and PPTX files - * should be extracted too + * should be extracted too */ @Test public void testMasterText() throws Exception { @@ -807,151 +806,151 @@ public class OOXMLParserTest extends Tik */ @Test public void testExcelCustomProperties() throws Exception { - InputStream input = OOXMLParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_custom_props.xlsx"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OOXMLParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals( - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals(null, metadata.get(TikaCoreProperties.CREATOR)); - assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2006-09-12T15:06:44Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED)); - assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE)); - assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME)); - assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + InputStream input = OOXMLParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_custom_props.xlsx"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OOXMLParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(null, metadata.get(TikaCoreProperties.CREATOR)); + assertEquals(null, metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2006-09-12T15:06:44Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.LAST_MODIFIED)); + assertEquals("2011-08-22T14:24:38Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-08-22T14:24:38Z", metadata.get(Metadata.DATE)); + assertEquals("Microsoft Excel", metadata.get(Metadata.APPLICATION_NAME)); + assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } - + @Test public void testWordCustomProperties() throws Exception { - InputStream input = OOXMLParserTest.class.getResourceAsStream( - "/test-documents/testWORD_custom_props.docx"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OOXMLParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); - assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); - assertEquals("Microsoft Office Word",metadata.get(Metadata.APPLICATION_NAME)); - assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("1", metadata.get(Office.PAGE_COUNT)); - assertEquals("2", metadata.get(Office.WORD_COUNT)); - assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); - assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE)); - assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); - // TODO: Remove subject in Tika 2.0 - assertEquals("My subject", metadata.get(Metadata.SUBJECT)); - assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + InputStream input = OOXMLParserTest.class.getResourceAsStream( + "/test-documents/testWORD_custom_props.docx"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OOXMLParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); + assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME)); + assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("1", metadata.get(Office.PAGE_COUNT)); + assertEquals("2", metadata.get(Office.WORD_COUNT)); + assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE)); + assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); + // TODO: Remove subject in Tika 2.0 + assertEquals("My subject", metadata.get(Metadata.SUBJECT)); + assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } - + @Test public void testPowerPointCustomProperties() throws Exception { - InputStream input = OOXMLParserTest.class.getResourceAsStream( - "/test-documents/testPPT_custom_props.pptx"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OOXMLParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals( - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); - assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE)); - assertEquals("1", metadata.get(Office.SLIDE_COUNT)); - assertEquals("3", metadata.get(Office.WORD_COUNT)); - assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + InputStream input = OOXMLParserTest.class.getResourceAsStream( + "/test-documents/testPPT_custom_props.pptx"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OOXMLParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals( + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE)); + assertEquals("1", metadata.get(Office.SLIDE_COUNT)); + assertEquals("3", metadata.get(Office.WORD_COUNT)); + assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } // TIKA-989: @Test public void testEmbeddedPDF() throws Exception { - InputStream input = OOXMLParserTest.class.getResourceAsStream( - "/test-documents/testWORD_embedded_pdf.docx"); - Metadata metadata = new Metadata(); - StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = (SAXTransformerFactory) + InputStream input = OOXMLParserTest.class.getResourceAsStream( + "/test-documents/testWORD_embedded_pdf.docx"); + Metadata metadata = new Metadata(); + StringWriter sw = new StringWriter(); + SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.setResult(new StreamResult(sw)); - - try { - new OOXMLParser().parse(input, handler, metadata, new ParseContext()); - } finally { - input.close(); - } - String xml = sw.toString(); - int i = xml.indexOf("Here is the pdf file:"); - int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>"); - int k = xml.indexOf("Bye Bye"); - int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>"); - int m = xml.indexOf("Bye for real."); - assertTrue(i != -1); - assertTrue(j != -1); - assertTrue(k != -1); - assertTrue(l != -1); - assertTrue(m != -1); - assertTrue(i < j); - assertTrue(j < k); - assertTrue(k < l); - assertTrue(l < m); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.setResult(new StreamResult(sw)); + + try { + new OOXMLParser().parse(input, handler, metadata, new ParseContext()); + } finally { + input.close(); + } + String xml = sw.toString(); + int i = xml.indexOf("Here is the pdf file:"); + int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>"); + int k = xml.indexOf("Bye Bye"); + int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>"); + int m = xml.indexOf("Bye for real."); + assertTrue(i != -1); + assertTrue(j != -1); + assertTrue(k != -1); + assertTrue(l != -1); + assertTrue(m != -1); + assertTrue(i < j); + assertTrue(j < k); + assertTrue(k < l); + assertTrue(l < m); } // TIKA-997: @@ -970,35 +969,35 @@ public class OOXMLParserTest extends Tik assertTrue(i < j); assertTrue(j < k); } - + // TIKA-1006 @Test public void testWordNullStyle() throws Exception { - String xml = getXML("testWORD_null_style.docx").xml; - assertContains("Test av styrt dokument", xml); + String xml = getXML("testWORD_null_style.docx").xml; + assertContains("Test av styrt dokument", xml); } /** * TIKA-1044 - Handle word documents where parts of the - * text have no formatting or styles applied to them + * text have no formatting or styles applied to them */ @Test public void testNoFormat() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); - InputStream stream = WordParserTest.class.getResourceAsStream( - "/test-documents/testWORD_no_format.docx"); - try { - new OOXMLParser().parse(stream, handler, metadata, new ParseContext()); - } finally { - stream.close(); - } + InputStream stream = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_no_format.docx"); + try { + new OOXMLParser().parse(stream, handler, metadata, new ParseContext()); + } finally { + stream.close(); + } - String content = handler.toString(); - assertContains("This is a piece of text that causes an exception", content); + String content = handler.toString(); + assertContains("This is a piece of text that causes an exception", content); } - + // TIKA-1005: @Test public void testTextInsideTextBox() throws Exception { @@ -1013,12 +1012,12 @@ public class OOXMLParserTest extends Tik @Test public void testEmbeddedPPTXTwoSlides() throws Exception { String xml = getXML("testPPT_embedded_two_slides.pptx").xml; - assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />" , xml); - assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />" , xml); + assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />", xml); + assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />", xml); } - + /** - * Test for missing text described in + * Test for missing text described in * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>. * and TIKA-1317 */ @@ -1045,37 +1044,37 @@ public class OOXMLParserTest extends Tik //TIKA-1100: @Test public void testExcelTextBox() throws Exception { - Metadata metadata = new Metadata(); + Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); InputStream input = getTestDocument("testEXCEL_textbox.xlsx"); parser.parse(input, handler, metadata, context); String content = handler.toString(); - assertContains("some autoshape", content); - } + assertContains("some autoshape", content); + } //TIKA-792; with room for future missing bean tests @Test - public void testWordMissingOOXMLBeans() throws Exception{ + public void testWordMissingOOXMLBeans() throws Exception { //If a bean is missing, POI prints stack trace to stderr String[] fileNames = new String[]{ - "testWORD_missing_ooxml_bean1.docx",//TIKA-792 + "testWORD_missing_ooxml_bean1.docx",//TIKA-792 }; PrintStream origErr = System.err; - for (String fileName : fileNames){ - Metadata metadata = new Metadata(); + for (String fileName : fileNames) { + Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); InputStream input = getTestDocument(fileName); - + //grab stderr ByteArrayOutputStream errContent = new ByteArrayOutputStream(); System.setErr(new PrintStream(errContent, true, IOUtils.UTF_8.name())); parser.parse(input, handler, metadata, context); - + //return stderr System.setErr(origErr); - + String err = errContent.toString(IOUtils.UTF_8.name()); assertTrue(err.length() == 0); input.close(); @@ -1089,39 +1088,39 @@ public class OOXMLParserTest extends Tik //not the auto-generated date. XMLResult result = getXML("testPPT_autodate.pptx"); - assertContains("<p>Now</p>\n"+ - "<p>2011-12-19 10:20:04 AM</p>\n", result.xml); - + assertContains("<p>Now</p>\n" + + "<p>2011-12-19 10:20:04 AM</p>\n", result.xml); + } - + @Test public void testDOCXThumbnail() throws Exception { String xml = getXML("testDOCX_Thumbnail.docx").xml; int a = xml.indexOf("This file contains a thumbnail"); int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />"); - + assertTrue(a != -1); assertTrue(b != -1); assertTrue(a < b); } - + @Test public void testXLSXThumbnail() throws Exception { String xml = getXML("testXLSX_Thumbnail.xlsx").xml; int a = xml.indexOf("This file contains an embedded thumbnail by default"); int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.wmf\" />"); - + assertTrue(a != -1); assertTrue(b != -1); assertTrue(a < b); } - + @Test public void testPPTXThumbnail() throws Exception { String xml = getXML("testPPTX_Thumbnail.pptx").xml; int a = xml.indexOf("<body><p>This file contains an embedded thumbnail</p>"); int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />"); - + assertTrue(a != -1); assertTrue(b != -1); assertTrue(a < b); @@ -1171,7 +1170,7 @@ public class OOXMLParserTest extends Tik parser.parse(is, handler, m, context); } catch (EncryptedDocumentException ex) { exc = true; - } finally { + } finally { is.close(); } assertTrue(exc);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Fri May 29 14:36:21 2015 @@ -28,7 +28,7 @@ import org.junit.Test; public class AccessCheckerTest { @Test - public void testLegacy() throws AccessPermissionException{ + public void testLegacy() throws AccessPermissionException { Metadata m = getMetadata(false, false); //legacy behavior; don't bother checking Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri May 29 14:36:21 2015 @@ -59,12 +59,13 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.xml.sax.ContentHandler; + /** * Test case for parsing pdf files. */ public class PDFParserTest extends TikaTest { - public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN; + public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN; public static final MediaType TYPE_EMF = MediaType.application("x-emf"); public static final MediaType TYPE_PDF = MediaType.application("pdf"); public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); @@ -84,6 +85,21 @@ public class PDFParserTest extends TikaT Logger.getLogger("org.apache.pdfbox").setLevel(PDFBOX_LOG_LEVEL); } + private static int substringCount(String needle, String haystack) { + int upto = -1; + int count = 0; + while (true) { + final int next = haystack.indexOf(needle, upto); + if (next == -1) { + break; + } + count++; + upto = next + 1; + } + + return count; + } + @Test public void testPdfParsing() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! @@ -99,8 +115,8 @@ public class PDFParserTest extends TikaT assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR)); assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL)); assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE)); - - // Can't reliably test dates yet - see TIKA-451 + + // Can't reliably test dates yet - see TIKA-451 // assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE)); // assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED)); @@ -109,12 +125,12 @@ public class PDFParserTest extends TikaT assertContains("incubator", content); assertContains("Apache Software Foundation", content); // testing how the end of one paragraph is separated from start of the next one - assertTrue("should have word boundary after headline", + assertTrue("should have word boundary after headline", !content.contains("ToolkitApache")); - assertTrue("should have word boundary between paragraphs", + assertTrue("should have word boundary between paragraphs", !content.contains("libraries.Apache")); } - + @Test public void testPdfParsingMetadataOnly() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! @@ -149,80 +165,80 @@ public class PDFParserTest extends TikaT assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Document author", metadata.get(Metadata.AUTHOR)); assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE)); - + assertEquals("Custom Value", metadata.get("Custom Property")); - + assertEquals("Array Entry 1", metadata.get("Custom Array")); assertEquals(2, metadata.getValues("Custom Array").length); assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]); assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]); - + assertContains("Hello World!", content); } - + /** * PDFs can be "protected" with the default password. This means - * they're encrypted (potentially both text and metadata), - * but we can decrypt them easily. + * they're encrypted (potentially both text and metadata), + * but we can decrypt them easily. */ @Test public void testProtectedPDF() throws Exception { - Parser parser = new AutoDetectParser(); // Should auto-detect! - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); - InputStream stream = PDFParserTest.class.getResourceAsStream( - "/test-documents/testPDF_protected.pdf"); - try { - parser.parse(stream, handler, metadata, context); - } finally { - stream.close(); - } - - assertEquals("true", metadata.get("pdf:encrypted")); - assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR)); - assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); - assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); - - String content = handler.toString(); - assertContains("RETHINKING THE FINANCIAL NETWORK", content); - assertContains("On 16 November 2002", content); - assertContains("In many important respects", content); - - - // Try again with an explicit empty password - handler = new BodyContentHandler(); - metadata = new Metadata(); - - context = new ParseContext(); - context.set(PasswordProvider.class, new PasswordProvider() { - public String getPassword(Metadata metadata) { - return ""; - } - }); - - stream = PDFParserTest.class.getResourceAsStream( - "/test-documents/testPDF_protected.pdf"); - try { - parser.parse(stream, handler, metadata, context); - } finally { - stream.close(); - } - assertEquals("true", metadata.get("pdf:encrypted")); - - assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); - assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); - - assertContains("RETHINKING THE FINANCIAL NETWORK", content); - assertContains("On 16 November 2002", content); - assertContains("In many important respects", content); + InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf"); + try { + parser.parse(stream, handler, metadata, context); + } finally { + stream.close(); + } + + assertEquals("true", metadata.get("pdf:encrypted")); + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); + assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); + + String content = handler.toString(); + assertContains("RETHINKING THE FINANCIAL NETWORK", content); + assertContains("On 16 November 2002", content); + assertContains("In many important respects", content); + + + // Try again with an explicit empty password + handler = new BodyContentHandler(); + metadata = new Metadata(); + + context = new ParseContext(); + context.set(PasswordProvider.class, new PasswordProvider() { + public String getPassword(Metadata metadata) { + return ""; + } + }); + + stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf"); + try { + parser.parse(stream, handler, metadata, context); + } finally { + stream.close(); + } + assertEquals("true", metadata.get("pdf:encrypted")); + + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); + assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); + + assertContains("RETHINKING THE FINANCIAL NETWORK", content); + assertContains("On 16 November 2002", content); + assertContains("In many important respects", content); //now test wrong password handler = new BodyContentHandler(); @@ -292,7 +308,7 @@ public class PDFParserTest extends TikaT InputStream stream = PDFParserTest.class.getResourceAsStream( "/test-documents/testPDFTwoTextBoxes.pdf"); String content = getText(stream, parser); - content = content.replaceAll("\\s+"," "); + content = content.replaceAll("\\s+", " "); assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); } @@ -367,7 +383,7 @@ public class PDFParserTest extends TikaT Parser parser = new AutoDetectParser(); // Should auto-detect! InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); String content = getText(stream, parser); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); assertContains("Here is some text", content); assertContains("Here is a comment", content); @@ -376,7 +392,7 @@ public class PDFParserTest extends TikaT pdfParser.getPDFParserConfig().setExtractAnnotationText(false); stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); content = getText(stream, pdfParser); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); assertContains("Here is some text", content); assertEquals(-1, content.indexOf("Here is a comment")); @@ -387,15 +403,15 @@ public class PDFParserTest extends TikaT context.set(PDFParserConfig.class, config); stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); content = getText(stream, parser, context); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); assertContains("Here is some text", content); assertEquals(-1, content.indexOf("Here is a comment")); - - + + // TIKA-738: make sure no extra </p> tags String xml = getXML("testAnnotations.pdf").xml; assertEquals(substringCount("<p>", xml), - substringCount("</p>", xml)); + substringCount("</p>", xml)); } // TIKA-981 @@ -415,35 +431,20 @@ public class PDFParserTest extends TikaT assertContains("PDF2", xml); } - private static int substringCount(String needle, String haystack) { - int upto = -1; - int count = 0; - while(true) { - final int next = haystack.indexOf(needle, upto); - if (next == -1) { - break; - } - count++; - upto = next+1; - } - - return count; - } - @Test public void testPageNumber() throws Exception { final XMLResult result = getXML("testPageNumber.pdf"); - final String content = result.xml.replaceAll("\\s+",""); + final String content = result.xml.replaceAll("\\s+", ""); assertContains("<p>1</p>", content); } /** * Test to ensure that Links are extracted from the text - * + * <p/> * Note - the PDF contains the text "This is a hyperlink" which - * a hyperlink annotation, linking to the tika site, on it. This - * test will need updating when we're able to apply the annotation - * to the text itself, rather than following on afterwards as now + * a hyperlink annotation, linking to the tika site, on it. This + * test will need updating when we're able to apply the annotation + * to the text itself, rather than following on afterwards as now */ @Test public void testLinks() throws Exception { @@ -457,19 +458,19 @@ public class PDFParserTest extends TikaT parser.getPDFParserConfig().setEnableAutoSpace(false); InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); String content = getText(stream, parser); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); // Text is correct when autoSpace is off: assertContains("Here is some formatted text", content); parser.getPDFParserConfig().setEnableAutoSpace(true); stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); content = getText(stream, parser); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); // Text is correct when autoSpace is off: // Text has extra spaces when autoSpace is on assertEquals(-1, content.indexOf("Here is some formatted text")); - + //now try with autodetect Parser autoParser = new AutoDetectParser(); ParseContext context = new ParseContext(); @@ -478,18 +479,18 @@ public class PDFParserTest extends TikaT //default is true stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); content = getText(stream, autoParser, context); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); // Text has extra spaces when autoSpace is on assertEquals(-1, content.indexOf("Here is some formatted text")); config.setEnableAutoSpace(false); - + stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); content = getText(stream, parser, context); - content = content.replaceAll("[\\s\u00a0]+"," "); + content = content.replaceAll("[\\s\u00a0]+", " "); // Text is correct when autoSpace is off: assertContains("Here is some formatted text", content); - + } @Test @@ -505,7 +506,7 @@ public class PDFParserTest extends TikaT content = getText(stream, parser); // "Text the first" was dedup'd: assertContains("Text the first timesecond time", content); - + //now try with autodetect Parser autoParser = new AutoDetectParser(); ParseContext context = new ParseContext(); @@ -540,7 +541,7 @@ public class PDFParserTest extends TikaT content = content.replaceAll("\\s+", " "); // Column text is now interleaved: assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content); - + //now try setting autodetect via parsecontext AutoDetectParser autoParser = new AutoDetectParser(); ParseContext context = new ParseContext(); @@ -551,7 +552,7 @@ public class PDFParserTest extends TikaT content = getText(stream, autoParser, context); content = content.replaceAll("\\s+", " "); assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); - + config.setSortByPosition(true); context.set(PDFParserConfig.class, config); stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); @@ -559,7 +560,7 @@ public class PDFParserTest extends TikaT content = content.replaceAll("\\s+", " "); // Column text is now interleaved: assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content); - + } // TIKA-1035 @@ -572,7 +573,7 @@ public class PDFParserTest extends TikaT assertTrue(j != -1); assertTrue(i < j); } - + //TIKA-1124 @Test public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception { @@ -580,57 +581,57 @@ public class PDFParserTest extends TikaT docx/ pdf/ docx - */ - Parser parser = new AutoDetectParser(); // Should auto-detect! - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - String content = ""; - InputStream stream = null; - try { - context.set(org.apache.tika.parser.Parser.class, parser); - stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"); - parser.parse(stream, handler, metadata, context); - content = handler.toString(); - } finally { - stream.close(); - } - int outerHaystack = content.indexOf("Outer_haystack"); - int pdfHaystack = content.indexOf("pdf_haystack"); - int needle = content.indexOf("Needle"); - assertTrue(outerHaystack > -1); - assertTrue(pdfHaystack > -1); - assertTrue(needle > -1); - assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack); - - TrackingHandler tracker = new TrackingHandler(); - TikaInputStream tis; - ContainerExtractor ex = new ParserContainerExtractor(); - try{ - tis= TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")); - ex.extract(tis, ex, tracker); - } finally { - stream.close(); - } - assertEquals(true, ex.isSupported(tis)); - assertEquals(3, tracker.filenames.size()); - assertEquals(3, tracker.mediaTypes.size()); - assertEquals("image1.emf", tracker.filenames.get(0)); - assertNull(tracker.filenames.get(1)); - assertEquals("Test.docx", tracker.filenames.get(2)); - assertEquals(TYPE_EMF, tracker.mediaTypes.get(0)); - assertEquals(TYPE_PDF, tracker.mediaTypes.get(1)); - assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2)); - } + */ + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + String content = ""; + InputStream stream = null; + try { + context.set(org.apache.tika.parser.Parser.class, parser); + stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"); + parser.parse(stream, handler, metadata, context); + content = handler.toString(); + } finally { + stream.close(); + } + int outerHaystack = content.indexOf("Outer_haystack"); + int pdfHaystack = content.indexOf("pdf_haystack"); + int needle = content.indexOf("Needle"); + assertTrue(outerHaystack > -1); + assertTrue(pdfHaystack > -1); + assertTrue(needle > -1); + assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack); + + TrackingHandler tracker = new TrackingHandler(); + TikaInputStream tis; + ContainerExtractor ex = new ParserContainerExtractor(); + try { + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")); + ex.extract(tis, ex, tracker); + } finally { + stream.close(); + } + assertEquals(true, ex.isSupported(tis)); + assertEquals(3, tracker.filenames.size()); + assertEquals(3, tracker.mediaTypes.size()); + assertEquals("image1.emf", tracker.filenames.get(0)); + assertNull(tracker.filenames.get(1)); + assertEquals("Test.docx", tracker.filenames.get(2)); + assertEquals(TYPE_EMF, tracker.mediaTypes.get(0)); + assertEquals(TYPE_PDF, tracker.mediaTypes.get(1)); + assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2)); + } /** * tests for equality between traditional sequential parser * and newer nonsequential parser. - * + * <p/> * TODO: more testing */ @Test - public void testSequentialParser() throws Exception{ + public void testSequentialParser() throws Exception { Parser sequentialParser = new AutoDetectParser(); Parser nonSequentialParser = new AutoDetectParser(); @@ -659,14 +660,14 @@ public class PDFParserTest extends TikaT Set<String> knownContentDiffs = new HashSet<String>(); for (File f : testDocs.listFiles()) { - if (! f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) { + if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) { continue; } String sequentialContent = null; Metadata sequentialMetadata = new Metadata(); try { - sequentialContent = getText(new FileInputStream(f), + sequentialContent = getText(new FileInputStream(f), sequentialParser, seqContext, sequentialMetadata); } catch (EncryptedDocumentException e) { //silently skip a file that requires a user password @@ -680,8 +681,8 @@ public class PDFParserTest extends TikaT String nonSequentialContent = null; Metadata nonSequentialMetadata = new Metadata(); try { - nonSequentialContent = getText(new FileInputStream(f), - nonSequentialParser, nonSeqContext, nonSequentialMetadata); + nonSequentialContent = getText(new FileInputStream(f), + nonSequentialParser, nonSeqContext, nonSequentialMetadata); } catch (Exception e) { throw new TikaException("Non-Sequential Parser failed on test file " + f, e); } @@ -782,7 +783,7 @@ public class PDFParserTest extends TikaT context.set(org.apache.tika.parser.Parser.class, p); try { - tis= TikaInputStream.get( + tis = TikaInputStream.get( getResourceAsStream("/test-documents/testPDF_childAttachments.pdf")); p.parse(tis, new BodyContentHandler(-1), new Metadata(), context); } finally { @@ -821,13 +822,13 @@ public class PDFParserTest extends TikaT Metadata m = new Metadata(); ParseContext c = new ParseContext(); ContentHandler h = new EventCountingHandler(); - p.parse(is, h, m, c); - assertEquals(1, ((EventCountingHandler)h).getEndDocument()); + p.parse(is, h, m, c); + assertEquals(1, ((EventCountingHandler) h).getEndDocument()); } @Test public void testVersions() throws Exception { - + Map<String, String> dcFormat = new HashMap<String, String>(); dcFormat.put("4.x", "application/pdf; version=1.3"); dcFormat.put("5.x", "application/pdf; version=1.4"); @@ -847,7 +848,7 @@ public class PDFParserTest extends TikaT pdfVersions.put("9.x", "1.7"); pdfVersions.put("10.x", "1.7"); pdfVersions.put("11.x.PDFA-1b", "1.7"); - + Map<String, String> pdfExtensionVersions = new HashMap<String, String>(); pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3"); pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8"); @@ -855,9 +856,9 @@ public class PDFParserTest extends TikaT Parser p = new AutoDetectParser(); for (Map.Entry<String, String> e : dcFormat.entrySet()) { - String fName = "testPDF_Version."+e.getKey()+".pdf"; + String fName = "testPDF_Version." + e.getKey() + ".pdf"; InputStream is = PDFParserTest.class.getResourceAsStream( - "/test-documents/"+fName); + "/test-documents/" + fName); Metadata m = new Metadata(); ContentHandler h = new BodyContentHandler(); ParseContext c = new ParseContext(); @@ -873,8 +874,8 @@ public class PDFParserTest extends TikaT assertTrue("dc:format ::" + e.getValue(), foundDC); String extensionVersionTruth = pdfExtensionVersions.get(e.getKey()); if (extensionVersionTruth != null) { - assertEquals("pdf:PDFExtensionVersion :: "+extensionVersionTruth, - extensionVersionTruth, + assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth, + extensionVersionTruth, m.get("pdf:PDFExtensionVersion")); } assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()), @@ -883,7 +884,7 @@ public class PDFParserTest extends TikaT //now test full 11.x String fName = "testPDF_Version.11.x.PDFA-1b.pdf"; InputStream is = PDFParserTest.class.getResourceAsStream( - "/test-documents/"+fName); + "/test-documents/" + fName); Metadata m = new Metadata(); ParseContext c = new ParseContext(); ContentHandler h = new BodyContentHandler(); @@ -893,14 +894,14 @@ public class PDFParserTest extends TikaT for (String fmt : m.getValues("dc:format")) { versions.add(fmt); } - - for (String hit : new String[]{ "application/pdf; version=1.7", - "application/pdf; version=\"A-1b\"", - "application/pdf; version=\"1.7 Adobe Extension Level 8\"" + + for (String hit : new String[]{"application/pdf; version=1.7", + "application/pdf; version=\"A-1b\"", + "application/pdf; version=\"1.7 Adobe Extension Level 8\"" }) { assertTrue(hit, versions.contains(hit)); } - + assertEquals("pdfaid:conformance", m.get("pdfaid:conformance"), "B"); assertEquals("pdfaid:part", m.get("pdfaid:part"), "1"); } @@ -909,15 +910,15 @@ public class PDFParserTest extends TikaT public void testMultipleAuthors() throws Exception { String fName = "testPDF_twoAuthors.pdf"; InputStream is = PDFParserTest.class.getResourceAsStream( - "/test-documents/"+fName); + "/test-documents/" + fName); Parser p = new AutoDetectParser(); Metadata m = new Metadata(); ParseContext c = new ParseContext(); ContentHandler h = new BodyContentHandler(); p.parse(is, h, m, c); is.close(); - - String[] keys = new String[] { + + String[] keys = new String[]{ "dc:creator", "meta:author", "creator", @@ -926,7 +927,7 @@ public class PDFParserTest extends TikaT for (String k : keys) { String[] vals = m.getValues(k); - assertEquals("number of authors == 2 for key: "+ k, 2, vals.length); + assertEquals("number of authors == 2 for key: " + k, 2, vals.length); Set<String> set = new HashSet<String>(); set.add(vals[0]); set.add(vals[1]); @@ -955,7 +956,7 @@ public class PDFParserTest extends TikaT @Test public void testInlineSelector() throws Exception { - + PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); @@ -980,9 +981,9 @@ public class PDFParserTest extends TikaT for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { - if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){ + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { inline++; - } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){ + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { attach++; } } @@ -1007,9 +1008,9 @@ public class PDFParserTest extends TikaT for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { - if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){ + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { inline++; - } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){ + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { attach++; } } @@ -1022,7 +1023,7 @@ public class PDFParserTest extends TikaT @Test public void testInlineConfig() throws Exception { - + Parser defaultParser = new AutoDetectParser(); RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); @@ -1041,9 +1042,9 @@ public class PDFParserTest extends TikaT for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { - if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){ + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { inline++; - } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){ + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { attach++; } } @@ -1071,9 +1072,9 @@ public class PDFParserTest extends TikaT for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { - if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){ + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { inline++; - } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){ + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { attach++; } } @@ -1125,7 +1126,7 @@ public class PDFParserTest extends TikaT assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY)); assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); - + } @Test //TIKA-1427 @@ -1167,7 +1168,7 @@ public class PDFParserTest extends TikaT @Test public void testLegacyAccessChecking() throws Exception { //test that default behavior doesn't throw AccessPermissionException - for (String file : new String[] { + for (String file : new String[]{ "testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf", }) { @@ -1187,13 +1188,13 @@ public class PDFParserTest extends TikaT context.set(PasswordProvider.class, provider); Parser parser = new AutoDetectParser(); - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf", }) { InputStream stream = null; try { - stream = TikaInputStream.get(this.getClass().getResource("/test-documents/"+path)); + stream = TikaInputStream.get(this.getClass().getResource("/test-documents/" + path)); String text = getText(stream, parser, context); assertContains("Hello World", text); } finally { @@ -1213,11 +1214,11 @@ public class PDFParserTest extends TikaT context.set(PDFParserConfig.class, config); //test exception for empty password - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf", }) { - assertException("/test-documents/"+path, parser, context, AccessPermissionException.class); + assertException("/test-documents/" + path, parser, context, AccessPermissionException.class); } config.setAccessChecker(new AccessChecker(true)); @@ -1226,7 +1227,7 @@ public class PDFParserTest extends TikaT InputStream is = null; try { - is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf"); + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_empty.pdf"); assertContains("Hello World", getText(is, parser, context)); } finally { IOUtils.closeQuietly(is); @@ -1253,41 +1254,41 @@ public class PDFParserTest extends TikaT Parser parser = new AutoDetectParser(); //test bad passwords - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf", }) { - assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class); + assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class); } //bad password is still a bad password config.setAccessChecker(new AccessChecker(true)); - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf", }) { - assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class); + assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class); } //now test documents that require this "user" password - assertException("/test-documents/"+"testPDF_no_extract_no_accessibility_owner_user.pdf", + assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_user.pdf", parser, context, AccessPermissionException.class); InputStream is = null; try { - is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_user.pdf"); + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf"); assertContains("Hello World", getText(is, parser, context)); } finally { IOUtils.closeQuietly(is); } config.setAccessChecker(new AccessChecker(false)); - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf", }) { - assertException("/test-documents/"+path, parser, context, AccessPermissionException.class); + assertException("/test-documents/" + path, parser, context, AccessPermissionException.class); } } @@ -1310,7 +1311,7 @@ public class PDFParserTest extends TikaT Parser parser = new AutoDetectParser(); //with owner's password, text can be extracted, no matter the AccessibilityChecker's settings - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf", "testPDF_no_extract_no_accessibility_owner_empty.pdf", @@ -1328,7 +1329,7 @@ public class PDFParserTest extends TikaT //really, with owner's password, all extraction is allowed config.setAccessChecker(new AccessChecker(false)); - for (String path : new String[] { + for (String path : new String[]{ "testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf", "testPDF_no_extract_no_accessibility_owner_empty.pdf", @@ -1352,26 +1353,25 @@ public class PDFParserTest extends TikaT String text = getText(is, parser, context); noEx = true; } catch (Exception e) { - assertEquals("Not the right exception: "+path, expected, e.getClass()); + assertEquals("Not the right exception: " + path, expected, e.getClass()); } finally { IOUtils.closeQuietly(is); } assertFalse(path + " should have thrown exception", noEx); } + /** - * * Simple class to count end of document events. If functionality is useful, * move to org.apache.tika in src/test - * */ private class EventCountingHandler extends ContentHandlerDecorator { private int endDocument = 0; - + @Override public void endDocument() { endDocument++; } - + public int getEndDocument() { return endDocument; } @@ -1382,7 +1382,7 @@ public class PDFParserTest extends TikaT @Override public boolean select(Metadata metadata) { String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); - if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){ + if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { return false; } return true;
