This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2deadf4c4d3d396d4d9f3cc5cee6ed3cb0bce868 Author: tballison <[email protected]> AuthorDate: Mon Jul 3 08:37:04 2017 -0400 TIKA-2374 -- tika-app cli should extract inline images by default --- CHANGES.txt | 4 ++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 11 ++++++ .../test/java/org/apache/tika/cli/TikaCLITest.java | 42 ++++++++++++++++++--- .../test-data/testPDF_childAttachments.pdf | Bin 0 -> 2318262 bytes 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 9c22fe7..2c92fa4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,9 @@ Release 1.16 - ??/??/???? + * tika-app now extracts inline images in PDFs by + default, and it includes a warning to users that this is not the + default behavior elsewhere in Tika (TIKA-2374). + * Allow configurability of warnings for problems during parser initialization (TIKA-2389). diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 1c9f9ab..88be988 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -98,6 +98,7 @@ import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.utils.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; @@ -185,6 +186,16 @@ public class TikaCLI { p = new ForkParser(TikaCLI.class.getClassLoader(), p); } ContentHandler handler = getContentHandler(output, metadata); + if (config == null && context.get(PDFParserConfig.class) == null) { + PDFParserConfig pdfParserConfig = new PDFParserConfig(); + pdfParserConfig.setExtractInlineImages(true); + String warn = "As a convenience, TikaCLI has turned on extraction of\n" + + "inline images for the PDFParser (TIKA-2374).\n" + + "This is not the default option in Tika generally or in tika-server."; + LOG.info(warn); + System.err.println(warn); + context.set(PDFParserConfig.class, pdfParserConfig); + } p.parse(input, handler, metadata, context); // fix for TIKA-596: if a parser doesn't generate // XHTML output, the lack of an output document prevents diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 5d0e0b1..0e084f4 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -248,19 +248,19 @@ public class TikaCLITest { File tempFile = File.createTempFile("tika-test-", ""); tempFile.delete(); tempFile.mkdir(); // not really good method for production usage, but ok for tests - // google guava library has better solution + // google guava library has better solution try { String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/coffee.xls"}; - + TikaCLI.main(params); - + StringBuffer allFiles = new StringBuffer(); for (String f : tempFile.list()) { if (allFiles.length() > 0) allFiles.append(" : "); allFiles.append(f); } - + // ChemDraw file File expectedCDX = new File(tempFile, "MBD002B040A.cdx"); // Image of the ChemDraw molecule @@ -271,7 +271,7 @@ public class TikaCLITest { File expected262FE3 = new File(tempFile, "MBD00262FE3.txt"); // Image of one of the embedded resources File expectedEMF = new File(tempFile, "file0.emf"); - + assertExtracted(expectedCDX, allFiles.toString()); assertExtracted(expectedIMG, allFiles.toString()); assertExtracted(expectedOLE10, allFiles.toString()); @@ -325,6 +325,38 @@ public class TikaCLITest { } @Test + public void testExtractInlineImages() throws Exception { + File tempFile = File.createTempFile("tika-test-", ""); + tempFile.delete(); + tempFile.mkdir(); // not really good method for production usage, but ok for tests + // google guava library has better solution + + try { + String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/testPDF_childAttachments.pdf"}; + + TikaCLI.main(params); + + StringBuffer allFiles = new StringBuffer(); + for (String f : tempFile.list()) { + if (allFiles.length() > 0) allFiles.append(" : "); + allFiles.append(f); + } + + File jpeg = new File(tempFile, "image0.jpg"); + //tiff isn't extracted without optional image dependency +// File tiff = new File(tempFile, "image1.tif"); + File jobOptions = new File(tempFile, "Press Quality(1).joboptions"); + File doc = new File(tempFile, "Unit10.doc"); + + assertExtracted(jpeg, allFiles.toString()); + assertExtracted(jobOptions, allFiles.toString()); + assertExtracted(doc, allFiles.toString()); + } finally { + FileUtils.deleteDirectory(tempFile); + } + } + + @Test public void testDefaultConfigException() throws Exception { //default xml parser will throw TikaException //this and TestConfig() are broken into separate tests so that diff --git a/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf b/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf new file mode 100644 index 0000000..7b2158a Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf differ -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
