[tika] 02/02: TIKA-2374 -- tika-app cli should extract inline images by default

tallison Mon, 03 Jul 2017 05:41:02 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 2deadf4c4d3d396d4d9f3cc5cee6ed3cb0bce868
Author: tballison <[email protected]>
AuthorDate: Mon Jul 3 08:37:04 2017 -0400

    TIKA-2374 -- tika-app cli should extract inline images by default
---
 CHANGES.txt                                        |   4 ++
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  11 ++++++
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  42 ++++++++++++++++++---
 .../test-data/testPDF_childAttachments.pdf         | Bin 0 -> 2318262 bytes
 4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9c22fe7..2c92fa4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 Release 1.16 - ??/??/????
 
+  * tika-app now extracts inline images in PDFs by
+    default, and it includes a warning to users that this is not the
+    default behavior elsewhere in Tika (TIKA-2374).
+
   * Allow configurability of warnings for problems during
     parser initialization (TIKA-2389).
 
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 1c9f9ab..88be988 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -98,6 +98,7 @@ import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
@@ -185,6 +186,16 @@ public class TikaCLI {
                 p = new ForkParser(TikaCLI.class.getClassLoader(), p);
             }
             ContentHandler handler = getContentHandler(output, metadata);
+            if (config == null && context.get(PDFParserConfig.class) == null) {
+                PDFParserConfig pdfParserConfig = new PDFParserConfig();
+                pdfParserConfig.setExtractInlineImages(true);
+                String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
+                        "inline images for the PDFParser (TIKA-2374).\n" +
+                        "This is not the default option in Tika generally or in tika-server.";
+                LOG.info(warn);
+                System.err.println(warn);
+                context.set(PDFParserConfig.class, pdfParserConfig);
+            }
             p.parse(input, handler, metadata, context);
             // fix for TIKA-596: if a parser doesn't generate
             // XHTML output, the lack of an output document prevents
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5d0e0b1..0e084f4 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -248,19 +248,19 @@ public class TikaCLITest {
         File tempFile = File.createTempFile("tika-test-", "");
         tempFile.delete();
         tempFile.mkdir(); // not really good method for production usage, but ok for tests
-                          // google guava library has better solution
+        // google guava library has better solution
 
         try {
             String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/coffee.xls"};
-            
+
             TikaCLI.main(params);
-            
+
             StringBuffer allFiles = new StringBuffer();
             for (String f : tempFile.list()) {
                 if (allFiles.length() > 0) allFiles.append(" : ");
                 allFiles.append(f);
             }
-            
+
             // ChemDraw file
             File expectedCDX = new File(tempFile, "MBD002B040A.cdx");
             // Image of the ChemDraw molecule
@@ -271,7 +271,7 @@ public class TikaCLITest {
             File expected262FE3 = new File(tempFile, "MBD00262FE3.txt");
             // Image of one of the embedded resources
             File expectedEMF = new File(tempFile, "file0.emf");
-            
+
             assertExtracted(expectedCDX, allFiles.toString());
             assertExtracted(expectedIMG, allFiles.toString());
             assertExtracted(expectedOLE10, allFiles.toString());
@@ -325,6 +325,38 @@ public class TikaCLITest {
     }
 
     @Test
+    public void testExtractInlineImages() throws Exception {
+        File tempFile = File.createTempFile("tika-test-", "");
+        tempFile.delete();
+        tempFile.mkdir(); // not really good method for production usage, but ok for tests
+        // google guava library has better solution
+
+        try {
+            String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/testPDF_childAttachments.pdf"};
+
+            TikaCLI.main(params);
+
+            StringBuffer allFiles = new StringBuffer();
+            for (String f : tempFile.list()) {
+                if (allFiles.length() > 0) allFiles.append(" : ");
+                allFiles.append(f);
+            }
+
+            File jpeg = new File(tempFile, "image0.jpg");
+            //tiff isn't extracted without optional image dependency
+//            File tiff = new File(tempFile, "image1.tif");
+            File jobOptions = new File(tempFile, "Press Quality(1).joboptions");
+            File doc = new File(tempFile, "Unit10.doc");
+
+            assertExtracted(jpeg, allFiles.toString());
+            assertExtracted(jobOptions, allFiles.toString());
+            assertExtracted(doc, allFiles.toString());
+        } finally {
+            FileUtils.deleteDirectory(tempFile);
+        }
+    }
+
+    @Test
     public void testDefaultConfigException() throws Exception {
         //default xml parser will throw TikaException
         //this and TestConfig() are broken into separate tests so that
diff --git a/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf b/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf
new file mode 100644
index 0000000..7b2158a
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDF_childAttachments.pdf differ

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

[tika] 02/02: TIKA-2374 -- tika-app cli should extract inline images by default

Reply via email to