This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4354
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 632d2a54089a28c5cd328384a8fc6003e9c6b16d
Author: tallison <talli...@apache.org>
AuthorDate: Tue Nov 19 09:39:45 2024 -0500

    TIKA-4354 -- make incremental update metadata and parsing default in tika-cli
---
 tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java |   9 ++++++---
 .../src/test/java/org/apache/tika/cli/TikaCLITest.java  |   8 ++++++++
 .../resources/test-data/testPDF_incrementalUpdates.pdf  | Bin 0 -> 64872 bytes
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index f59d88743..ec7172562 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -340,9 +340,12 @@ public class TikaCLI {
         if (configFilePath == null && context.get(PDFParserConfig.class) == 
null) {
             PDFParserConfig pdfParserConfig = new PDFParserConfig();
             pdfParserConfig.setExtractInlineImages(true);
+            pdfParserConfig.setExtractIncrementalUpdateInfo(true);
             pdfParserConfig.setParseIncrementalUpdates(true);
-            String warn = "As a convenience, TikaCLI has turned on extraction 
of\n" + "inline images and incremental updates for the PDFParser (TIKA-2374 and 
" + "TIKA-4017).\n" +
-                    "Aside from the -z option, this is not the default 
behavior\n" + "in Tika generally or in tika-server.";
+            String warn = "As a convenience, TikaCLI has turned on extraction 
of\n" +
+                    "inline images and incremental updates for the PDFParser 
(TIKA-2374, " +
+                    "TIKA-4017 and TIKA-4354).\n" +
+                    "This is not the default behavior in Tika generally or in 
tika-server.";
             LOG.info(warn);
             context.set(PDFParserConfig.class, pdfParserConfig);
         }
@@ -446,7 +449,6 @@ public class TikaCLI {
             }
             extractDir = new File(dirPath);
         } else if (arg.equals("-z") || arg.equals("--extract")) {
-            configurePDFExtractSettings();
             type = NO_OUTPUT;
             context.set(EmbeddedDocumentExtractor.class, new 
FileEmbeddedDocumentExtractor());
         } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
@@ -475,6 +477,7 @@ public class TikaCLI {
                 } else {
                     url = new URL(arg);
                 }
+                configurePDFExtractSettings();
                 if (recursiveJSON) {
                     handleRecursiveJson(url, System.out);
                 } else {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index ec6e7df1a..e318ff80b 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -241,6 +241,14 @@ public class TikaCLITest {
         assertTrue(fb > -1 && title > -1 && fb > title);
     }
 
+    @Test
+    public void testDefaultPDFIncrementalUpdateSettings() throws Exception {
+        String json = getParamOutContent("-J",
+                resourcePrefix + "testPDF_incrementalUpdates.pdf");
+        assertTrue(json.contains("pdf:incrementalUpdateCount\":\"2\""));
+        assertTrue(json.contains("embeddedResourceType\":\"VERSION\""));
+    }
+
     /**
      * Tests -l option of the cli
      *
diff --git a/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf b/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf
new file mode 100644
index 000000000..8494cc839
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf differ

Reply via email to