This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4354 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 632d2a54089a28c5cd328384a8fc6003e9c6b16d Author: tallison <talli...@apache.org> AuthorDate: Tue Nov 19 09:39:45 2024 -0500 TIKA-4354 -- make incremental update metadata and parsing default in tika-cli --- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java | 9 ++++++--- .../src/test/java/org/apache/tika/cli/TikaCLITest.java | 8 ++++++++ .../resources/test-data/testPDF_incrementalUpdates.pdf | Bin 0 -> 64872 bytes 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index f59d88743..ec7172562 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -340,9 +340,12 @@ public class TikaCLI { if (configFilePath == null && context.get(PDFParserConfig.class) == null) { PDFParserConfig pdfParserConfig = new PDFParserConfig(); pdfParserConfig.setExtractInlineImages(true); + pdfParserConfig.setExtractIncrementalUpdateInfo(true); pdfParserConfig.setParseIncrementalUpdates(true); - String warn = "As a convenience, TikaCLI has turned on extraction of\n" + "inline images and incremental updates for the PDFParser (TIKA-2374 and " + "TIKA-4017).\n" + - "Aside from the -z option, this is not the default behavior\n" + "in Tika generally or in tika-server."; + String warn = "As a convenience, TikaCLI has turned on extraction of\n" + + "inline images and incremental updates for the PDFParser (TIKA-2374, " + + "TIKA-4017 and TIKA-4354).\n" + + "This is not the default behavior in Tika generally or in tika-server."; LOG.info(warn); context.set(PDFParserConfig.class, pdfParserConfig); } @@ -446,7 +449,6 @@ public class TikaCLI { } extractDir = new File(dirPath); } else if (arg.equals("-z") || arg.equals("--extract")) { - configurePDFExtractSettings(); type = NO_OUTPUT; context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor()); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { @@ -475,6 +477,7 @@ public class TikaCLI { } else { url = new URL(arg); } + configurePDFExtractSettings(); if (recursiveJSON) { handleRecursiveJson(url, System.out); } else { diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index ec6e7df1a..e318ff80b 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -241,6 +241,14 @@ public class TikaCLITest { assertTrue(fb > -1 && title > -1 && fb > title); } + @Test + public void testDefaultPDFIncrementalUpdateSettings() throws Exception { + String json = getParamOutContent("-J", + resourcePrefix + "testPDF_incrementalUpdates.pdf"); + assertTrue(json.contains("pdf:incrementalUpdateCount\":\"2\"")); + assertTrue(json.contains("embeddedResourceType\":\"VERSION\"")); + } + /** * Tests -l option of the cli * diff --git a/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf b/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf new file mode 100644 index 000000000..8494cc839 Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDF_incrementalUpdates.pdf differ