This is an automated email from the ASF dual-hosted git repository. acosentino pushed a commit to branch CAMEL-23212 in repository https://gitbox.apache.org/repos/asf/camel.git
commit 62217e291915c96049774104c32a19d6c1e3d759 Author: Andrea Cosentino <[email protected]> AuthorDate: Wed Mar 18 12:14:52 2026 +0100 CAMEL-23212: Camel-Docling: Harden CLI argument injection validation in camel-docling Switch from blocklist to allowlist approach for custom CLI argument validation in DoclingProducer. Define a set of ~55 recognized docling CLI flags organized by category (format, pipeline, OCR, tables, PDF, enrichment, output, advanced, debug, performance, info). Any flag not in the allowlist is rejected with a clear error message. Add defense-in-depth rejection of shell metacharacters (;, |, `, $()) in all custom argument values, even though ProcessBuilder uses list-based invocation and does not interpret them. Enhance path traversal detection by normalizing path-like values via Path.normalize() to catch traversal sequences that bypass the literal ../ check (e.g., /safe/subdir/../../etc/passwd). Add 8 new unit tests covering semicolon, pipe, backtick, and $() injection attempts, unknown long/short flag rejection, -vv verbosity acceptance, and normalized path traversal detection. Document all allowed custom arguments in docling-component.adoc. Signed-off-by: Andrea Cosentino <[email protected]> --- .../src/main/docs/docling-component.adoc | 53 ++++++++ .../camel/component/docling/DoclingProducer.java | 151 ++++++++++++++++++--- .../docling/DoclingCustomArgsValidationTest.java | 122 ++++++++++++++++- 3 files changed, 307 insertions(+), 19 deletions(-) diff --git a/components/camel-ai/camel-docling/src/main/docs/docling-component.adoc b/components/camel-ai/camel-docling/src/main/docs/docling-component.adoc index 3459720929be..68595ddcdac4 100644 --- a/components/camel-ai/camel-docling/src/main/docs/docling-component.adoc +++ b/components/camel-ai/camel-docling/src/main/docs/docling-component.adoc @@ -381,6 +381,59 @@ YAML:: ---- ==== +=== Custom argument validation + +When passing custom CLI arguments via the `CamelDoclingCustomArguments` header, the component enforces an **allowlist** of recognized docling CLI flags. +Only the following flags are permitted: + +[width="100%",cols="2,4",options="header"] +|=== +| Category | Allowed flags + +| Input/output format +| `--from`, `--to` + +| Pipeline +| `--pipeline`, `--vlm-model`, `--asr-model` + +| OCR +| `--ocr`, `--no-ocr`, `--force-ocr`, `--no-force-ocr`, `--ocr-engine`, `--ocr-lang`, `--psm` + +| Tables +| `--tables`, `--no-tables`, `--table-mode` + +| PDF +| `--pdf-backend`, `--pdf-password` + +| Enrichment +| `--enrich-code`, `--no-enrich-code`, `--enrich-formula`, `--no-enrich-formula`, `--enrich-picture-classes`, `--no-enrich-picture-classes`, `--enrich-picture-description`, `--no-enrich-picture-description`, `--enrich-chart-extraction`, `--no-enrich-chart-extraction` + +| Output formatting +| `--image-export-mode`, `--show-layout`, `--no-show-layout` + +| Advanced +| `--headers`, `--artifacts-path`, `--enable-remote-services`, `--no-enable-remote-services`, `--allow-external-plugins`, `--no-allow-external-plugins`, `--show-external-plugins`, `--no-show-external-plugins`, `--document-timeout`, `--device`, `--num-threads`, `--page-batch-size` + +| Debug +| `--verbose`, `-v` / `-vv` / `-vvv`, `--debug-visualize-cells`, `--no-debug-visualize-cells`, `--debug-visualize-ocr`, `--no-debug-visualize-ocr`, `--debug-visualize-layout`, `--no-debug-visualize-layout`, `--debug-visualize-tables`, `--no-debug-visualize-tables` + +| Performance +| `--abort-on-error`, `--no-abort-on-error`, `--profiling`, `--no-profiling`, `--save-profiling`, `--no-save-profiling` + +| Info +| `--version`, `--help`, `--logo` + +|=== + +The `--output` (`-o`) flag is **not permitted** because the output directory is managed by the producer. +Use the `CamelDoclingOutputFilePath` header or endpoint configuration instead. + +Additionally, the following are rejected: + +- **Shell metacharacters**: `;`, `|`, `` ` ``, `$()` — blocked as defense-in-depth even though ProcessBuilder does not interpret them. +- **Path traversal**: `../`, `..\`, and paths that resolve to traversal after normalization. +- **Unknown flags**: any flag not in the allowlist above. + === Extracting document metadata [tabs] diff --git a/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java b/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java index 17bfc05e851b..437fa8f540cb 100644 --- a/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java +++ b/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java @@ -30,6 +30,7 @@ import java.util.Base64; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionStage; import java.util.concurrent.ConcurrentHashMap; @@ -86,6 +87,56 @@ public class DoclingProducer extends DefaultProducer { private static final Logger LOG = LoggerFactory.getLogger(DoclingProducer.class); + /** + * Recognized docling CLI flags. Only these flags are permitted in custom arguments (allowlist approach). Flags + * managed by the producer ({@code --output}, {@code -o}) are excluded and checked separately. + */ + private static final Set<String> ALLOWED_DOCLING_FLAGS = Set.of( + // Input/output format + "--from", "--to", + // Pipeline + "--pipeline", "--vlm-model", "--asr-model", + // OCR + "--ocr", "--no-ocr", "--force-ocr", "--no-force-ocr", + "--ocr-engine", "--ocr-lang", "--psm", + // Tables + "--tables", "--no-tables", "--table-mode", + // PDF + "--pdf-backend", "--pdf-password", + // Enrichment + "--enrich-code", "--no-enrich-code", + "--enrich-formula", "--no-enrich-formula", + "--enrich-picture-classes", "--no-enrich-picture-classes", + "--enrich-picture-description", "--no-enrich-picture-description", + "--enrich-chart-extraction", "--no-enrich-chart-extraction", + // Output formatting + "--image-export-mode", + "--show-layout", "--no-show-layout", + // Advanced + "--headers", "--artifacts-path", + "--enable-remote-services", "--no-enable-remote-services", + "--allow-external-plugins", "--no-allow-external-plugins", + "--show-external-plugins", "--no-show-external-plugins", + "--document-timeout", "--device", "--num-threads", "--page-batch-size", + // Debug + "--verbose", + "--debug-visualize-cells", "--no-debug-visualize-cells", + "--debug-visualize-ocr", "--no-debug-visualize-ocr", + "--debug-visualize-layout", "--no-debug-visualize-layout", + "--debug-visualize-tables", "--no-debug-visualize-tables", + // Performance / error handling + "--abort-on-error", "--no-abort-on-error", + "--profiling", "--no-profiling", + "--save-profiling", "--no-save-profiling", + // Info + "--version", "--help", "--logo"); + + /** + * Flags managed by the producer that must not be overridden through custom arguments. The output directory is + * controlled by the producer via endpoint configuration or the {@link DoclingHeaders#OUTPUT_FILE_PATH} header. + */ + private static final Set<String> PRODUCER_MANAGED_FLAGS = Set.of("--output", "-o"); + private DoclingConfiguration configuration; private DoclingServeApi doclingServeApi; private ObjectMapper objectMapper; @@ -1876,14 +1927,10 @@ public class DoclingProducer extends DefaultProducer { } /** - * Validates custom CLI arguments to ensure they do not conflict with producer-managed options such as the output - * directory. + * Validates custom CLI arguments using an allowlist approach. Only recognized docling CLI flags are permitted. + * Producer-managed flags, shell metacharacters, and path traversal sequences are rejected. */ private void validateCustomArguments(List<String> customArgs) { - // The output directory is managed by the producer via endpoint configuration - // or the OUTPUT_FILE_PATH header, so it must not be overridden through custom arguments. - List<String> blockedFlags = List.of("--output", "-o"); - for (int i = 0; i < customArgs.size(); i++) { String arg = customArgs.get(i); @@ -1891,20 +1938,88 @@ public class DoclingProducer extends DefaultProducer { throw new IllegalArgumentException("Custom argument at index " + i + " is null"); } - String argLower = arg.toLowerCase(); - for (String blocked : blockedFlags) { - if (argLower.equals(blocked) || argLower.startsWith(blocked + "=")) { - throw new IllegalArgumentException( - "Custom argument '" + blocked - + "' is not allowed because the output directory is managed by the producer. " - + "Use the " + DoclingHeaders.OUTPUT_FILE_PATH - + " header or endpoint configuration instead."); - } + rejectShellMetacharacters(arg, i); + + if (arg.startsWith("--")) { + validateLongFlag(arg, i); + } else if (arg.startsWith("-")) { + validateShortFlag(arg, i); + } else { + validatePathSafety(arg, i); } + } + } - if (arg.contains("../") || arg.contains("..\\")) { - throw new IllegalArgumentException( - "Custom argument at index " + i + " contains a relative path traversal sequence"); + private void validateLongFlag(String arg, int index) { + String flag = arg.contains("=") ? arg.substring(0, arg.indexOf('=')) : arg; + String flagLower = flag.toLowerCase(); + + if (PRODUCER_MANAGED_FLAGS.contains(flagLower)) { + throw new IllegalArgumentException( + "Custom argument '" + flag + + "' is not allowed because the output directory is managed by the producer. " + + "Use the " + DoclingHeaders.OUTPUT_FILE_PATH + + " header or endpoint configuration instead."); + } + + if (!ALLOWED_DOCLING_FLAGS.contains(flagLower)) { + throw new IllegalArgumentException( + "Custom argument '" + flag + + "' is not a recognized docling CLI flag. " + + "Only known docling flags are permitted as custom arguments."); + } + + if (arg.contains("=")) { + String value = arg.substring(arg.indexOf('=') + 1); + validatePathSafety(value, index); + } + } + + private void validateShortFlag(String arg, int index) { + String flagLower = arg.toLowerCase(); + + // Allow -v, -vv, -vvv (verbosity levels) + if (flagLower.matches("-v+")) { + return; + } + + if (PRODUCER_MANAGED_FLAGS.contains(flagLower)) { + throw new IllegalArgumentException( + "Custom argument '" + arg + + "' is not allowed because the output directory is managed by the producer. " + + "Use the " + DoclingHeaders.OUTPUT_FILE_PATH + + " header or endpoint configuration instead."); + } + + throw new IllegalArgumentException( + "Custom argument '" + arg + + "' is not a recognized docling CLI flag. " + + "Only known docling flags are permitted as custom arguments."); + } + + private static void rejectShellMetacharacters(String arg, int index) { + if (arg.contains(";") || arg.contains("|") || arg.contains("`") || arg.contains("$(")) { + throw new IllegalArgumentException( + "Custom argument at index " + index + + " contains a disallowed character or pattern. " + + "Shell metacharacters (;, |, `, $()) are not permitted."); + } + } + + private static void validatePathSafety(String value, int index) { + if (value.contains("../") || value.contains("..\\")) { + throw new IllegalArgumentException( + "Custom argument at index " + index + " contains a relative path traversal sequence"); + } + // Normalize path-like values to detect traversal via redundant separators + if (value.contains("/") || value.contains("\\")) { + Path normalized = Paths.get(value).normalize(); + for (Path component : normalized) { + if ("..".equals(component.toString())) { + throw new IllegalArgumentException( + "Custom argument at index " + index + + " resolves to a path containing traversal after normalization"); + } } } } diff --git a/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/DoclingCustomArgsValidationTest.java b/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/DoclingCustomArgsValidationTest.java index 3699b5fc451f..d541d5155b71 100644 --- a/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/DoclingCustomArgsValidationTest.java +++ b/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/DoclingCustomArgsValidationTest.java @@ -89,7 +89,7 @@ class DoclingCustomArgsValidationTest extends CamelTestSupport { CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { template.requestBodyAndHeaders("direct:cli-convert", inputFile.toString(), - java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--some-flag", "../../etc/passwd"))); + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--artifacts-path", "../../etc/passwd"))); }); assertInstanceOf(IllegalArgumentException.class, ex.getCause()); @@ -145,6 +145,126 @@ class DoclingCustomArgsValidationTest extends CamelTestSupport { "No custom arguments should not trigger argument validation"); } + // -- Shell metacharacter injection tests -- + + @Test + void customArgsWithSemicolonAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--verbose", "; rm -rf /"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("disallowed")); + } + + @Test + void customArgsWithPipeAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--verbose", "| cat /etc/passwd"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("disallowed")); + } + + @Test + void customArgsWithBacktickAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--verbose", "`whoami`"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("disallowed")); + } + + @Test + void customArgsWithCommandSubstitutionAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--verbose", "$(id)"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("disallowed")); + } + + // -- Allowlist enforcement tests -- + + @Test + void customArgsWithUnknownFlagAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("--unknown-flag"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("not a recognized docling CLI flag")); + } + + @Test + void customArgsWithUnknownShortFlagAreRejected() throws Exception { + Path inputFile = createInputFile(); + + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("-x"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("not a recognized docling CLI flag")); + } + + @Test + void customArgsWithVerbosityLevelsAreAccepted() throws Exception { + Path inputFile = createInputFile(); + + // -v and -vv should pass validation (verbosity levels) + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, List.of("-vv"))); + }); + + assertFalse(ex.getCause() instanceof IllegalArgumentException, + "-vv should pass validation; failure should come from process execution"); + } + + @Test + void customArgsWithNormalizedPathTraversalAreRejected() throws Exception { + Path inputFile = createInputFile(); + + // Path traversal that would be caught only after normalization + CamelExecutionException ex = assertThrows(CamelExecutionException.class, () -> { + template.requestBodyAndHeaders("direct:cli-convert", + inputFile.toString(), + java.util.Map.of(DoclingHeaders.CUSTOM_ARGUMENTS, + List.of("--artifacts-path", "/safe/path/subdir/../../etc/passwd"))); + }); + + assertInstanceOf(IllegalArgumentException.class, ex.getCause()); + assertTrue(ex.getCause().getMessage().contains("path traversal") || + ex.getCause().getMessage().contains("traversal after normalization")); + } + private Path createInputFile() throws Exception { Path file = tempDir.resolve("test-input.txt"); Files.writeString(file, "test content");
