This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4645-usability-scripts in repository https://gitbox.apache.org/repos/asf/tika.git
commit 53274892bbd0a23c2832890d19eb02bca59468cb Author: tallison <[email protected]> AuthorDate: Mon Feb 2 06:47:29 2026 -0500 TIKA-4645 - usability scripts --- docs/advanced/integration-testing/tika-server.adoc | 473 +++++++++++++++++++++ .../org/apache/tika/async/cli/PluginsWriter.java | 52 ++- .../apache/tika/server/core/TikaServerProcess.java | 265 ++++++++---- .../server/core/resource/PipesParsingHelper.java | 288 +++++++------ .../org/apache/tika/server/core/CXFTestBase.java | 39 +- 5 files changed, 907 insertions(+), 210 deletions(-) diff --git a/docs/advanced/integration-testing/tika-server.adoc b/docs/advanced/integration-testing/tika-server.adoc new file mode 100644 index 0000000000..85bca5f1fa --- /dev/null +++ b/docs/advanced/integration-testing/tika-server.adoc @@ -0,0 +1,473 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika-Server Integration Testing + +Integration tests for `tika-server` to be run from a distribution ZIP. + +== Setup + +[source,bash] +---- +# Create test directory +mkdir -p /tmp/tika-server-test +cd /tmp/tika-server-test + +# Copy and extract distribution +cp /path/to/tika-server-standard-4.0.0-SNAPSHOT-bin.zip . 
+unzip tika-server-standard-4.0.0-SNAPSHOT-bin.zip + +# Copy test files +cp /path/to/test-documents/testPDF.pdf . +cp /path/to/test-documents/testHTML.html . +cp /path/to/test-documents/test_recursive_embedded.docx . +---- + +== Part 1: Default Mode Tests + +Start server in default mode (config endpoints disabled): + +[source,bash] +---- +java -jar tika-server.jar --port 9998 & +sleep 8 +curl -s http://localhost:9998/version +---- + +=== Test 1: GET /version + +[source,bash] +---- +curl -s http://localhost:9998/version +---- + +*Expected:* `Apache Tika X.X.X` + +=== Test 2: PUT /detect/stream + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/detect/stream +---- + +*Expected:* `application/pdf` + +=== Test 3: PUT /tika/text + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/text +---- + +*Expected:* Plain text content extracted from PDF. + +=== Test 4: PUT /tika/html + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/html +---- + +*Expected:* HTML with metadata in `<meta>` tags and content in `<body>`. + +=== Test 5: PUT /tika/xml + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/xml +---- + +*Expected:* XHTML content (starts with `<html xmlns=...>`). + +=== Test 6: PUT /tika/json + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/json +---- + +*Expected:* JSON object with metadata and X-TIKA:content field. + +=== Test 7: PUT /meta + +[source,bash] +---- +curl -s -X PUT -H "Accept: application/json" -T testPDF.pdf http://localhost:9998/meta +---- + +*Expected:* JSON object with metadata only (no content). 
+ +=== Test 8: PUT /meta/{field} + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/meta/Content-Type +---- + +*Expected:* `Content-Type,application/pdf` + +=== Test 9: PUT /rmeta + +[source,bash] +---- +curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/rmeta +---- + +*Expected:* JSON array with metadata for main document and all embedded documents. + +=== Test 10: PUT /rmeta/text + +[source,bash] +---- +curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/rmeta/text +---- + +*Expected:* JSON array with ToTextContentHandler content. + +=== Test 11: PUT /language/stream + +[source,bash] +---- +curl -s -X PUT -T testPDF.pdf http://localhost:9998/language/stream +---- + +*Expected:* Two-letter language code (e.g., `en`, `th`). + +=== Test 12: PUT /unpack/all + +[source,bash] +---- +curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/unpack/all -o /tmp/unpack.zip +unzip -l /tmp/unpack.zip +---- + +*Expected:* ZIP file containing extracted embedded files plus `__TEXT__` and `__METADATA__` files. + +=== Test 13: GET /parsers + +[source,bash] +---- +curl -s -H "Accept: text/plain" http://localhost:9998/parsers +---- + +*Expected:* Hierarchical list of available parsers. + +=== Test 14: GET /detectors + +[source,bash] +---- +curl -s -H "Accept: text/plain" http://localhost:9998/detectors +---- + +*Expected:* List of available detectors. + +=== Test 15: GET /mime-types + +[source,bash] +---- +curl -s -H "Accept: application/json" http://localhost:9998/mime-types +---- + +*Expected:* JSON object with all known MIME types. + +=== Test 16: POST /meta/form + +[source,bash] +---- +curl -s -X POST -F "upload=@testPDF.pdf" -H "Accept: application/json" http://localhost:9998/meta/form +---- + +*Expected:* JSON metadata from multipart form upload. 
+ +=== Test 17: POST /rmeta/form + +[source,bash] +---- +curl -s -X POST -F "upload=@test_recursive_embedded.docx" http://localhost:9998/rmeta/form +---- + +*Expected:* JSON array with recursive metadata from multipart upload. + +=== Test 18: Config Endpoints Blocked (Default Mode) + +[source,bash] +---- +curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/meta/config +curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/rmeta/config +curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/tika/config +curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/unpack/config +---- + +*Expected:* All return HTTP 403 with message: "Config endpoints are disabled. Set enableUnsecureFeatures=true in server config." + +== Part 2: Tests with enableUnsecureFeatures + +Stop the default server and create a config file: + +[source,bash] +---- +pkill -f "tika-server.jar" + +cat > tika-config-unsecure.json << 'EOF' +{ + "server": { + "port": 9998, + "host": "localhost", + "enableUnsecureFeatures": true + }, + "parsers": [ + {"default-parser": {}} + ], + "plugin-roots": "/tmp/tika-server-test/plugins" +} +EOF + +java -jar tika-server.jar -c tika-config-unsecure.json & +sleep 10 +curl -s http://localhost:9998/version +---- + +=== Test 19: POST /meta/config + +[source,bash] +---- +curl -s -X POST -F "file=@testPDF.pdf" -H "Accept: application/json" http://localhost:9998/meta/config +---- + +*Expected:* JSON metadata. + +=== Test 20: POST /meta/config with custom parser config + +[source,bash] +---- +curl -s -X POST -F "file=@testPDF.pdf" \ + -F 'config={"parsers":[{"pdf-parser":{"ocrStrategy":"NO_OCR"}}]}' \ + -H "Accept: application/json" \ + http://localhost:9998/meta/config +---- + +*Expected:* JSON metadata with custom PDF parser config applied. 
+ +=== Test 21: POST /unpack/config + +[source,bash] +---- +curl -s -X POST -F "file=@test_recursive_embedded.docx" http://localhost:9998/unpack/config -o /tmp/unpack-config.zip +unzip -l /tmp/unpack-config.zip +---- + +*Expected:* ZIP with extracted embedded files. + +=== Test 22: POST /unpack/all/config + +[source,bash] +---- +curl -s -X POST -F "file=@test_recursive_embedded.docx" http://localhost:9998/unpack/all/config -o /tmp/unpack-all.zip +unzip -l /tmp/unpack-all.zip +---- + +*Expected:* ZIP with all recursively extracted files. + +== Server Options + +=== Test 23: Custom Port + +[source,bash] +---- +java -jar tika-server.jar --port 9999 & +sleep 8 +curl -s http://localhost:9999/version +---- + +*Expected:* Server responds on port 9999. + +=== Test 24: Custom Host + +[source,bash] +---- +java -jar tika-server.jar --host 0.0.0.0 --port 9998 & +---- + +*Expected:* Server binds to all interfaces. + +=== Test 25: With Config File + +[source,bash] +---- +java -jar tika-server.jar -c tika-config.json & +---- + +*Expected:* Server uses custom configuration. + +== Headers + +=== Test 26: X-Tika-OCRskipOcr Header + +[source,bash] +---- +curl -s -X PUT -H "X-Tika-OCRskipOcr: true" -T testPDF.pdf http://localhost:9998/tika/text +---- + +*Expected:* Text extraction without OCR. + +=== Test 27: Content-Disposition Filename + +[source,bash] +---- +curl -s -X PUT -H "Content-Disposition: attachment; filename=myfile.pdf" -T testPDF.pdf http://localhost:9998/meta/resourceName +---- + +*Expected:* Returns the filename from Content-Disposition header. + +== Error Handling + +=== Test 28: Non-existent Endpoint + +[source,bash] +---- +curl -s -w "\nHTTP Status: %{http_code}\n" http://localhost:9998/nonexistent +---- + +*Expected:* 404 Not Found. + +=== Test 29: Invalid Method + +[source,bash] +---- +curl -s -w "\nHTTP Status: %{http_code}\n" -X DELETE http://localhost:9998/tika/text +---- + +*Expected:* 405 Method Not Allowed. 
+ +== Cleanup + +[source,bash] +---- +pkill -f "tika-server.jar" +rm -rf /tmp/tika-server-test +---- + +== Usability Test Results + +The following endpoints were tested and verified working: + +=== Default Mode (enableUnsecureFeatures=false) + +[cols="1,1,1", options="header"] +|=== +|Endpoint |Method |Status + +|`/version` |GET |PASS +|`/detect/stream` |PUT |PASS +|`/tika` |PUT |PASS +|`/tika/text` |PUT |PASS +|`/tika/html` |PUT |PASS +|`/tika/xml` |PUT |PASS +|`/tika/json` |PUT |PASS +|`/meta` |PUT |PASS +|`/meta/{field}` |PUT |PASS +|`/rmeta` |PUT |PASS +|`/rmeta/text` |PUT |PASS +|`/language/stream` |PUT |PASS +|`/unpack/all` |PUT |PASS +|`/parsers` |GET |PASS +|`/detectors` |GET |PASS +|`/mime-types` |GET |PASS +|`/meta/form` |POST |PASS +|`/rmeta/form` |POST |PASS +|`/meta/config` |POST |BLOCKED (403) - Expected +|`/rmeta/config` |POST |BLOCKED (403) - Expected +|`/tika/config` |POST |BLOCKED (403) - Expected +|`/unpack/config` |POST |BLOCKED (403) - Expected +|=== + +=== With enableUnsecureFeatures=true + +[cols="1,1,1", options="header"] +|=== +|Endpoint |Method |Status + +|`/meta/config` |POST |PASS +|`/rmeta/config` |POST |PASS +|`/tika/config` |POST |PASS +|`/unpack/config` |POST |PASS +|`/unpack/all/config` |POST |PASS +|=== + +== Known Issues + +=== Issue 1: Language Detection Accuracy + +Short texts may not be detected reliably. The `/language/stream` endpoint works best with substantial text content. 
+ +== Quick Reference + +=== Basic Parsing +[source,bash] +---- +# Text output +curl -X PUT -T file.pdf http://localhost:9998/tika/text + +# HTML output +curl -X PUT -T file.pdf http://localhost:9998/tika/html + +# JSON output (metadata + content) +curl -X PUT -T file.pdf http://localhost:9998/tika/json +---- + +=== Metadata Only +[source,bash] +---- +curl -X PUT -H "Accept: application/json" -T file.pdf http://localhost:9998/meta +---- + +=== Recursive Metadata +[source,bash] +---- +curl -X PUT -T file.docx http://localhost:9998/rmeta +curl -X PUT -T file.docx http://localhost:9998/rmeta/text +---- + +=== Detection +[source,bash] +---- +curl -X PUT -T file.pdf http://localhost:9998/detect/stream +---- + +=== Extract Embedded Files +[source,bash] +---- +curl -X PUT -T file.docx http://localhost:9998/unpack/all -o output.zip +---- + +== Implementation Notes + +=== Automatic Component Configuration + +The server automatically configures the required fetcher and emitter for pipes-based parsing: + +* **tika-server-fetcher**: A file-system-fetcher with `basePath` pointing to a dedicated temp directory for input files. This enables the `/tika`, `/rmeta`, and `/meta` endpoints to work with uploaded files. + +* **unpack-emitter**: A file-system-emitter with `basePath` pointing to a dedicated temp directory for unpacked files. This is only created when the `/unpack` endpoint is enabled (default). This enables the `/unpack/all` endpoint to return embedded files as a ZIP. + +Both temp directories are cleaned up on server shutdown. + +If a user config file does not include `plugin-roots`, the server automatically adds a default value pointing to a `plugins` directory in the current working directory. + +=== Security Boundary + +Child processes (pipes workers) are configured with `basePath` rather than `allowAbsolutePaths`, ensuring they can only access files within their designated temp directories. 
This provides a security boundary between the parent server process and forked child processes. diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java index c6e7a30af8..1257c48e4c 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java @@ -17,7 +17,6 @@ package org.apache.tika.async.cli; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -52,30 +51,59 @@ public class PluginsWriter { } } try { - String jsonTemplate = new String(getClass().getResourceAsStream("/config-template.json").readAllBytes(), StandardCharsets.UTF_8); - String json = jsonTemplate.replace("FETCHER_BASE_PATH", baseInput.toAbsolutePath().toString()); - json = json.replace("EMITTER_BASE_PATH", baseOutput.toAbsolutePath().toString()); - String pluginString = StringUtils.isBlank(simpleAsyncConfig.getPluginsDir()) ? 
"plugins" : simpleAsyncConfig.getPluginsDir(); + ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper(); + ObjectNode root = (ObjectNode) objectMapper.readTree( + getClass().getResourceAsStream("/config-template.json")); + + // Set fetcher basePath + ObjectNode fetchers = (ObjectNode) root.get("fetchers"); + if (fetchers != null && fetchers.has("fsf")) { + ObjectNode fsf = (ObjectNode) fetchers.get("fsf"); + if (fsf != null && fsf.has("file-system-fetcher")) { + ObjectNode fsFetcher = (ObjectNode) fsf.get("file-system-fetcher"); + fsFetcher.put("basePath", baseInput.toAbsolutePath().toString()); + } + } + + // Set emitter basePath + ObjectNode emitters = (ObjectNode) root.get("emitters"); + if (emitters != null && emitters.has("fse")) { + ObjectNode fse = (ObjectNode) emitters.get("fse"); + if (fse != null && fse.has("file-system-emitter")) { + ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter"); + fsEmitter.put("basePath", baseOutput.toAbsolutePath().toString()); + } + } + + // Set pipes-iterator basePath + ObjectNode pipesIterator = (ObjectNode) root.get("pipes-iterator"); + if (pipesIterator != null && pipesIterator.has("file-system-pipes-iterator")) { + ObjectNode fsIterator = (ObjectNode) pipesIterator.get("file-system-pipes-iterator"); + fsIterator.put("basePath", baseInput.toAbsolutePath().toString()); + } + + // Set plugin-roots + String pluginString = StringUtils.isBlank(simpleAsyncConfig.getPluginsDir()) ? + "plugins" : simpleAsyncConfig.getPluginsDir(); Path plugins = Paths.get(pluginString); if (Files.isDirectory(plugins)) { pluginString = plugins.toAbsolutePath().toString(); } - json = json.replace("PLUGIN_ROOTS", pluginString).replace("\\", "/"); - PipesConfig pipesConfig = new PipesConfig(); - - pipesConfig.setNumClients(simpleAsyncConfig.getNumClients() == null ? 
2 : simpleAsyncConfig.getNumClients()); + root.put("plugin-roots", pluginString); + // Set pipes config + PipesConfig pipesConfig = new PipesConfig(); + pipesConfig.setNumClients(simpleAsyncConfig.getNumClients() == null ? + 2 : simpleAsyncConfig.getNumClients()); if (simpleAsyncConfig.getXmx() != null) { pipesConfig.setForkedJvmArgs(new ArrayList<>(List.of(simpleAsyncConfig.getXmx()))); } if (simpleAsyncConfig.getTimeoutMs() != null) { pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs()); } - ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper(); - ObjectNode root = (ObjectNode) objectMapper.readTree(json.getBytes(StandardCharsets.UTF_8)); root.set("pipes", objectMapper.valueToTree(pipesConfig)); - Files.writeString(output, root.toString()); + objectMapper.writerWithDefaultPrettyPrinter().writeValue(output.toFile(), root); } catch (Exception e) { throw new IOException(e); } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index d28cbb96c8..fdc8883f3b 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -27,7 +27,6 @@ import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Set; import org.apache.commons.cli.CommandLine; @@ -444,6 +443,18 @@ public class TikaServerProcess { return endpoints.contains("tika") || endpoints.contains("rmeta"); } + /** + * Determines if the /unpack endpoint is enabled based on configured endpoints. 
+ */ + private static boolean isUnpackEndpointEnabled(TikaServerConfig tikaServerConfig) { + List<String> endpoints = tikaServerConfig.getEndpoints(); + // If no endpoints specified, all default endpoints are loaded (including unpack) + if (endpoints == null || endpoints.isEmpty()) { + return true; + } + return endpoints.contains("unpack"); + } + /** * Initializes the PipesParsingHelper for pipes-based parsing with process isolation. * <p> @@ -452,22 +463,42 @@ public class TikaServerProcess { * <p> * If no config file is provided, a minimal default configuration will be created. * The plugin-roots will default to a "plugins" directory at the same level as the server jar. + * <p> + * A dedicated temp directory is created for input files, and a file-system-fetcher + * is configured with basePath pointing to that directory. This ensures child processes + * can only access files in the designated temp directory (security boundary). * * @param tikaServerConfig the server configuration * @return the PipesParsingHelper * @throws Exception if pipes initialization fails */ private static PipesParsingHelper initPipesParsingHelper(TikaServerConfig tikaServerConfig) throws Exception { - // Load or create config + // Create dedicated temp directory for input files + Path inputTempDirectory = Files.createTempDirectory("tika-server-input-"); + LOG.info("Created input temp directory: {}", inputTempDirectory); + + // Only create unpack temp directory if /unpack endpoint is enabled + Path unpackTempDirectory = null; + if (isUnpackEndpointEnabled(tikaServerConfig)) { + unpackTempDirectory = Files.createTempDirectory("tika-server-unpack-"); + LOG.info("Created unpack temp directory: {}", unpackTempDirectory); + } + + // Load or create config, adding the fetcher (and emitter if unpack is enabled) Path configPath; if (tikaServerConfig.hasConfigFile()) { configPath = tikaServerConfig.getConfigPath(); } else { - configPath = createDefaultConfig(); + configPath = 
createDefaultConfig(inputTempDirectory, unpackTempDirectory); } TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath); + // Ensure fetcher (and emitter if unpack is enabled) are configured with correct basePaths + configPath = ensureServerComponents(configPath, tikaJsonConfig, + inputTempDirectory, unpackTempDirectory); + tikaJsonConfig = TikaJsonConfig.load(configPath); + // Load or create PipesConfig with defaults PipesConfig pipesConfig = tikaJsonConfig.deserialize("pipes", PipesConfig.class); if (pipesConfig == null) { @@ -480,13 +511,13 @@ public class TikaServerProcess { // Create PipesParser PipesParser pipesParser = PipesParser.load(tikaJsonConfig, pipesConfig, configPath); - // Try to determine unpack emitter basePath from config - Path unpackEmitterBasePath = getUnpackEmitterBasePath(tikaJsonConfig); - // Create and return the helper - PipesParsingHelper helper = new PipesParsingHelper(pipesParser, pipesConfig, unpackEmitterBasePath); + PipesParsingHelper helper = new PipesParsingHelper(pipesParser, pipesConfig, + inputTempDirectory, unpackTempDirectory); - // Register shutdown hook to clean up PipesParser + // Register shutdown hook to clean up PipesParser and temp directories + final Path inputDirToClean = inputTempDirectory; + final Path unpackDirToClean = unpackTempDirectory; Runtime.getRuntime().addShutdownHook(new Thread(() -> { try { LOG.info("Shutting down PipesParser"); @@ -494,62 +525,32 @@ public class TikaServerProcess { } catch (Exception e) { LOG.warn("Error closing PipesParser", e); } + // Clean up temp directories + cleanupTempDirectory(inputDirToClean); + if (unpackDirToClean != null) { + cleanupTempDirectory(unpackDirToClean); + } })); return helper; } - /** - * Attempts to determine the basePath for the unpack-emitter from the config. - * Returns null if the emitter is not configured or basePath cannot be determined. 
- */ - private static Path getUnpackEmitterBasePath(TikaJsonConfig tikaJsonConfig) { + private static void cleanupTempDirectory(Path tempDir) { try { - java.util.Map<String, com.fasterxml.jackson.databind.JsonNode> emitters = - tikaJsonConfig.getComponents("emitters"); - if (emitters == null || !emitters.containsKey(PipesParsingHelper.UNPACK_EMITTER_ID)) { - LOG.debug("No unpack-emitter configured, UNPACK mode will not be available"); - return null; - } - - com.fasterxml.jackson.databind.JsonNode emitterConfig = - emitters.get(PipesParsingHelper.UNPACK_EMITTER_ID); - com.fasterxml.jackson.databind.JsonNode basePath = findBasePath(emitterConfig); - if (basePath != null && basePath.isTextual()) { - Path path = Path.of(basePath.asText()); - if (Files.isDirectory(path)) { - LOG.info("UNPACK mode enabled with basePath: {}", path); - return path; - } else { - LOG.warn("unpack-emitter basePath does not exist: {}", path); - } + if (Files.exists(tempDir)) { + Files.walk(tempDir) + .sorted((a, b) -> -a.compareTo(b)) // Delete files before directories + .forEach(p -> { + try { + Files.deleteIfExists(p); + } catch (IOException e) { + LOG.warn("Failed to delete: {}", p); + } + }); } - } catch (Exception e) { - LOG.warn("Failed to determine unpack-emitter basePath", e); + } catch (IOException e) { + LOG.warn("Error cleaning up temp directory: {}", tempDir, e); } - return null; - } - - /** - * Recursively searches for "basePath" in a JSON node. 
- */ - private static com.fasterxml.jackson.databind.JsonNode findBasePath( - com.fasterxml.jackson.databind.JsonNode node) { - if (node == null) { - return null; - } - if (node.has("basePath")) { - return node.get("basePath"); - } - for (com.fasterxml.jackson.databind.JsonNode child : node) { - if (child.isObject()) { - com.fasterxml.jackson.databind.JsonNode result = findBasePath(child); - if (result != null) { - return result; - } - } - } - return null; } /** @@ -559,36 +560,150 @@ public class TikaServerProcess { /** * Creates a default configuration file with plugin-roots set to the "plugins" directory - * relative to the current working directory. + * relative to the current working directory, the tika-server-fetcher configured + * with basePath pointing to the input temp directory, and optionally the unpack-emitter + * configured with basePath pointing to the unpack temp directory. + * + * @param inputTempDirectory the temp directory for input files + * @param unpackTempDirectory the temp directory for unpack output files (may be null) */ - private static Path createDefaultConfig() throws IOException { + private static Path createDefaultConfig(Path inputTempDirectory, + Path unpackTempDirectory) throws IOException { Path pluginsDir = Path.of(DEFAULT_PLUGINS_DIR).toAbsolutePath(); - String configJson = String.format(Locale.ROOT, """ - { - "fetchers": { - "file-system-fetcher": { - "file-system-fetcher": { - "allowAbsolutePaths": true - } - } - }, - "pipes": { - "numClients": 4, - "timeoutMillis": 60000 - }, - "plugin-roots": "%s" - } - """, pluginsDir.toString().replace("\\", "/")); + com.fasterxml.jackson.databind.ObjectMapper mapper = + new com.fasterxml.jackson.databind.ObjectMapper(); + com.fasterxml.jackson.databind.node.ObjectNode rootNode = mapper.createObjectNode(); + + // Create fetchers section + com.fasterxml.jackson.databind.node.ObjectNode fetchersNode = mapper.createObjectNode(); + com.fasterxml.jackson.databind.node.ObjectNode fetcherNode = 
mapper.createObjectNode(); + com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig = mapper.createObjectNode(); + fetcherTypeConfig.put("basePath", inputTempDirectory.toAbsolutePath().toString()); + fetcherNode.set("file-system-fetcher", fetcherTypeConfig); + fetchersNode.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode); + rootNode.set("fetchers", fetchersNode); + + // Create emitters section if unpack is enabled + if (unpackTempDirectory != null) { + com.fasterxml.jackson.databind.node.ObjectNode emittersNode = mapper.createObjectNode(); + com.fasterxml.jackson.databind.node.ObjectNode emitterNode = mapper.createObjectNode(); + com.fasterxml.jackson.databind.node.ObjectNode emitterTypeConfig = mapper.createObjectNode(); + emitterTypeConfig.put("basePath", unpackTempDirectory.toAbsolutePath().toString()); + emitterTypeConfig.put("onExists", "REPLACE"); + emitterNode.set("file-system-emitter", emitterTypeConfig); + emittersNode.set(PipesParsingHelper.UNPACK_EMITTER_ID, emitterNode); + rootNode.set("emitters", emittersNode); + } + + // Create pipes section + com.fasterxml.jackson.databind.node.ObjectNode pipesNode = mapper.createObjectNode(); + pipesNode.put("numClients", 4); + pipesNode.put("timeoutMillis", 60000); + rootNode.set("pipes", pipesNode); + + // Set plugin-roots + rootNode.put("plugin-roots", pluginsDir.toString()); Path tempConfig = Files.createTempFile("tika-server-default-config-", ".json"); - Files.writeString(tempConfig, configJson); + mapper.writerWithDefaultPrettyPrinter().writeValue(tempConfig.toFile(), rootNode); tempConfig.toFile().deleteOnExit(); LOG.info("Created default config with plugin-roots: {}", pluginsDir); return tempConfig; } + /** + * Ensures the tika-server-fetcher exists in the config with basePath pointing to + * the input temp directory. If unpackTempDirectory is provided, also ensures the + * unpack-emitter exists. + * <p> + * The fetcher is used by legacy endpoints (/tika, /rmeta, etc.) 
to read uploaded files + * that have been spooled to the input temp directory. + * <p> + * The emitter is used by /unpack endpoints to write unpacked files that are then + * streamed back to the client. + * <p> + * Both components are configured with basePath (not allowAbsolutePaths) so child processes + * can only access files within their designated temp directories (security boundary). + * + * @param originalConfigPath the original config file path + * @param tikaJsonConfig the parsed Tika JSON config + * @param inputTempDirectory the temp directory for input files + * @param unpackTempDirectory the temp directory for unpack output files (may be null) + * @return the config path to use (always a new merged config with fetcher and optionally emitter) + */ + private static Path ensureServerComponents(Path originalConfigPath, TikaJsonConfig tikaJsonConfig, + Path inputTempDirectory, + Path unpackTempDirectory) throws IOException { + LOG.info("Configuring {} with basePath={}", PipesParsingHelper.DEFAULT_FETCHER_ID, inputTempDirectory); + + // Read original config as a mutable tree + com.fasterxml.jackson.databind.ObjectMapper mapper = + new com.fasterxml.jackson.databind.ObjectMapper(); + com.fasterxml.jackson.databind.node.ObjectNode rootNode = + (com.fasterxml.jackson.databind.node.ObjectNode) mapper.readTree(originalConfigPath.toFile()); + + // Get or create the fetchers section + com.fasterxml.jackson.databind.node.ObjectNode fetchersNode; + if (rootNode.has("fetchers") && rootNode.get("fetchers").isObject()) { + fetchersNode = (com.fasterxml.jackson.databind.node.ObjectNode) rootNode.get("fetchers"); + } else { + fetchersNode = mapper.createObjectNode(); + rootNode.set("fetchers", fetchersNode); + } + + // Create the fetcher config with basePath + // Structure: "tika-server-fetcher": { "file-system-fetcher": { "basePath": "/tmp/..." 
} } + com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig = mapper.createObjectNode(); + fetcherTypeConfig.put("basePath", inputTempDirectory.toAbsolutePath().toString()); + + com.fasterxml.jackson.databind.node.ObjectNode fetcherNode = mapper.createObjectNode(); + fetcherNode.set("file-system-fetcher", fetcherTypeConfig); + + fetchersNode.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode); + + // Only add unpack-emitter if unpack endpoint is enabled + if (unpackTempDirectory != null) { + LOG.info("Configuring {} with basePath={}", PipesParsingHelper.UNPACK_EMITTER_ID, unpackTempDirectory); + + // Get or create the emitters section + com.fasterxml.jackson.databind.node.ObjectNode emittersNode; + if (rootNode.has("emitters") && rootNode.get("emitters").isObject()) { + emittersNode = (com.fasterxml.jackson.databind.node.ObjectNode) rootNode.get("emitters"); + } else { + emittersNode = mapper.createObjectNode(); + rootNode.set("emitters", emittersNode); + } + + // Create the emitter config with basePath + // Structure: "unpack-emitter": { "file-system-emitter": { "basePath": "/tmp/...", "onExists": "REPLACE" } } + com.fasterxml.jackson.databind.node.ObjectNode emitterTypeConfig = mapper.createObjectNode(); + emitterTypeConfig.put("basePath", unpackTempDirectory.toAbsolutePath().toString()); + emitterTypeConfig.put("onExists", "REPLACE"); + + com.fasterxml.jackson.databind.node.ObjectNode emitterNode = mapper.createObjectNode(); + emitterNode.set("file-system-emitter", emitterTypeConfig); + + emittersNode.set(PipesParsingHelper.UNPACK_EMITTER_ID, emitterNode); + } + + // Ensure plugin-roots is set (required for child processes) + if (!rootNode.has("plugin-roots")) { + Path pluginsDir = Path.of(DEFAULT_PLUGINS_DIR).toAbsolutePath(); + rootNode.put("plugin-roots", pluginsDir.toString()); + LOG.info("Added default plugin-roots: {}", pluginsDir); + } + + // Write merged config to temp file + Path mergedConfig = 
Files.createTempFile("tika-server-merged-config-", ".json"); + mapper.writerWithDefaultPrettyPrinter().writeValue(mergedConfig.toFile(), rootNode); + mergedConfig.toFile().deleteOnExit(); + + LOG.debug("Created merged config: {}", mergedConfig); + return mergedConfig; + } + private static class ServerDetails { JAXRSServerFactoryBean sf; String serverId; diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java index c88a1ec799..6b1a6fe699 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Collections; import java.util.List; import java.util.UUID; @@ -50,18 +49,9 @@ import org.apache.tika.server.core.TikaServerParseException; * Helper class for pipes-based parsing in tika-server endpoints. * Handles temp file management, FetchEmitTuple creation, and result processing. * <p> - * To use pipes-based parsing, your tika-config.json must include a file-system fetcher - * with allowAbsolutePaths enabled: - * <pre> - * { - * "fetchers": { - * "file-system-fetcher": { - * "class": "org.apache.tika.pipes.fetcher.fs.FileSystemFetcher", - * "allowAbsolutePaths": true - * } - * } - * } - * </pre> + * The helper manages a dedicated temp directory for input files. A file-system-fetcher + * is configured with basePath pointing to this directory, ensuring child processes + * can only access files within the designated temp directory (no absolute paths). 
*/ public class PipesParsingHelper { @@ -69,9 +59,9 @@ public class PipesParsingHelper { /** * The fetcher ID used for reading temp files. - * This fetcher must be configured in the JSON config with allowAbsolutePaths=true. + * This fetcher is configured with basePath = inputTempDirectory. */ - public static final String DEFAULT_FETCHER_ID = "file-system-fetcher"; + public static final String DEFAULT_FETCHER_ID = "tika-server-fetcher"; private final PipesParser pipesParser; private final PipesConfig pipesConfig; @@ -83,33 +73,42 @@ public class PipesParsingHelper { * * @param pipesParser the PipesParser instance * @param pipesConfig the PipesConfig instance + * @param inputTempDirectory the temp directory for input files. The file-system-fetcher + * is configured with basePath = this directory. * @param unpackEmitterBasePath the basePath where the unpack-emitter writes files. * This is where the server will find the zip files created * by UNPACK mode. May be null if UNPACK mode won't be used. 
*/ - public PipesParsingHelper(PipesParser pipesParser, PipesConfig pipesConfig, Path unpackEmitterBasePath) { + public PipesParsingHelper(PipesParser pipesParser, PipesConfig pipesConfig, + Path inputTempDirectory, Path unpackEmitterBasePath) { this.pipesParser = pipesParser; this.pipesConfig = pipesConfig; + this.inputTempDirectory = inputTempDirectory; this.unpackEmitterBasePath = unpackEmitterBasePath; - // Determine input temp directory - String configTempDir = pipesConfig.getTempDirectory(); - if (configTempDir != null && !configTempDir.isBlank()) { - this.inputTempDirectory = Paths.get(configTempDir); - if (!Files.isDirectory(this.inputTempDirectory)) { - throw new IllegalArgumentException( - "Configured tempDirectory does not exist or is not a directory: " + configTempDir); - } - } else { - this.inputTempDirectory = null; // Use system default + if (inputTempDirectory == null || !Files.isDirectory(inputTempDirectory)) { + throw new IllegalArgumentException( + "inputTempDirectory must be a valid directory: " + inputTempDirectory); } + LOG.info("PipesParsingHelper initialized with inputTempDirectory: {}", inputTempDirectory); + } + + /** + * Gets the input temp directory path. + * @return the input temp directory + */ + public Path getInputTempDirectory() { + return inputTempDirectory; } /** * Parses content using pipes-based parsing with process isolation. * <p> - * The TikaInputStream should already be spooled to a temp file via {@link TikaInputStream#getPath()}. - * The caller is responsible for closing the TikaInputStream, which will clean up any temp files. + * This method spools the input to the dedicated temp directory and uses a relative + * filename in the FetchKey. The file-system-fetcher is configured with basePath + * pointing to this directory, so the child process can only access files there. + * <p> + * The caller is responsible for closing the TikaInputStream. 
* * @param tis the TikaInputStream containing the content to parse * @param metadata metadata to pass to the parser (may include filename, content-type, etc.) @@ -122,17 +121,22 @@ public class PipesParsingHelper { public List<Metadata> parse(TikaInputStream tis, Metadata metadata, ParseContext parseContext, ParseMode parseMode) throws IOException { String requestId = UUID.randomUUID().toString(); + Path tempFile = null; try { - // Get the backing file path from the spooled TikaInputStream - Path inputFile = tis.getPath(); - LOG.debug("parse: using file {} ({} bytes)", inputFile, Files.size(inputFile)); + // Spool input to our dedicated temp directory with proper suffix + String suffix = getSuffix(metadata); + tempFile = Files.createTempFile(inputTempDirectory, "tika-", suffix); + Files.copy(tis, tempFile, java.nio.file.StandardCopyOption.REPLACE_EXISTING); + + String relativeName = tempFile.getFileName().toString(); + LOG.debug("parse: spooled to {} ({} bytes)", relativeName, Files.size(tempFile)); // Set parse mode in context parseContext.set(ParseMode.class, parseMode); - // Create FetchEmitTuple - use NO_EMIT since we're using PASSBACK_ALL - FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, inputFile.toAbsolutePath().toString()); + // Create FetchEmitTuple with relative filename (basePath is configured in fetcher) + FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, relativeName); FetchEmitTuple tuple = new FetchEmitTuple( requestId, @@ -153,9 +157,33 @@ public class PipesParsingHelper { throw new TikaServerParseException("Parsing interrupted"); } catch (PipesException e) { throw new TikaServerParseException(e); + } finally { + // Clean up temp file + if (tempFile != null) { + try { + Files.deleteIfExists(tempFile); + } catch (IOException e) { + LOG.warn("Failed to delete temp file: {}", tempFile, e); + } + } } } + /** + * Extracts file suffix from metadata (resource name or content-type). 
+ */ + private String getSuffix(Metadata metadata) { + String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (resourceName != null) { + int lastDot = resourceName.lastIndexOf('.'); + if (lastDot > 0 && lastDot < resourceName.length() - 1) { + return resourceName.substring(lastDot); + } + } + // Default suffix + return ".tmp"; + } + /** * Processes the PipesResult and returns the metadata list. */ @@ -260,10 +288,11 @@ public class PipesParsingHelper { * extracted embedded documents. * <p> * This method: - * 1. Configures UnpackConfig with zipEmbeddedFiles=true - * 2. The pipes child process extracts embedded files and creates a zip - * 3. The zip is emitted to the configured file-system emitter - * 4. Returns the path to the zip file for streaming + * 1. Spools input to the dedicated temp directory + * 2. Configures UnpackConfig with zipEmbeddedFiles=true + * 3. The pipes child process extracts embedded files and creates a zip + * 4. The zip is emitted to the configured file-system emitter + * 5. Returns the path to the zip file for streaming * <p> * The caller is responsible for deleting the zip file after streaming. 
* @@ -277,42 +306,47 @@ public class PipesParsingHelper { public UnpackResult parseUnpack(TikaInputStream tis, Metadata metadata, ParseContext parseContext, boolean saveAll) throws IOException { String requestId = UUID.randomUUID().toString(); + Path tempFile = null; - // Get the backing file path from the spooled TikaInputStream - Path inputFile = tis.getPath(); - LOG.debug("parseUnpack: using file {} ({} bytes), requestId={}", - inputFile, Files.size(inputFile), requestId); - - // Set parse mode to UNPACK - parseContext.set(ParseMode.class, ParseMode.UNPACK); - - // Configure UnpackConfig - use existing or create new - UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); - if (unpackConfig == null) { - unpackConfig = new UnpackConfig(); - } + try { + // Spool input to our dedicated temp directory with proper suffix + String suffix = getSuffix(metadata); + tempFile = Files.createTempFile(inputTempDirectory, "tika-unpack-", suffix); + Files.copy(tis, tempFile, java.nio.file.StandardCopyOption.REPLACE_EXISTING); + + String relativeName = tempFile.getFileName().toString(); + LOG.debug("parseUnpack: spooled to {} ({} bytes), requestId={}", + relativeName, Files.size(tempFile), requestId); + + // Set parse mode to UNPACK + parseContext.set(ParseMode.class, ParseMode.UNPACK); + + // Configure UnpackConfig - use existing or create new + UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); + if (unpackConfig == null) { + unpackConfig = new UnpackConfig(); + } - // Enable zip creation in the child process - unpackConfig.setZipEmbeddedFiles(true); + // Enable zip creation in the child process + unpackConfig.setZipEmbeddedFiles(true); - // Set suffix strategy to DETECTED so files get their proper extensions (e.g., .wav, .jpg) - unpackConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED); + // Set suffix strategy to DETECTED so files get their proper extensions (e.g., .wav, .jpg) + 
unpackConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED); - // Set emitter to our file-system emitter - unpackConfig.setEmitter(UNPACK_EMITTER_ID); + // Set emitter to our file-system emitter + unpackConfig.setEmitter(UNPACK_EMITTER_ID); - // Include original document if saveAll is requested - if (saveAll) { - unpackConfig.setIncludeOriginal(true); - unpackConfig.setIncludeMetadataInZip(true); - } + // Include original document if saveAll is requested + if (saveAll) { + unpackConfig.setIncludeOriginal(true); + unpackConfig.setIncludeMetadataInZip(true); + } - parseContext.set(UnpackConfig.class, unpackConfig); + parseContext.set(UnpackConfig.class, unpackConfig); - // Create FetchEmitTuple - the emitKey will be used to determine the zip file location - // The zip file will be written to: emitter.basePath + "/" + emitKey + "-embedded.zip" - FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, inputFile.toAbsolutePath().toString()); - EmitKey emitKey = new EmitKey(UNPACK_EMITTER_ID, requestId); + // Create FetchEmitTuple with relative filename (basePath is configured in fetcher) + FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, relativeName); + EmitKey emitKey = new EmitKey(UNPACK_EMITTER_ID, requestId); FetchEmitTuple tuple = new FetchEmitTuple( requestId, @@ -322,70 +356,80 @@ public class PipesParsingHelper { parseContext ); - // Execute parse via pipes - PipesResult result; - try { - result = pipesParser.parse(tuple); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new TikaServerParseException("Parsing interrupted"); - } catch (PipesException e) { - throw new TikaServerParseException(e); - } + // Execute parse via pipes + PipesResult result; + try { + result = pipesParser.parse(tuple); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new TikaServerParseException("Parsing interrupted"); + } catch (PipesException e) { + throw new TikaServerParseException(e); + } - // Check for 
errors - if (result.isProcessCrash() || result.isFatal() || result.isInitializationFailure()) { - LOG.warn("UNPACK parse failed: {} - {}", result.status(), result.message()); - throw new WebApplicationException( - "Parse failed: " + result.status(), - mapStatusToHttpResponse(result.status())); - } + // Check for errors + if (result.isProcessCrash() || result.isFatal() || result.isInitializationFailure()) { + LOG.warn("UNPACK parse failed: {} - {}", result.status(), result.message()); + throw new WebApplicationException( + "Parse failed: " + result.status(), + mapStatusToHttpResponse(result.status())); + } - if (result.isTaskException()) { - LOG.warn("UNPACK task exception: {} - {}", result.status(), result.message()); - throw new WebApplicationException( - "Parse failed: " + result.message(), - Response.Status.INTERNAL_SERVER_ERROR); - } + if (result.isTaskException()) { + LOG.warn("UNPACK task exception: {} - {}", result.status(), result.message()); + throw new WebApplicationException( + "Parse failed: " + result.message(), + Response.Status.INTERNAL_SERVER_ERROR); + } - // Get metadata list from result - List<Metadata> metadataList = Collections.emptyList(); - EmitData emitData = result.emitData(); - if (emitData != null && emitData.getMetadataList() != null) { - metadataList = emitData.getMetadataList(); - } + // Get metadata list from result + List<Metadata> metadataList = Collections.emptyList(); + EmitData emitData = result.emitData(); + if (emitData != null && emitData.getMetadataList() != null) { + metadataList = emitData.getMetadataList(); + } - // Check for parse exceptions in the container document metadata - // These should return appropriate HTTP status codes - if (!metadataList.isEmpty()) { - Metadata containerMetadata = metadataList.get(0); - String containerException = containerMetadata.get(TikaCoreProperties.CONTAINER_EXCEPTION); - if (containerException != null) { - // Map exception type to HTTP status - // 422 (Unprocessable Entity) for 
parse-related exceptions - int status = 422; // Default for parse exceptions - if (containerException.contains("EncryptedDocumentException") || - containerException.contains("TikaException") || - containerException.contains("NullPointerException") || - containerException.contains("IllegalStateException")) { - status = 422; + // Check for parse exceptions in the container document metadata + // These should return appropriate HTTP status codes + if (!metadataList.isEmpty()) { + Metadata containerMetadata = metadataList.get(0); + String containerException = containerMetadata.get(TikaCoreProperties.CONTAINER_EXCEPTION); + if (containerException != null) { + // Map exception type to HTTP status + // 422 (Unprocessable Entity) for parse-related exceptions + int status = 422; // Default for parse exceptions + if (containerException.contains("EncryptedDocumentException") || + containerException.contains("TikaException") || + containerException.contains("NullPointerException") || + containerException.contains("IllegalStateException")) { + status = 422; + } + // Build response with exception string as body for stack trace support + Response response = Response.status(status) + .entity(containerException) + .type("text/plain") + .build(); + throw new WebApplicationException(response); } - // Build response with exception string as body for stack trace support - Response response = Response.status(status) - .entity(containerException) - .type("text/plain") - .build(); - throw new WebApplicationException(response); } - } - // Determine the zip file path - // Regular format: emitter.basePath + "/" + emitKey + "-embedded.zip" - // Frictionless format: emitter.basePath + "/" + emitKey + "-frictionless.zip" - boolean isFrictionless = unpackConfig.getOutputFormat() == UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS; - Path zipFile = getEmittedZipPath(requestId, isFrictionless); + // Determine the zip file path + // Regular format: emitter.basePath + "/" + emitKey + "-embedded.zip" + // 
Frictionless format: emitter.basePath + "/" + emitKey + "-frictionless.zip" + boolean isFrictionless = unpackConfig.getOutputFormat() == UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS; + Path zipFile = getEmittedZipPath(requestId, isFrictionless); - return new UnpackResult(zipFile, metadataList); + return new UnpackResult(zipFile, metadataList); + } finally { + // Clean up temp file + if (tempFile != null) { + try { + Files.deleteIfExists(tempFile); + } catch (IOException e) { + LOG.warn("Failed to delete temp file: {}", tempFile, e); + } + } + } } /** diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index d11d21984d..9cbdb7a11d 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@ -196,7 +196,12 @@ public abstract class CXFTestBase { this.tika = TikaLoader.load(tmp); + // Create input temp directory for pipes-based parsing + Path inputTempDirectory = Files.createTempDirectory("tika-server-test-input-"); + // Initialize PipesParsingHelper for pipes-based parsing + // Merge the fetcher config with basePath pointing to the temp directory + this.pipesConfigPath = mergeFetcherConfig(this.pipesConfigPath, inputTempDirectory); TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(this.pipesConfigPath); PipesConfig pipesConfig = tikaJsonConfig.deserialize("pipes", PipesConfig.class); if (pipesConfig == null) { @@ -204,7 +209,8 @@ public abstract class CXFTestBase { } pipesConfig.setEmitStrategy(new EmitStrategyConfig(EmitStrategy.PASSBACK_ALL)); this.pipesParser = PipesParser.load(tikaJsonConfig, pipesConfig, this.pipesConfigPath); - PipesParsingHelper pipesParsingHelper = new PipesParsingHelper(this.pipesParser, pipesConfig, getUnpackEmitterBasePath()); + PipesParsingHelper pipesParsingHelper = new 
PipesParsingHelper(this.pipesParser, pipesConfig, + inputTempDirectory, getUnpackEmitterBasePath()); TikaResource.init(tika, new ServerStatus(), pipesParsingHelper); } finally { @@ -259,6 +265,37 @@ public abstract class CXFTestBase { return tempConfig; } + /** + * Merges the tika-server-fetcher configuration into the pipes config. + * The fetcher is configured with basePath pointing to the input temp directory. + */ + private Path mergeFetcherConfig(Path configPath, Path inputTempDirectory) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + com.fasterxml.jackson.databind.node.ObjectNode root = + (com.fasterxml.jackson.databind.node.ObjectNode) mapper.readTree(configPath.toFile()); + + // Get or create fetchers section + com.fasterxml.jackson.databind.node.ObjectNode fetchers = + (com.fasterxml.jackson.databind.node.ObjectNode) root.get("fetchers"); + if (fetchers == null) { + fetchers = mapper.createObjectNode(); + root.set("fetchers", fetchers); + } + + // Create the tika-server-fetcher with basePath + com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig = mapper.createObjectNode(); + fetcherTypeConfig.put("basePath", inputTempDirectory.toAbsolutePath().toString()); + + com.fasterxml.jackson.databind.node.ObjectNode fetcherNode = mapper.createObjectNode(); + fetcherNode.set("file-system-fetcher", fetcherTypeConfig); + + fetchers.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode); + + Path tempConfig = Files.createTempFile("tika-server-pipes-fetcher-", ".json"); + mapper.writerWithDefaultPrettyPrinter().writeValue(tempConfig.toFile(), root); + return tempConfig; + } + /** * Creates a default test config with pipes configuration. * If the tika config contains metadata-filters, they are merged into the pipes config.
