This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 1c06d308a9 TIKA-4645-usability-scripts and bug fixes (#2577)
1c06d308a9 is described below
commit 1c06d308a9e2ddba4744d4d860fb23e9c4210516
Author: Tim Allison <[email protected]>
AuthorDate: Mon Feb 2 14:30:43 2026 -0500
TIKA-4645-usability-scripts and bug fixes (#2577)
---
docs/advanced/integration-testing/tika-app.adoc | 398 +++++++++++++++++
docs/advanced/integration-testing/tika-server.adoc | 473 +++++++++++++++++++++
docs/modules/ROOT/pages/migration-to-4x/index.adoc | 17 +
.../pages/migration-to-4x/migrating-to-4x.adoc | 17 +-
.../main/java/org/apache/tika/cli/AsyncHelper.java | 16 +
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 55 ++-
.../java/org/apache/tika/cli/AsyncHelperTest.java | 43 ++
.../test/java/org/apache/tika/cli/TikaCLITest.java | 51 +++
.../src/test/resources/s3/tika-config-s3.json | 5 +-
.../ocr/configs/tika-config-restricted-gdal.json | 5 -
.../apache/tika/parser/ocr/tesseract-config.json | 5 -
.../org/apache/tika/async/cli/PluginsWriter.java | 52 ++-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 58 ++-
.../src/main/resources/config-template.json | 5 +-
.../apache/tika/async/cli/AsyncCliParserTest.java | 90 ++++
.../apache/tika/server/core/TikaServerProcess.java | 265 ++++++++----
.../server/core/resource/PipesParsingHelper.java | 288 +++++++------
.../org/apache/tika/server/core/CXFTestBase.java | 39 +-
18 files changed, 1617 insertions(+), 265 deletions(-)
diff --git a/docs/advanced/integration-testing/tika-app.adoc b/docs/advanced/integration-testing/tika-app.adoc
new file mode 100644
index 0000000000..ea0b846173
--- /dev/null
+++ b/docs/advanced/integration-testing/tika-app.adoc
@@ -0,0 +1,398 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika-App Integration Testing
+
+Integration tests for `tika-app` to be run from a distribution ZIP.
+
+== Setup
+
+[source,bash]
+----
+# Create test directory
+mkdir -p /tmp/tika-app-test
+cd /tmp/tika-app-test
+
+# Copy and extract distribution
+cp /path/to/tika-app-4.0.0-SNAPSHOT.zip .
+unzip tika-app-4.0.0-SNAPSHOT.zip
+cd tika-app-4.0.0-SNAPSHOT
+
+# Get test files
+cp /path/to/tika-main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testPDF.pdf .
+cp /path/to/tika-main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/test_recursive_embedded.docx .
+cp /path/to/tika-main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testHTML.html .
+----
+
+== Test Cases
+
+=== Test 1: Basic Text Extraction
+
+[source,bash]
+----
+java -jar tika-app.jar --text testPDF.pdf
+----
+
+*Expected:* Outputs extracted text from PDF.
+
+=== Test 2: Metadata Extraction
+
+[source,bash]
+----
+java -jar tika-app.jar --metadata testPDF.pdf
+----
+
+*Expected:* Outputs key=value metadata pairs.
+
+=== Test 3: JSON Output with Pretty Print
+
+[source,bash]
+----
+java -jar tika-app.jar --json --pretty-print testPDF.pdf
+----
+
+*Expected:* Clean, readable JSON output with metadata.
+
+=== Test 4: File Type Detection
+
+[source,bash]
+----
+java -jar tika-app.jar --detect testPDF.pdf
+----
+
+*Expected:* Returns `application/pdf`
+
+=== Test 5: Non-existent File Handling
+
+[source,bash]
+----
+java -jar tika-app.jar --text nonexistent_file.pdf
+----
+
+*Expected:* A clear error message (currently shows the confusing "MalformedURLException: no protocol").
+
+=== Test 6: Recursive JSON Output
+
+[source,bash]
+----
+java -jar tika-app.jar --jsonRecursive test_recursive_embedded.docx
+----
+
+*Expected:* JSON array with metadata and content for the main document and all embedded documents.
+
+=== Test 7: Stdin Input
+
+[source,bash]
+----
+echo "Hello World" | java -jar tika-app.jar --text
+----
+
+*Expected:* Outputs "Hello World"
+
+=== Test 8: Extract Attachments (-z)
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/extract-out
+java -jar tika-app.jar -z --extract-dir=/tmp/tika-app-test/extract-out test_recursive_embedded.docx
+ls /tmp/tika-app-test/extract-out
+----
+
+*Expected:* Creates a .json metadata file and extracts embedded files to the extract-out directory.
+
+=== Test 9: Recursive Extract (-Z)
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/extract-recursive
+java -jar tika-app.jar -Z --extract-dir=/tmp/tika-app-test/extract-recursive test_recursive_embedded.docx
+ls -R /tmp/tika-app-test/extract-recursive
+----
+
+*Expected:* Extracts all nested embedded documents recursively.
+
+=== Test 10: Batch Mode (Simple)
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/batch-input
+mkdir -p /tmp/tika-app-test/batch-output
+cp testPDF.pdf testHTML.html /tmp/tika-app-test/batch-input/
+java -jar tika-app.jar /tmp/tika-app-test/batch-input /tmp/tika-app-test/batch-output
+ls /tmp/tika-app-test/batch-output
+----
+
+*Expected:* Creates a .json file for each input file in the output directory.
+
+=== Test 10b: Batch Mode with Output Options
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/batch-output2
+java -jar tika-app.jar -J -t /tmp/tika-app-test/batch-input /tmp/tika-app-test/batch-output2
+ls /tmp/tika-app-test/batch-output2
+----
+
+*Expected:* Creates .json files with text content (X-TIKA:content_handler should be ToTextContentHandler).
+
+=== Test 11: Version Check
+
+[source,bash]
+----
+java -jar tika-app.jar --version
+----
+
+*Expected:* Returns `Apache Tika X.X.X`
+
+=== Test 12: List Parsers
+
+[source,bash]
+----
+java -jar tika-app.jar --list-parsers
+----
+
+*Expected:* Hierarchical list of available parsers.
+
+=== Test 13: Language Detection
+
+[source,bash]
+----
+java -jar tika-app.jar --language testPDF.pdf
+----
+
+*Expected:* Returns detected language code.
+
+=== Test 14: Digest Computation
+
+[source,bash]
+----
+java -jar tika-app.jar --digest=md5 --json testPDF.pdf
+----
+
+*Expected:* JSON output includes `X-TIKA:digest:MD5` field.
+
+=== Test 15: URL Input
+
+[source,bash]
+----
+java -jar tika-app.jar --detect https://www.apache.org/
+----
+
+*Expected:* Returns `text/html`
+
+=== Test 16: XMP Output
+
+[source,bash]
+----
+java -jar tika-app.jar --xmp testPDF.pdf
+----
+
+*Expected:* Valid XMP metadata in RDF/XML format.
+
+=== Test 17: Boilerpipe Main Content
+
+[source,bash]
+----
+java -jar tika-app.jar --text-main testHTML.html
+----
+
+*Expected:* Returns only main content, not boilerplate.
+
+=== Test 18: Depth Limiting
+
+[source,bash]
+----
+java -jar tika-app.jar --maxEmbeddedDepth=1 --text test_recursive_embedded.docx
+----
+
+*Expected:* Limited depth of embedded document extraction.
+
+=== Test 19: GUI Mode
+
+[source,bash]
+----
+java -jar tika-app.jar
+----
+
+*Expected:* Opens GUI (skip in headless environments).
+
+== Advanced Tests: Custom Config
+
+These tests require creating a custom tika-config.json file.
+
+=== Test 20: Create Custom Config File
+
+Create `/tmp/tika-app-test/my-config.json`:
+
+[source,json]
+----
+{
+ "content-handler-factory": {
+ "basic-content-handler-factory": {
+ "type": "TEXT",
+ "writeLimit": 100000,
+ "throwOnWriteLimitReached": false
+ }
+ },
+ "parsers": [
+ {
+ "default-parser": {}
+ },
+ {
+ "pdf-parser": {
+ "extractActions": true,
+ "extractInlineImages": true,
+ "ocrStrategy": "NO_OCR"
+ }
+ },
+ {
+ "ooxml-parser": {
+ "includeDeletedContent": true,
+ "includeMoveFromContent": true,
+ "extractMacros": true
+ }
+ }
+ ],
+ "fetchers": {
+ "fsf": {
+ "file-system-fetcher": {
+ "basePath": "/tmp/tika-app-test/batch-input",
+ "extractFileSystemMetadata": true
+ }
+ }
+ },
+ "emitters": {
+ "fse": {
+ "file-system-emitter": {
+ "basePath": "/tmp/tika-app-test/config-output",
+ "fileExtension": "json",
+ "onExists": "REPLACE"
+ }
+ }
+ },
+ "pipes-iterator": {
+ "file-system-pipes-iterator": {
+ "basePath": "/tmp/tika-app-test/batch-input",
+ "countTotal": true,
+ "fetcherId": "fsf",
+ "emitterId": "fse"
+ }
+ },
+ "pipes": {
+ "parseMode": "RMETA",
+ "numClients": 2,
+ "timeoutMillis": 60000
+ },
+ "plugin-roots": "/tmp/tika-app-test/plugins"
+}
+----
+
+=== Test 21: Run with Custom Config
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/config-output
+java -jar tika-app.jar /tmp/tika-app-test/my-config.json
+ls /tmp/tika-app-test/config-output
+----
+
+*Expected:* Processes all files in batch-input using custom parser settings.
+
+=== Test 22: Async Mode with Config Flag
+
+[source,bash]
+----
+java -jar tika-app.jar -a --config=/tmp/tika-app-test/my-config.json
+----
+
+*Expected:* Same as Test 21 but using explicit async flag.
+
+=== Test 23: Unpack with Frictionless Format
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/frictionless-out
+java -jar tika-app.jar -Z --extract-dir=/tmp/tika-app-test/frictionless-out --unpack-format=FRICTIONLESS --unpack-include-metadata test_recursive_embedded.docx
+ls /tmp/tika-app-test/frictionless-out
+----
+
+*Expected:* Extracts embedded files in Frictionless data package format with metadata.json.
+
+=== Test 24: Unpack to Directory (not zipped)
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/unpack-dir-out
+java -jar tika-app.jar -Z --extract-dir=/tmp/tika-app-test/unpack-dir-out --unpack-mode=DIRECTORY test_recursive_embedded.docx
+ls -R /tmp/tika-app-test/unpack-dir-out
+----
+
+*Expected:* Extracts embedded files to a directory structure instead of a ZIP.
+
+=== Test 25: Batch with Multiple Workers
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/multi-worker-out
+java -jar tika-app.jar -n 4 /tmp/tika-app-test/batch-input /tmp/tika-app-test/multi-worker-out
+----
+
+*Expected:* Processes files using 4 parallel forked clients.
+
+=== Test 26: Batch with Custom Timeout
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/timeout-out
+java -jar tika-app.jar -T 30000 /tmp/tika-app-test/batch-input /tmp/tika-app-test/timeout-out
+----
+
+*Expected:* Processes files with a 30-second timeout per file.
+
+=== Test 27: Batch with Custom Heap
+
+[source,bash]
+----
+mkdir -p /tmp/tika-app-test/heap-out
+java -jar tika-app.jar -X 2g /tmp/tika-app-test/batch-input /tmp/tika-app-test/heap-out
+----
+
+*Expected:* Forked processes use a 2 GB heap.
+
+== Known Issues
+
+=== Issue 1: Confusing "no protocol" Error
+
+When a file doesn't exist, the error message is misleading:
+[source]
+----
+MalformedURLException: no protocol: nonexistent_file.pdf
+----
+
+The message should instead say "File not found".
+
+=== Issue 2: INFO Message on Every Command
+
+Every command prints an INFO message to stderr about convenience features. Use `2>/dev/null` to suppress it.
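+
+For example, using the command from Test 1:
+
+[source,bash]
+----
+java -jar tika-app.jar --text testPDF.pdf 2>/dev/null
+----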
+
+=== Issue 3: Config Dump Options Not Implemented
+
+These options are not yet implemented in 4.x:
+
+* `--dump-minimal-config`
+* `--dump-current-config`
+* `--dump-static-config`
+* `--dump-static-full-config`
diff --git a/docs/advanced/integration-testing/tika-server.adoc b/docs/advanced/integration-testing/tika-server.adoc
new file mode 100644
index 0000000000..85bca5f1fa
--- /dev/null
+++ b/docs/advanced/integration-testing/tika-server.adoc
@@ -0,0 +1,473 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika-Server Integration Testing
+
+Integration tests for `tika-server` to be run from a distribution ZIP.
+
+== Setup
+
+[source,bash]
+----
+# Create test directory
+mkdir -p /tmp/tika-server-test
+cd /tmp/tika-server-test
+
+# Copy and extract distribution
+cp /path/to/tika-server-standard-4.0.0-SNAPSHOT-bin.zip .
+unzip tika-server-standard-4.0.0-SNAPSHOT-bin.zip
+
+# Copy test files
+cp /path/to/test-documents/testPDF.pdf .
+cp /path/to/test-documents/testHTML.html .
+cp /path/to/test-documents/test_recursive_embedded.docx .
+----
+
+== Part 1: Default Mode Tests
+
+Start server in default mode (config endpoints disabled):
+
+[source,bash]
+----
+java -jar tika-server.jar --port 9998 &
+sleep 8
+curl -s http://localhost:9998/version
+----
+
+=== Test 1: GET /version
+
+[source,bash]
+----
+curl -s http://localhost:9998/version
+----
+
+*Expected:* `Apache Tika X.X.X`
+
+=== Test 2: PUT /detect/stream
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/detect/stream
+----
+
+*Expected:* `application/pdf`
+
+=== Test 3: PUT /tika/text
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/text
+----
+
+*Expected:* Plain text content extracted from PDF.
+
+=== Test 4: PUT /tika/html
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/html
+----
+
+*Expected:* HTML with metadata in `<meta>` tags and content in `<body>`.
+
+=== Test 5: PUT /tika/xml
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/xml
+----
+
+*Expected:* XHTML content (starts with `<html xmlns=...>`).
+
+=== Test 6: PUT /tika/json
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/tika/json
+----
+
+*Expected:* JSON object with metadata and X-TIKA:content field.
+
+=== Test 7: PUT /meta
+
+[source,bash]
+----
+curl -s -X PUT -H "Accept: application/json" -T testPDF.pdf http://localhost:9998/meta
+----
+
+*Expected:* JSON object with metadata only (no content).
+
+=== Test 8: PUT /meta/{field}
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/meta/Content-Type
+----
+
+*Expected:* `Content-Type,application/pdf`
+
+=== Test 9: PUT /rmeta
+
+[source,bash]
+----
+curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/rmeta
+----
+
+*Expected:* JSON array with metadata for the main document and all embedded documents.
+
+=== Test 10: PUT /rmeta/text
+
+[source,bash]
+----
+curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/rmeta/text
+----
+
+*Expected:* JSON array with ToTextContentHandler content.
+
+=== Test 11: PUT /language/stream
+
+[source,bash]
+----
+curl -s -X PUT -T testPDF.pdf http://localhost:9998/language/stream
+----
+
+*Expected:* Two-letter language code (e.g., `en`, `th`).
+
+=== Test 12: PUT /unpack/all
+
+[source,bash]
+----
+curl -s -X PUT -T test_recursive_embedded.docx http://localhost:9998/unpack/all -o /tmp/unpack.zip
+unzip -l /tmp/unpack.zip
+----
+
+*Expected:* ZIP file containing extracted embedded files plus `__TEXT__` and `__METADATA__` files.
+
+=== Test 13: GET /parsers
+
+[source,bash]
+----
+curl -s -H "Accept: text/plain" http://localhost:9998/parsers
+----
+
+*Expected:* Hierarchical list of available parsers.
+
+=== Test 14: GET /detectors
+
+[source,bash]
+----
+curl -s -H "Accept: text/plain" http://localhost:9998/detectors
+----
+
+*Expected:* List of available detectors.
+
+=== Test 15: GET /mime-types
+
+[source,bash]
+----
+curl -s -H "Accept: application/json" http://localhost:9998/mime-types
+----
+
+*Expected:* JSON object with all known MIME types.
+
+=== Test 16: POST /meta/form
+
+[source,bash]
+----
+curl -s -X POST -F "upload=@testPDF.pdf" -H "Accept: application/json" http://localhost:9998/meta/form
+----
+
+*Expected:* JSON metadata from multipart form upload.
+
+=== Test 17: POST /rmeta/form
+
+[source,bash]
+----
+curl -s -X POST -F "upload=@test_recursive_embedded.docx" http://localhost:9998/rmeta/form
+----
+
+*Expected:* JSON array with recursive metadata from multipart upload.
+
+=== Test 18: Config Endpoints Blocked (Default Mode)
+
+[source,bash]
+----
+curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/meta/config
+curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/rmeta/config
+curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/tika/config
+curl -s -w "\nHTTP Status: %{http_code}\n" -X POST -F "file=@testPDF.pdf" http://localhost:9998/unpack/config
+----
+
+*Expected:* All return HTTP 403 with message: "Config endpoints are disabled. Set enableUnsecureFeatures=true in server config."
+
+== Part 2: Tests with enableUnsecureFeatures
+
+Stop the default server and create a config file:
+
+[source,bash]
+----
+pkill -f "tika-server.jar"
+
+cat > tika-config-unsecure.json << 'EOF'
+{
+ "server": {
+ "port": 9998,
+ "host": "localhost",
+ "enableUnsecureFeatures": true
+ },
+ "parsers": [
+ {"default-parser": {}}
+ ],
+ "plugin-roots": "/tmp/tika-server-test/plugins"
+}
+EOF
+
+java -jar tika-server.jar -c tika-config-unsecure.json &
+sleep 10
+curl -s http://localhost:9998/version
+----
+
+=== Test 19: POST /meta/config
+
+[source,bash]
+----
+curl -s -X POST -F "file=@testPDF.pdf" -H "Accept: application/json" http://localhost:9998/meta/config
+----
+
+*Expected:* JSON metadata.
+
+=== Test 20: POST /meta/config with custom parser config
+
+[source,bash]
+----
+curl -s -X POST -F "file=@testPDF.pdf" \
+ -F 'config={"parsers":[{"pdf-parser":{"ocrStrategy":"NO_OCR"}}]}' \
+ -H "Accept: application/json" \
+ http://localhost:9998/meta/config
+----
+
+*Expected:* JSON metadata with custom PDF parser config applied.
+
+=== Test 21: POST /unpack/config
+
+[source,bash]
+----
+curl -s -X POST -F "file=@test_recursive_embedded.docx" http://localhost:9998/unpack/config -o /tmp/unpack-config.zip
+unzip -l /tmp/unpack-config.zip
+----
+
+*Expected:* ZIP with extracted embedded files.
+
+=== Test 22: POST /unpack/all/config
+
+[source,bash]
+----
+curl -s -X POST -F "file=@test_recursive_embedded.docx" http://localhost:9998/unpack/all/config -o /tmp/unpack-all.zip
+unzip -l /tmp/unpack-all.zip
+----
+
+*Expected:* ZIP with all recursively extracted files.
+
+== Server Options
+
+=== Test 23: Custom Port
+
+[source,bash]
+----
+java -jar tika-server.jar --port 9999 &
+sleep 8
+curl -s http://localhost:9999/version
+----
+
+*Expected:* Server responds on port 9999.
+
+=== Test 24: Custom Host
+
+[source,bash]
+----
+java -jar tika-server.jar --host 0.0.0.0 --port 9998 &
+----
+
+*Expected:* Server binds to all interfaces.
+
+=== Test 25: With Config File
+
+[source,bash]
+----
+java -jar tika-server.jar -c tika-config.json &
+----
+
+*Expected:* Server uses custom configuration.
+
+== Headers
+
+=== Test 26: X-Tika-OCRskipOcr Header
+
+[source,bash]
+----
+curl -s -X PUT -H "X-Tika-OCRskipOcr: true" -T testPDF.pdf http://localhost:9998/tika/text
+----
+
+*Expected:* Text extraction without OCR.
+
+=== Test 27: Content-Disposition Filename
+
+[source,bash]
+----
+curl -s -X PUT -H "Content-Disposition: attachment; filename=myfile.pdf" -T testPDF.pdf http://localhost:9998/meta/resourceName
+----
+
+*Expected:* Returns the filename from Content-Disposition header.
+
+== Error Handling
+
+=== Test 28: Non-existent Endpoint
+
+[source,bash]
+----
+curl -s -w "\nHTTP Status: %{http_code}\n" http://localhost:9998/nonexistent
+----
+
+*Expected:* 404 Not Found.
+
+=== Test 29: Invalid Method
+
+[source,bash]
+----
+curl -s -w "\nHTTP Status: %{http_code}\n" -X DELETE http://localhost:9998/tika/text
+----
+
+*Expected:* 405 Method Not Allowed.
+
+== Cleanup
+
+[source,bash]
+----
+pkill -f "tika-server.jar"
+rm -rf /tmp/tika-server-test
+----
+
+== Usability Test Results
+
+The following endpoints were tested and verified working:
+
+=== Default Mode (enableUnsecureFeatures=false)
+
+[cols="1,1,1", options="header"]
+|===
+|Endpoint |Method |Status
+
+|`/version` |GET |PASS
+|`/detect/stream` |PUT |PASS
+|`/tika` |PUT |PASS
+|`/tika/text` |PUT |PASS
+|`/tika/html` |PUT |PASS
+|`/tika/xml` |PUT |PASS
+|`/tika/json` |PUT |PASS
+|`/meta` |PUT |PASS
+|`/meta/{field}` |PUT |PASS
+|`/rmeta` |PUT |PASS
+|`/rmeta/text` |PUT |PASS
+|`/language/stream` |PUT |PASS
+|`/unpack/all` |PUT |PASS
+|`/parsers` |GET |PASS
+|`/detectors` |GET |PASS
+|`/mime-types` |GET |PASS
+|`/meta/form` |POST |PASS
+|`/rmeta/form` |POST |PASS
+|`/meta/config` |POST |BLOCKED (403) - Expected
+|`/rmeta/config` |POST |BLOCKED (403) - Expected
+|`/tika/config` |POST |BLOCKED (403) - Expected
+|`/unpack/config` |POST |BLOCKED (403) - Expected
+|===
+
+=== With enableUnsecureFeatures=true
+
+[cols="1,1,1", options="header"]
+|===
+|Endpoint |Method |Status
+
+|`/meta/config` |POST |PASS
+|`/rmeta/config` |POST |PASS
+|`/tika/config` |POST |PASS
+|`/unpack/config` |POST |PASS
+|`/unpack/all/config` |POST |PASS
+|===
+
+== Known Issues
+
+=== Issue 1: Language Detection Accuracy
+
+Short texts may not be detected reliably. The `/language/stream` endpoint works best with substantial text content.
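+
+For example (any test document with at least a few paragraphs of text will do):
+
+[source,bash]
+----
+curl -s -X PUT -T testHTML.html http://localhost:9998/language/stream
+----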
+
+== Quick Reference
+
+=== Basic Parsing
+[source,bash]
+----
+# Text output
+curl -X PUT -T file.pdf http://localhost:9998/tika/text
+
+# HTML output
+curl -X PUT -T file.pdf http://localhost:9998/tika/html
+
+# JSON output (metadata + content)
+curl -X PUT -T file.pdf http://localhost:9998/tika/json
+----
+
+=== Metadata Only
+[source,bash]
+----
+curl -X PUT -H "Accept: application/json" -T file.pdf http://localhost:9998/meta
+----
+
+=== Recursive Metadata
+[source,bash]
+----
+curl -X PUT -T file.docx http://localhost:9998/rmeta
+curl -X PUT -T file.docx http://localhost:9998/rmeta/text
+----
+
+=== Detection
+[source,bash]
+----
+curl -X PUT -T file.pdf http://localhost:9998/detect/stream
+----
+
+=== Extract Embedded Files
+[source,bash]
+----
+curl -X PUT -T file.docx http://localhost:9998/unpack/all -o output.zip
+----
+
+== Implementation Notes
+
+=== Automatic Component Configuration
+
+The server automatically configures the required fetcher and emitter for pipes-based parsing:
+
+* **tika-server-fetcher**: A file-system-fetcher with `basePath` pointing to a dedicated temp directory for input files. This enables the `/tika`, `/rmeta`, and `/meta` endpoints to work with uploaded files.
+
+* **unpack-emitter**: A file-system-emitter with `basePath` pointing to a dedicated temp directory for unpacked files. This is only created when the `/unpack` endpoint is enabled (default). This enables the `/unpack/all` endpoint to return embedded files as a ZIP.
+
+Both temp directories are cleaned up on server shutdown.
+
+If a user config file does not include `plugin-roots`, the server automatically adds a default value pointing to a `plugins` directory in the current working directory.
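+
+The generated configuration is roughly equivalent to the following sketch (the temp-directory paths are created at startup and the component IDs come from `PipesParsingHelper`; both are shown here only for illustration):
+
+[source,json]
+----
+{
+  "fetchers": {
+    "tika-server-fetcher": {
+      "file-system-fetcher": {
+        "basePath": "/tmp/tika-server-input-12345"
+      }
+    }
+  },
+  "emitters": {
+    "unpack-emitter": {
+      "file-system-emitter": {
+        "basePath": "/tmp/tika-server-unpack-12345",
+        "onExists": "REPLACE"
+      }
+    }
+  },
+  "pipes": {
+    "numClients": 4,
+    "timeoutMillis": 60000
+  },
+  "plugin-roots": "/path/to/plugins"
+}
+----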
+
+=== Security Boundary
+
+Child processes (pipes workers) are configured with `basePath` rather than
+`allowAbsolutePaths`, ensuring they can only access files within their designated temp directories. This provides a security boundary between the parent server process and forked child processes.
diff --git a/docs/modules/ROOT/pages/migration-to-4x/index.adoc b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
index 20a7c5cf48..eebf29f3db 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/index.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
@@ -31,3 +31,20 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and support schedules.
* xref:migration-to-4x/design-notes-4x.adoc[Design Notes] - Architectural decisions and design rationale
* xref:migration-to-4x/serialization-4x.adoc[Serialization] - JSON serialization design and implementation details
+
+== TODOs / Missing Features in 4.x
+
+The following features from 3.x are not yet implemented in 4.x:
+
+=== Config Serialization
+
+The following tika-app options for dumping configuration are not yet available:
+
+* `--dump-minimal-config` - Print minimal TikaConfig
+* `--dump-current-config` - Print current TikaConfig
+* `--dump-static-config` - Print static config
+* `--dump-static-full-config` - Print static explicit config
+
+These require completing the JSON serialization support for TikaConfig objects. The underlying serialization infrastructure exists (see xref:migration-to-4x/serialization-4x.adoc[Serialization]), but the CLI integration is pending.
+
+*Workaround:* Manually create JSON config files, using the template in `tika-pipes/tika-async-cli/src/main/resources/config-template.json` as a starting point.
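+
+A minimal hand-written config along the lines of that template might look like the following sketch (paths are placeholders; see the template for the full set of options):
+
+[source,json]
+----
+{
+  "fetchers": {
+    "fsf": {
+      "file-system-fetcher": {
+        "basePath": "/path/to/input"
+      }
+    }
+  },
+  "emitters": {
+    "fse": {
+      "file-system-emitter": {
+        "basePath": "/path/to/output"
+      }
+    }
+  },
+  "pipes-iterator": {
+    "file-system-pipes-iterator": {
+      "basePath": "/path/to/input",
+      "fetcherId": "fsf",
+      "emitterId": "fse"
+    }
+  },
+  "pipes": {
+    "numClients": 2,
+    "timeoutMillis": 60000
+  },
+  "plugin-roots": "plugins"
+}
+----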
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index c8cd0a7242..5c963f4809 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -76,16 +76,17 @@ The converter currently supports:
"sortByPosition": true,
"maxMainMemoryBytes": 1000000
}
- },
- {
- "default-parser": {
- "_exclude": ["pdf-parser"]
- }
}
]
}
----
+NOTE: When you configure a parser with specific settings in JSON, the loader automatically
+excludes it from SPI loading. The parser (e.g., `pdf-parser`) is not even instantiated in
+`default-parser` if there is a definition for it in tika-config.json. Explicit `_exclude`
+directives are only needed when you want to disable a parser entirely without providing
+custom configuration.
+
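+For example, to disable a parser entirely without configuring it:
+
+[source,json]
+----
+{
+  "parsers": [
+    {
+      "default-parser": {
+        "_exclude": ["tesseract-ocr-parser"]
+      }
+    }
+  ]
+}
+----
+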
=== Key Differences
[cols="1,1,2"]
@@ -102,13 +103,9 @@ The converter currently supports:
|Exclusions
|`<parser-exclude class="..."/>`
-|`"_exclude": ["component-name"]`
+|`"_exclude": ["component-name"]` (only needed to disable a parser entirely)
|===
-NOTE: When you configure a parser with specific settings in JSON, the loader automatically
-excludes it from SPI loading. Explicit exclusions are only needed when you want to disable
-a parser entirely without providing custom configuration.
-
=== Limitations
The automatic converter has some limitations:
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index 38a0094f79..e3561ecf5f 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -58,6 +58,22 @@ public class AsyncHelper {
argList.add(mode);
} else if (arg.equals(UNPACK_INCLUDE_METADATA)) {
argList.add("--unpack-include-metadata");
+ } else if (arg.equals("-t") || arg.equals("--text")) {
+ // Translate TikaCLI text output to TikaAsyncCLI handler type
+ argList.add("-h");
+ argList.add("t");
+ } else if (arg.equals("--html")) {
+ // Translate TikaCLI html output to TikaAsyncCLI handler type
+            // Note: TikaCLI uses -h for html, but TikaAsyncCLI uses -h for handler type
+ argList.add("-h");
+ argList.add("h");
+ } else if (arg.equals("-x") || arg.equals("--xml")) {
+ // Translate TikaCLI xml output to TikaAsyncCLI handler type
+ argList.add("-h");
+ argList.add("x");
+ } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
+                // TikaAsyncCLI always outputs JSON with recursive metadata (RMETA mode)
+ // This is already the default, so we just skip this arg
} else {
argList.add(args[i]);
}
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 5f388865c4..97ca90a489 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -38,7 +38,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
@@ -276,25 +275,11 @@ public class TikaCLI {
if (args.length == 1 && args[0].endsWith(".json")) {
TikaAsyncCLI.main(args);
return;
- };
- //TODO -- are there other shortcuts?
- Path tmpConfig = null;
- try {
- tmpConfig = Files.createTempFile("tika-config-", ".json");
-            Files.copy(TikaCLI.class.getResourceAsStream("/tika-config-default-single-file.json"),
- tmpConfig, StandardCopyOption.REPLACE_EXISTING);
- List<String> argList = new ArrayList<>();
- argList.add("-c");
- argList.add(tmpConfig.toAbsolutePath().toString());
- for (String arg : args) {
- argList.add(arg);
- }
- TikaAsyncCLI.main(argList.toArray(new String[0]));
- } finally {
- if (tmpConfig != null) {
- Files.delete(tmpConfig);
- }
}
+ // For batch mode (two directories), pass directly to TikaAsyncCLI.
+ // It will create its own config with PluginsWriter that includes
+ // plugin-roots, fetcher, emitter, and pipes-iterator configuration.
+ TikaAsyncCLI.main(args);
}
/**
@@ -350,12 +335,34 @@ public class TikaCLI {
private boolean testForAsync(String[] args) {
+ // Single .json file is a config file for async mode
+ if (args.length == 1 && args[0].endsWith(".json")) {
+ return true;
+ }
+
if (args.length == 2) {
if (Files.isDirectory(Paths.get(args[0]))) {
return true;
}
}
+ // Check if last two args are directories (batch mode with options)
+ if (args.length >= 2) {
+ String lastArg = args[args.length - 1];
+ String secondLastArg = args[args.length - 2];
+ // Make sure neither looks like an option value
+ if (!lastArg.startsWith("-") && !secondLastArg.startsWith("-")) {
+ try {
+ if (Files.isDirectory(Paths.get(secondLastArg)) &&
+                        (Files.isDirectory(Paths.get(lastArg)) || !Files.exists(Paths.get(lastArg)))) {
+ return true;
+ }
+ } catch (Exception e) {
+ // Invalid path, not batch mode
+ }
+ }
+ }
+
for (String arg : args) {
if (arg.equals("-a") || arg.equals("--async")) {
return true;
@@ -590,10 +597,12 @@ public class TikaCLI {
out.println();
out.println(" --config=<tika-config.xml>");
out.println(" TikaConfig file. Must be specified before -g, -s,
-f or the dump-x-config !");
- out.println(" --dump-minimal-config Print minimal TikaConfig");
- out.println(" --dump-current-config Print current TikaConfig");
- out.println(" --dump-static-config Print static config");
-    out.println("   --dump-static-full-config Print static explicit config");
+        // TODO: TIKA-XXXX - Re-enable config dump options once JSON serialization is complete
+        // These options are not yet implemented in 4.x due to the migration from XML to JSON config
+ // out.println(" --dump-minimal-config Print minimal TikaConfig");
+ // out.println(" --dump-current-config Print current TikaConfig");
+ // out.println(" --dump-static-config Print static config");
+        // out.println("   --dump-static-full-config Print static explicit config");
out.println("
--convert-config-xml-to-json=<input.xml>,<output.json>");
out.println(" Convert legacy XML config to JSON format (parsers
section only)");
out.println("");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index 9885feac3f..a26f247500 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -28,4 +28,47 @@ public class AsyncHelperTest {
        String[] expected = new String[]{"-c", "blah.json", "-i", "input.docx", "-o", "output/dir"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
+
+ @Test
+ public void testTextHandler() throws Exception {
+ String[] args = new String[]{"-t", "input", "output"};
+ String[] expected = new String[]{"-h", "t", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testTextHandlerLong() throws Exception {
+ String[] args = new String[]{"--text", "input", "output"};
+ String[] expected = new String[]{"-h", "t", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testHtmlHandler() throws Exception {
+ String[] args = new String[]{"--html", "input", "output"};
+ String[] expected = new String[]{"-h", "h", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testXmlHandler() throws Exception {
+ String[] args = new String[]{"-x", "input", "output"};
+ String[] expected = new String[]{"-h", "x", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testJsonRecursiveSkipped() throws Exception {
+ // -J is the default in async mode, so it's just skipped
+ String[] args = new String[]{"-J", "-t", "input", "output"};
+ String[] expected = new String[]{"-h", "t", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testBatchModeWithOptions() throws Exception {
+        String[] args = new String[]{"-J", "-t", "/path/to/input", "/path/to/output"};
+        String[] expected = new String[]{"-h", "t", "/path/to/input", "/path/to/output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
}
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 0de27d2354..8c3d78cd34 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -568,6 +568,57 @@ public class TikaCLITest {
"Should have at least 2 files (json + embedded), got " +
fileNames.size() + ": " + fileNames);
}
+ /**
+ * Test that --extract-dir option correctly sets the output directory
+ * for both -z (shallow) and -Z (recursive) extraction modes.
+ */
+ @Test
+ public void testExtractDirOption() throws Exception {
+        Path input = Paths.get(new URI(resourcePrefix + "/test_recursive_embedded.docx"));
+ Path pluginsDir = Paths.get("target/plugins");
+
+ // Test with -z (shallow extraction)
+ String[] params = {"-z",
+ "--extract-dir=" + extractDir.toAbsolutePath(),
+ "-p", pluginsDir.toAbsolutePath().toString(),
+ input.toAbsolutePath().toString()};
+
+ TikaCLI.main(params);
+
+ Set<String> fileNames = getFileNames(extractDir);
+
+        // Should have extracted files in the specified directory, not current dir
+        assertTrue(fileNames.stream().anyMatch(f -> f.endsWith(".json")),
+                "Should have a .json metadata file in extractDir, got: " + fileNames);
+        assertTrue(fileNames.stream().anyMatch(f -> f.contains("-embed/")),
+                "Should have extracted embedded files in extractDir, got: " + fileNames);
+ }
+
+ /**
+ * Test that --extract-dir option works with -Z (recursive) extraction.
+ */
+ @Test
+ public void testExtractDirOptionRecursive() throws Exception {
+        Path input = Paths.get(new URI(resourcePrefix + "/test_recursive_embedded.docx"));
+ Path pluginsDir = Paths.get("target/plugins");
+
+ // Test with -Z (recursive extraction)
+ String[] params = {"-Z",
+ "--extract-dir=" + extractDir.toAbsolutePath(),
+ "-p", pluginsDir.toAbsolutePath().toString(),
+ input.toAbsolutePath().toString()};
+
+ TikaCLI.main(params);
+
+ Set<String> fileNames = getFileNames(extractDir);
+
+ // Should have extracted files in the specified directory
+ assertTrue(fileNames.stream().anyMatch(f -> f.endsWith(".json")),
+                "Should have a .json metadata file in extractDir, got: " + fileNames);
+        assertTrue(fileNames.stream().anyMatch(f -> f.contains("-embed/")),
+                "Should have extracted embedded files in extractDir, got: " + fileNames);
+ }
+
@Test
public void testDefaultConfigException() throws Exception {
//default xml parser will throw TikaException
diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
index e16f0a9b6b..bca9d1a664 100644
--- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
+++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
@@ -3,10 +3,7 @@
{
"default-parser": {
"_exclude": [
- "tesseract-ocr-parser",
- "pdf-parser",
- "ooxml-parser",
- "office-parser"
+ "tesseract-ocr-parser"
]
}
},
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/org/apache/tika/parser/ocr/configs/tika-config-restricted-gdal.json b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/org/apache/tika/parser/ocr/configs/tika-config-restricted-gdal.json
index 40c05e4288..99cf597805 100644
--- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/org/apache/tika/parser/ocr/configs/tika-config-restricted-gdal.json
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/resources/org/apache/tika/parser/ocr/configs/tika-config-restricted-gdal.json
@@ -1,10 +1,5 @@
{
"parsers": [
- {
- "default-parser": {
- "_exclude": ["gdal-parser"]
- }
- },
{
"gdal-parser": {
"_mime-exclude": ["image/jpeg", "image/png", "image/jp2", "image/gif"]
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
index 00c67e9ebe..3474b85822 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
@@ -1,10 +1,5 @@
{
"parsers": [
- {
- "default-parser": {
- "_exclude": ["tesseract-ocr-parser"]
- }
- },
{
"tesseract-ocr-parser": {
"tesseractPath": "C:\\Program Files\\Tesseract OCR",
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index c6e7a30af8..1257c48e4c 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -17,7 +17,6 @@
package org.apache.tika.async.cli;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -52,30 +51,59 @@ public class PluginsWriter {
}
}
try {
-            String jsonTemplate = new String(getClass().getResourceAsStream("/config-template.json").readAllBytes(), StandardCharsets.UTF_8);
-            String json = jsonTemplate.replace("FETCHER_BASE_PATH", baseInput.toAbsolutePath().toString());
-            json = json.replace("EMITTER_BASE_PATH", baseOutput.toAbsolutePath().toString());
-            String pluginString = StringUtils.isBlank(simpleAsyncConfig.getPluginsDir()) ? "plugins" : simpleAsyncConfig.getPluginsDir();
+ ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper();
+ ObjectNode root = (ObjectNode) objectMapper.readTree(
+ getClass().getResourceAsStream("/config-template.json"));
+
+ // Set fetcher basePath
+ ObjectNode fetchers = (ObjectNode) root.get("fetchers");
+ if (fetchers != null && fetchers.has("fsf")) {
+ ObjectNode fsf = (ObjectNode) fetchers.get("fsf");
+ if (fsf != null && fsf.has("file-system-fetcher")) {
+                    ObjectNode fsFetcher = (ObjectNode) fsf.get("file-system-fetcher");
+                    fsFetcher.put("basePath", baseInput.toAbsolutePath().toString());
+ }
+ }
+
+ // Set emitter basePath
+ ObjectNode emitters = (ObjectNode) root.get("emitters");
+ if (emitters != null && emitters.has("fse")) {
+ ObjectNode fse = (ObjectNode) emitters.get("fse");
+ if (fse != null && fse.has("file-system-emitter")) {
+                    ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter");
+                    fsEmitter.put("basePath", baseOutput.toAbsolutePath().toString());
+ }
+ }
+
+ // Set pipes-iterator basePath
+ ObjectNode pipesIterator = (ObjectNode) root.get("pipes-iterator");
+            if (pipesIterator != null && pipesIterator.has("file-system-pipes-iterator")) {
+                ObjectNode fsIterator = (ObjectNode) pipesIterator.get("file-system-pipes-iterator");
+                fsIterator.put("basePath", baseInput.toAbsolutePath().toString());
+ }
+
+ // Set plugin-roots
+            String pluginString = StringUtils.isBlank(simpleAsyncConfig.getPluginsDir()) ?
+ "plugins" : simpleAsyncConfig.getPluginsDir();
Path plugins = Paths.get(pluginString);
if (Files.isDirectory(plugins)) {
pluginString = plugins.toAbsolutePath().toString();
}
-            json = json.replace("PLUGIN_ROOTS", pluginString).replace("\\", "/");
-            PipesConfig pipesConfig = new PipesConfig();
-
-            pipesConfig.setNumClients(simpleAsyncConfig.getNumClients() == null ? 2 : simpleAsyncConfig.getNumClients());
+            root.put("plugin-roots", pluginString);
+            // Set pipes config
+            PipesConfig pipesConfig = new PipesConfig();
+            pipesConfig.setNumClients(simpleAsyncConfig.getNumClients() == null ?
+                    2 : simpleAsyncConfig.getNumClients());
            if (simpleAsyncConfig.getXmx() != null) {
                pipesConfig.setForkedJvmArgs(new ArrayList<>(List.of(simpleAsyncConfig.getXmx())));
}
if (simpleAsyncConfig.getTimeoutMs() != null) {
pipesConfig.setTimeoutMillis(simpleAsyncConfig.getTimeoutMs());
}
- ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper();
-            ObjectNode root = (ObjectNode) objectMapper.readTree(json.getBytes(StandardCharsets.UTF_8));
            root.set("pipes", objectMapper.valueToTree(pipesConfig));
-            Files.writeString(output, root.toString());
+            objectMapper.writerWithDefaultPrettyPrinter().writeValue(output.toFile(), root);
} catch (Exception e) {
throw new IOException(e);
}
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 72531fcc66..4687845389 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -25,6 +25,9 @@ import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeoutException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
@@ -115,6 +118,13 @@ public class TikaAsyncCLI {
tikaConfig = tmpTikaConfig;
            PluginsWriter pluginsWriter = new PluginsWriter(simpleAsyncConfig, tikaConfig);
pluginsWriter.write(tikaConfig);
+ } else {
+ // User provided a config - ensure plugin-roots is set
+                tikaConfig = ensurePluginRoots(tikaConfig, simpleAsyncConfig.getPluginsDir());
+                if (!tikaConfig.equals(Paths.get(simpleAsyncConfig.getTikaConfig()))) {
+ // A new merged config was created, mark for cleanup
+ tmpTikaConfig = tikaConfig;
+ }
}
pipesIterator = buildPipesIterator(tikaConfig, simpleAsyncConfig);
@@ -260,10 +270,13 @@ public class TikaAsyncCLI {
            throw new TikaConfigException("Input file/dir must exist: " + inputPath);
}
inputDir = inString;
- if (Files.isRegularFile(inputPath)) {
- outputDir = Paths.get(".").toAbsolutePath().toString();
- } else {
- outputDir = Paths.get("output").toAbsolutePath().toString();
+ // Only set default outputDir if not already specified via -o
+ if (outputDir == null) {
+ if (Files.isRegularFile(inputPath)) {
+ outputDir = Paths.get(".").toAbsolutePath().toString();
+ } else {
+                    outputDir = Paths.get("output").toAbsolutePath().toString();
+ }
}
}
@@ -368,6 +381,43 @@ public class TikaAsyncCLI {
parseContext.set(UnpackConfig.class, config);
}
+ private static final String DEFAULT_PLUGINS_DIR = "plugins";
+
+ /**
+     * Ensures plugin-roots is set in the config. If missing, creates a merged config
+     * with a default plugin-roots value.
+     *
+     * @param originalConfigPath the user's config file path
+     * @param pluginsDir optional plugins directory from command line (may be null)
+     * @return the config path to use (original if plugin-roots exists, or a new merged config)
+     */
+    static Path ensurePluginRoots(Path originalConfigPath, String pluginsDir) throws IOException {
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode rootNode = mapper.readTree(originalConfigPath.toFile());
+
+ if (rootNode.has("plugin-roots")) {
+ // plugin-roots already set, use original config
+ return originalConfigPath;
+ }
+
+ // Need to add plugin-roots
+ ObjectNode mutableRoot = (ObjectNode) rootNode;
+        String pluginString = StringUtils.isBlank(pluginsDir) ? DEFAULT_PLUGINS_DIR : pluginsDir;
+ Path plugins = Paths.get(pluginString);
+ if (Files.isDirectory(plugins)) {
+ pluginString = plugins.toAbsolutePath().toString();
+ }
+ mutableRoot.put("plugin-roots", pluginString);
+
+ // Write merged config to temp file
+        Path mergedConfig = Files.createTempFile("tika-async-merged-config-", ".json");
+        mapper.writerWithDefaultPrettyPrinter().writeValue(mergedConfig.toFile(), mutableRoot);
+ mergedConfig.toFile().deleteOnExit();
+
+ LOG.info("Added default plugin-roots to config: {}", pluginString);
+ return mergedConfig;
+ }
+
private static void usage(Options options) throws IOException {
System.out.println("Two primary options:");
        System.out.println("\t1. Specify a tika-config.xml on the commandline that includes the definitions for async");
diff --git a/tika-pipes/tika-async-cli/src/main/resources/config-template.json b/tika-pipes/tika-async-cli/src/main/resources/config-template.json
index ee1efd49dc..15cd90b19f 100644
--- a/tika-pipes/tika-async-cli/src/main/resources/config-template.json
+++ b/tika-pipes/tika-async-cli/src/main/resources/config-template.json
@@ -53,10 +53,7 @@
"basePath": "FETCHER_BASE_PATH",
"countTotal": true,
"fetcherId": "fsf",
- "emitterId": "fse",
- "onParseException": "EMIT",
- "maxWaitMs": 600000,
- "queueSize": 10000
+ "emitterId": "fse"
}
},
"pipes": {
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index 88f8371bdc..ef446d2fd7 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -17,9 +17,17 @@
package org.apache.tika.async.cli;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -83,4 +91,86 @@ public class AsyncCliParserTest {
}
//TODO -- test for file list with and without inputDir
+
+ @TempDir
+ Path tempDir;
+
+ @Test
+ public void testEnsurePluginRootsAddsDefault() throws Exception {
+ // Create a config without plugin-roots
+ Path configPath = tempDir.resolve("config-no-plugins.json");
+ Files.writeString(configPath, """
+ {
+ "pipes": {
+ "numClients": 2
+ }
+ }
+ """);
+
+ // ensurePluginRoots should create a new config with plugin-roots added
+ Path result = TikaAsyncCLI.ensurePluginRoots(configPath, null);
+
+ // Should return a different path (merged config)
+        assertFalse(result.equals(configPath), "Should create a new merged config");
+
+ // The merged config should have plugin-roots
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode root = mapper.readTree(result.toFile());
+        assertTrue(root.has("plugin-roots"), "Merged config should have plugin-roots");
+ assertEquals("plugins", root.get("plugin-roots").asText());
+
+ // Original config values should be preserved
+ assertTrue(root.has("pipes"));
+ assertEquals(2, root.get("pipes").get("numClients").asInt());
+
+ // Clean up
+ Files.deleteIfExists(result);
+ }
+
+ @Test
+ public void testEnsurePluginRootsPreservesExisting() throws Exception {
+ // Create a config with plugin-roots already set
+ Path configPath = tempDir.resolve("config-with-plugins.json");
+ Files.writeString(configPath, """
+ {
+ "plugin-roots": "/custom/plugins",
+ "pipes": {
+ "numClients": 4
+ }
+ }
+ """);
+
+        // ensurePluginRoots should return the original path (no merging needed)
+ Path result = TikaAsyncCLI.ensurePluginRoots(configPath, null);
+
+ // Should return the same path
+        assertEquals(configPath, result, "Should return original config when plugin-roots exists");
+ }
+
+ @Test
+ public void testEnsurePluginRootsUsesCommandLineOption() throws Exception {
+ // Create a config without plugin-roots
+ Path configPath = tempDir.resolve("config-no-plugins2.json");
+ Files.writeString(configPath, """
+ {
+ "pipes": {
+ "numClients": 2
+ }
+ }
+ """);
+
+ // ensurePluginRoots with a custom plugins dir
+        Path result = TikaAsyncCLI.ensurePluginRoots(configPath, "/my/custom/plugins");
+
+ // Should create a merged config with the custom plugins dir
+ assertFalse(result.equals(configPath));
+
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode root = mapper.readTree(result.toFile());
+ assertTrue(root.has("plugin-roots"));
+ assertEquals("/my/custom/plugins", root.get("plugin-roots").asText());
+
+ // Clean up
+ Files.deleteIfExists(result);
+ }
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index d28cbb96c8..fdc8883f3b 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -27,7 +27,6 @@ import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
-import java.util.Locale;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
@@ -444,6 +443,18 @@ public class TikaServerProcess {
return endpoints.contains("tika") || endpoints.contains("rmeta");
}
+ /**
+     * Determines if the /unpack endpoint is enabled based on configured endpoints.
+     */
+    private static boolean isUnpackEndpointEnabled(TikaServerConfig tikaServerConfig) {
+        List<String> endpoints = tikaServerConfig.getEndpoints();
+        // If no endpoints specified, all default endpoints are loaded (including unpack)
+ if (endpoints == null || endpoints.isEmpty()) {
+ return true;
+ }
+ return endpoints.contains("unpack");
+ }
+
/**
     * Initializes the PipesParsingHelper for pipes-based parsing with process isolation.
* <p>
@@ -452,22 +463,42 @@ public class TikaServerProcess {
* <p>
     * If no config file is provided, a minimal default configuration will be created.
     * The plugin-roots will default to a "plugins" directory at the same level as the server jar.
+ * <p>
+     * A dedicated temp directory is created for input files, and a file-system-fetcher
+     * is configured with basePath pointing to that directory. This ensures child processes
+     * can only access files in the designated temp directory (security boundary).
*
* @param tikaServerConfig the server configuration
* @return the PipesParsingHelper
* @throws Exception if pipes initialization fails
*/
    private static PipesParsingHelper initPipesParsingHelper(TikaServerConfig tikaServerConfig) throws Exception {
-        // Load or create config
+        // Create dedicated temp directory for input files
+        Path inputTempDirectory = Files.createTempDirectory("tika-server-input-");
+ LOG.info("Created input temp directory: {}", inputTempDirectory);
+
+ // Only create unpack temp directory if /unpack endpoint is enabled
+ Path unpackTempDirectory = null;
+ if (isUnpackEndpointEnabled(tikaServerConfig)) {
+            unpackTempDirectory = Files.createTempDirectory("tika-server-unpack-");
+ LOG.info("Created unpack temp directory: {}", unpackTempDirectory);
+ }
+
+        // Load or create config, adding the fetcher (and emitter if unpack is enabled)
Path configPath;
if (tikaServerConfig.hasConfigFile()) {
configPath = tikaServerConfig.getConfigPath();
} else {
- configPath = createDefaultConfig();
+            configPath = createDefaultConfig(inputTempDirectory, unpackTempDirectory);
}
TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath);
+        // Ensure fetcher (and emitter if unpack is enabled) are configured with correct basePaths
+ configPath = ensureServerComponents(configPath, tikaJsonConfig,
+ inputTempDirectory, unpackTempDirectory);
+ tikaJsonConfig = TikaJsonConfig.load(configPath);
+
// Load or create PipesConfig with defaults
        PipesConfig pipesConfig = tikaJsonConfig.deserialize("pipes", PipesConfig.class);
if (pipesConfig == null) {
@@ -480,13 +511,13 @@ public class TikaServerProcess {
// Create PipesParser
        PipesParser pipesParser = PipesParser.load(tikaJsonConfig, pipesConfig, configPath);
- // Try to determine unpack emitter basePath from config
- Path unpackEmitterBasePath = getUnpackEmitterBasePath(tikaJsonConfig);
-
// Create and return the helper
-        PipesParsingHelper helper = new PipesParsingHelper(pipesParser, pipesConfig, unpackEmitterBasePath);
+        PipesParsingHelper helper = new PipesParsingHelper(pipesParser, pipesConfig,
+                inputTempDirectory, unpackTempDirectory);
- // Register shutdown hook to clean up PipesParser
+ // Register shutdown hook to clean up PipesParser and temp directories
+ final Path inputDirToClean = inputTempDirectory;
+ final Path unpackDirToClean = unpackTempDirectory;
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
try {
LOG.info("Shutting down PipesParser");
@@ -494,62 +525,32 @@ public class TikaServerProcess {
} catch (Exception e) {
LOG.warn("Error closing PipesParser", e);
}
+ // Clean up temp directories
+ cleanupTempDirectory(inputDirToClean);
+ if (unpackDirToClean != null) {
+ cleanupTempDirectory(unpackDirToClean);
+ }
}));
return helper;
}
- /**
-     * Attempts to determine the basePath for the unpack-emitter from the config.
-     * Returns null if the emitter is not configured or basePath cannot be determined.
-     */
-    private static Path getUnpackEmitterBasePath(TikaJsonConfig tikaJsonConfig) {
+ private static void cleanupTempDirectory(Path tempDir) {
try {
-            java.util.Map<String, com.fasterxml.jackson.databind.JsonNode> emitters =
-                    tikaJsonConfig.getComponents("emitters");
-            if (emitters == null || !emitters.containsKey(PipesParsingHelper.UNPACK_EMITTER_ID)) {
-                LOG.debug("No unpack-emitter configured, UNPACK mode will not be available");
- return null;
- }
-
- com.fasterxml.jackson.databind.JsonNode emitterConfig =
- emitters.get(PipesParsingHelper.UNPACK_EMITTER_ID);
-            com.fasterxml.jackson.databind.JsonNode basePath = findBasePath(emitterConfig);
- if (basePath != null && basePath.isTextual()) {
- Path path = Path.of(basePath.asText());
- if (Files.isDirectory(path)) {
- LOG.info("UNPACK mode enabled with basePath: {}", path);
- return path;
- } else {
-                    LOG.warn("unpack-emitter basePath does not exist: {}", path);
- }
+ if (Files.exists(tempDir)) {
+ Files.walk(tempDir)
+                        .sorted((a, b) -> -a.compareTo(b)) // Delete files before directories
+ .forEach(p -> {
+ try {
+ Files.deleteIfExists(p);
+ } catch (IOException e) {
+ LOG.warn("Failed to delete: {}", p);
+ }
+ });
}
- } catch (Exception e) {
- LOG.warn("Failed to determine unpack-emitter basePath", e);
+ } catch (IOException e) {
+ LOG.warn("Error cleaning up temp directory: {}", tempDir, e);
}
- return null;
- }
-
- /**
- * Recursively searches for "basePath" in a JSON node.
- */
- private static com.fasterxml.jackson.databind.JsonNode findBasePath(
- com.fasterxml.jackson.databind.JsonNode node) {
- if (node == null) {
- return null;
- }
- if (node.has("basePath")) {
- return node.get("basePath");
- }
- for (com.fasterxml.jackson.databind.JsonNode child : node) {
- if (child.isObject()) {
-                com.fasterxml.jackson.databind.JsonNode result = findBasePath(child);
- if (result != null) {
- return result;
- }
- }
- }
- return null;
}
/**
@@ -559,36 +560,150 @@ public class TikaServerProcess {
/**
     * Creates a default configuration file with plugin-roots set to the "plugins" directory
-     * relative to the current working directory.
+     * relative to the current working directory, the tika-server-fetcher configured
+     * with basePath pointing to the input temp directory, and optionally the unpack-emitter
+     * configured with basePath pointing to the unpack temp directory.
+     *
+     * @param inputTempDirectory the temp directory for input files
+     * @param unpackTempDirectory the temp directory for unpack output files (may be null)
+     */
-    private static Path createDefaultConfig() throws IOException {
+    private static Path createDefaultConfig(Path inputTempDirectory,
+                                            Path unpackTempDirectory) throws IOException {
Path pluginsDir = Path.of(DEFAULT_PLUGINS_DIR).toAbsolutePath();
- String configJson = String.format(Locale.ROOT, """
- {
- "fetchers": {
- "file-system-fetcher": {
- "file-system-fetcher": {
- "allowAbsolutePaths": true
- }
- }
- },
- "pipes": {
- "numClients": 4,
- "timeoutMillis": 60000
- },
- "plugin-roots": "%s"
- }
- """, pluginsDir.toString().replace("\\", "/"));
+ com.fasterxml.jackson.databind.ObjectMapper mapper =
+ new com.fasterxml.jackson.databind.ObjectMapper();
+ com.fasterxml.jackson.databind.node.ObjectNode rootNode =
mapper.createObjectNode();
+
+ // Create fetchers section
+ com.fasterxml.jackson.databind.node.ObjectNode fetchersNode =
mapper.createObjectNode();
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherNode =
mapper.createObjectNode();
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig =
mapper.createObjectNode();
+ fetcherTypeConfig.put("basePath",
inputTempDirectory.toAbsolutePath().toString());
+ fetcherNode.set("file-system-fetcher", fetcherTypeConfig);
+ fetchersNode.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode);
+ rootNode.set("fetchers", fetchersNode);
+
+ // Create emitters section if unpack is enabled
+ if (unpackTempDirectory != null) {
+ com.fasterxml.jackson.databind.node.ObjectNode emittersNode =
mapper.createObjectNode();
+ com.fasterxml.jackson.databind.node.ObjectNode emitterNode =
mapper.createObjectNode();
+ com.fasterxml.jackson.databind.node.ObjectNode emitterTypeConfig =
mapper.createObjectNode();
+ emitterTypeConfig.put("basePath",
unpackTempDirectory.toAbsolutePath().toString());
+ emitterTypeConfig.put("onExists", "REPLACE");
+ emitterNode.set("file-system-emitter", emitterTypeConfig);
+ emittersNode.set(PipesParsingHelper.UNPACK_EMITTER_ID,
emitterNode);
+ rootNode.set("emitters", emittersNode);
+ }
+
+ // Create pipes section
+ com.fasterxml.jackson.databind.node.ObjectNode pipesNode =
mapper.createObjectNode();
+ pipesNode.put("numClients", 4);
+ pipesNode.put("timeoutMillis", 60000);
+ rootNode.set("pipes", pipesNode);
+
+ // Set plugin-roots
+ rootNode.put("plugin-roots", pluginsDir.toString());
Path tempConfig = Files.createTempFile("tika-server-default-config-",
".json");
- Files.writeString(tempConfig, configJson);
+        mapper.writerWithDefaultPrettyPrinter().writeValue(tempConfig.toFile(), rootNode);
tempConfig.toFile().deleteOnExit();
LOG.info("Created default config with plugin-roots: {}", pluginsDir);
return tempConfig;
}
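
For orientation, the default config written here comes out roughly as follows (paths are illustrative; the emitters block appears only when an unpack temp directory was created):

{
  "fetchers" : {
    "tika-server-fetcher" : {
      "file-system-fetcher" : {
        "basePath" : "/tmp/tika-server-input-1234567890"
      }
    }
  },
  "emitters" : {
    "unpack-emitter" : {
      "file-system-emitter" : {
        "basePath" : "/tmp/tika-server-unpack-1234567890",
        "onExists" : "REPLACE"
      }
    }
  },
  "pipes" : {
    "numClients" : 4,
    "timeoutMillis" : 60000
  },
  "plugin-roots" : "/current/working/dir/plugins"
}
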
+ /**
+ * Ensures the tika-server-fetcher exists in the config with basePath
pointing to
+ * the input temp directory. If unpackTempDirectory is provided, also
ensures the
+ * unpack-emitter exists.
+ * <p>
+ * The fetcher is used by legacy endpoints (/tika, /rmeta, etc.) to read
uploaded files
+ * that have been spooled to the input temp directory.
+ * <p>
+ * The emitter is used by /unpack endpoints to write unpacked files that
are then
+ * streamed back to the client.
+ * <p>
+ * Both components are configured with basePath (not allowAbsolutePaths)
so child processes
+ * can only access files within their designated temp directories
(security boundary).
+ *
+ * @param originalConfigPath the original config file path
+ * @param tikaJsonConfig the parsed Tika JSON config
+ * @param inputTempDirectory the temp directory for input files
+ * @param unpackTempDirectory the temp directory for unpack output files
(may be null)
+ * @return the config path to use (always a new merged config with fetcher
and optionally emitter)
+ */
+ private static Path ensureServerComponents(Path originalConfigPath,
TikaJsonConfig tikaJsonConfig,
+ Path inputTempDirectory,
+ Path unpackTempDirectory)
throws IOException {
+ LOG.info("Configuring {} with basePath={}",
PipesParsingHelper.DEFAULT_FETCHER_ID, inputTempDirectory);
+
+ // Read original config as a mutable tree
+ com.fasterxml.jackson.databind.ObjectMapper mapper =
+ new com.fasterxml.jackson.databind.ObjectMapper();
+ com.fasterxml.jackson.databind.node.ObjectNode rootNode =
+ (com.fasterxml.jackson.databind.node.ObjectNode)
mapper.readTree(originalConfigPath.toFile());
+
+ // Get or create the fetchers section
+ com.fasterxml.jackson.databind.node.ObjectNode fetchersNode;
+ if (rootNode.has("fetchers") && rootNode.get("fetchers").isObject()) {
+ fetchersNode = (com.fasterxml.jackson.databind.node.ObjectNode)
rootNode.get("fetchers");
+ } else {
+ fetchersNode = mapper.createObjectNode();
+ rootNode.set("fetchers", fetchersNode);
+ }
+
+ // Create the fetcher config with basePath
+ // Structure: "tika-server-fetcher": { "file-system-fetcher": {
"basePath": "/tmp/..." } }
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig =
mapper.createObjectNode();
+ fetcherTypeConfig.put("basePath",
inputTempDirectory.toAbsolutePath().toString());
+
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherNode =
mapper.createObjectNode();
+ fetcherNode.set("file-system-fetcher", fetcherTypeConfig);
+
+ fetchersNode.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode);
+
+ // Only add unpack-emitter if unpack endpoint is enabled
+ if (unpackTempDirectory != null) {
+ LOG.info("Configuring {} with basePath={}",
PipesParsingHelper.UNPACK_EMITTER_ID, unpackTempDirectory);
+
+ // Get or create the emitters section
+ com.fasterxml.jackson.databind.node.ObjectNode emittersNode;
+ if (rootNode.has("emitters") &&
rootNode.get("emitters").isObject()) {
+ emittersNode =
(com.fasterxml.jackson.databind.node.ObjectNode) rootNode.get("emitters");
+ } else {
+ emittersNode = mapper.createObjectNode();
+ rootNode.set("emitters", emittersNode);
+ }
+
+ // Create the emitter config with basePath
+ // Structure: "unpack-emitter": { "file-system-emitter": {
"basePath": "/tmp/...", "onExists": "REPLACE" } }
+ com.fasterxml.jackson.databind.node.ObjectNode emitterTypeConfig =
mapper.createObjectNode();
+ emitterTypeConfig.put("basePath",
unpackTempDirectory.toAbsolutePath().toString());
+ emitterTypeConfig.put("onExists", "REPLACE");
+
+ com.fasterxml.jackson.databind.node.ObjectNode emitterNode =
mapper.createObjectNode();
+ emitterNode.set("file-system-emitter", emitterTypeConfig);
+
+ emittersNode.set(PipesParsingHelper.UNPACK_EMITTER_ID,
emitterNode);
+ }
+
+ // Ensure plugin-roots is set (required for child processes)
+ if (!rootNode.has("plugin-roots")) {
+ Path pluginsDir = Path.of(DEFAULT_PLUGINS_DIR).toAbsolutePath();
+ rootNode.put("plugin-roots", pluginsDir.toString());
+ LOG.info("Added default plugin-roots: {}", pluginsDir);
+ }
+
+ // Write merged config to temp file
+ Path mergedConfig = Files.createTempFile("tika-server-merged-config-",
".json");
+        mapper.writerWithDefaultPrettyPrinter().writeValue(mergedConfig.toFile(), rootNode);
+ mergedConfig.toFile().deleteOnExit();
+
+ LOG.debug("Created merged config: {}", mergedConfig);
+ return mergedConfig;
+ }
+
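
The merge performed here is additive: anything the user already declared under "fetchers" or "emitters" is preserved, and the server simply layers in its own entries under the fixed IDs. For example, after merging, the fetchers section of the temp config contains the user's entries plus (path illustrative):

  "tika-server-fetcher" : {
    "file-system-fetcher" : { "basePath" : "/tmp/tika-server-input-1234567890" }
  }

with the equivalent unpack-emitter entry added under "emitters" when unpack is enabled.
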
private static class ServerDetails {
JAXRSServerFactoryBean sf;
String serverId;
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
index c88a1ec799..6b1a6fe699 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
@@ -50,18 +49,9 @@ import org.apache.tika.server.core.TikaServerParseException;
* Helper class for pipes-based parsing in tika-server endpoints.
* Handles temp file management, FetchEmitTuple creation, and result
processing.
* <p>
- * To use pipes-based parsing, your tika-config.json must include a
file-system fetcher
- * with allowAbsolutePaths enabled:
- * <pre>
- * {
- * "fetchers": {
- * "file-system-fetcher": {
- * "class": "org.apache.tika.pipes.fetcher.fs.FileSystemFetcher",
- * "allowAbsolutePaths": true
- * }
- * }
- * }
- * </pre>
+ * The helper manages a dedicated temp directory for input files. A
file-system-fetcher
+ * is configured with basePath pointing to this directory, ensuring child
processes
+ * can only access files within the designated temp directory (no absolute
paths).
*/
public class PipesParsingHelper {
@@ -69,9 +59,9 @@ public class PipesParsingHelper {
/**
* The fetcher ID used for reading temp files.
- * This fetcher must be configured in the JSON config with
allowAbsolutePaths=true.
+ * This fetcher is configured with basePath = inputTempDirectory.
*/
- public static final String DEFAULT_FETCHER_ID = "file-system-fetcher";
+ public static final String DEFAULT_FETCHER_ID = "tika-server-fetcher";
private final PipesParser pipesParser;
private final PipesConfig pipesConfig;
@@ -83,33 +73,42 @@ public class PipesParsingHelper {
*
* @param pipesParser the PipesParser instance
* @param pipesConfig the PipesConfig instance
+ * @param inputTempDirectory the temp directory for input files. The
file-system-fetcher
+ * is configured with basePath = this directory.
* @param unpackEmitterBasePath the basePath where the unpack-emitter
writes files.
* This is where the server will find the zip
files created
* by UNPACK mode. May be null if UNPACK mode
won't be used.
*/
- public PipesParsingHelper(PipesParser pipesParser, PipesConfig
pipesConfig, Path unpackEmitterBasePath) {
+ public PipesParsingHelper(PipesParser pipesParser, PipesConfig pipesConfig,
+ Path inputTempDirectory, Path
unpackEmitterBasePath) {
this.pipesParser = pipesParser;
this.pipesConfig = pipesConfig;
+ this.inputTempDirectory = inputTempDirectory;
this.unpackEmitterBasePath = unpackEmitterBasePath;
- // Determine input temp directory
- String configTempDir = pipesConfig.getTempDirectory();
- if (configTempDir != null && !configTempDir.isBlank()) {
- this.inputTempDirectory = Paths.get(configTempDir);
- if (!Files.isDirectory(this.inputTempDirectory)) {
- throw new IllegalArgumentException(
- "Configured tempDirectory does not exist or is not a
directory: " + configTempDir);
- }
- } else {
- this.inputTempDirectory = null; // Use system default
+ if (inputTempDirectory == null ||
!Files.isDirectory(inputTempDirectory)) {
+ throw new IllegalArgumentException(
+ "inputTempDirectory must be a valid directory: " +
inputTempDirectory);
}
+ LOG.info("PipesParsingHelper initialized with inputTempDirectory: {}",
inputTempDirectory);
+ }
+
+ /**
+ * Gets the input temp directory path.
+ * @return the input temp directory
+ */
+ public Path getInputTempDirectory() {
+ return inputTempDirectory;
}
/**
* Parses content using pipes-based parsing with process isolation.
* <p>
- * The TikaInputStream should already be spooled to a temp file via {@link
TikaInputStream#getPath()}.
- * The caller is responsible for closing the TikaInputStream, which will
clean up any temp files.
+ * This method spools the input to the dedicated temp directory and uses a
relative
+ * filename in the FetchKey. The file-system-fetcher is configured with
basePath
+ * pointing to this directory, so the child process can only access files
there.
+ * <p>
+ * The caller is responsible for closing the TikaInputStream.
*
* @param tis the TikaInputStream containing the content to parse
* @param metadata metadata to pass to the parser (may include filename,
content-type, etc.)
@@ -122,17 +121,22 @@ public class PipesParsingHelper {
public List<Metadata> parse(TikaInputStream tis, Metadata metadata,
ParseContext parseContext, ParseMode
parseMode) throws IOException {
String requestId = UUID.randomUUID().toString();
+ Path tempFile = null;
try {
- // Get the backing file path from the spooled TikaInputStream
- Path inputFile = tis.getPath();
- LOG.debug("parse: using file {} ({} bytes)", inputFile,
Files.size(inputFile));
+ // Spool input to our dedicated temp directory with proper suffix
+ String suffix = getSuffix(metadata);
+ tempFile = Files.createTempFile(inputTempDirectory, "tika-",
suffix);
+ Files.copy(tis, tempFile,
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+
+ String relativeName = tempFile.getFileName().toString();
+ LOG.debug("parse: spooled to {} ({} bytes)", relativeName,
Files.size(tempFile));
// Set parse mode in context
parseContext.set(ParseMode.class, parseMode);
- // Create FetchEmitTuple - use NO_EMIT since we're using
PASSBACK_ALL
- FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID,
inputFile.toAbsolutePath().toString());
+ // Create FetchEmitTuple with relative filename (basePath is
configured in fetcher)
+ FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, relativeName);
FetchEmitTuple tuple = new FetchEmitTuple(
requestId,
@@ -153,9 +157,33 @@ public class PipesParsingHelper {
throw new TikaServerParseException("Parsing interrupted");
} catch (PipesException e) {
throw new TikaServerParseException(e);
+ } finally {
+ // Clean up temp file
+ if (tempFile != null) {
+ try {
+ Files.deleteIfExists(tempFile);
+ } catch (IOException e) {
+ LOG.warn("Failed to delete temp file: {}", tempFile, e);
+ }
+ }
}
}
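
As a usage sketch, the caller hands the helper an open TikaInputStream and remains responsible for closing it; spooling into inputTempDirectory and deleting the spooled copy happen inside parse(). The endpoint glue below is illustrative and not part of this change; the types are the same Tika types used in the method above:

    // Hypothetical endpoint helper; "pipesParsingHelper" is assumed to be the injected helper instance.
    List<Metadata> parseViaPipes(InputStream body, Metadata metadata, ParseMode mode) throws IOException {
        try (TikaInputStream tis = TikaInputStream.get(body)) {
            // The caller only closes the stream; the helper spools and cleans up its own temp file.
            return pipesParsingHelper.parse(tis, metadata, new ParseContext(), mode);
        }
    }
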
+ /**
+     * Extracts the file suffix from the resource name in the metadata, defaulting to ".tmp".
+ */
+ private String getSuffix(Metadata metadata) {
+ String resourceName =
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (resourceName != null) {
+ int lastDot = resourceName.lastIndexOf('.');
+ if (lastDot > 0 && lastDot < resourceName.length() - 1) {
+ return resourceName.substring(lastDot);
+ }
+ }
+ // Default suffix
+ return ".tmp";
+ }
+
/**
* Processes the PipesResult and returns the metadata list.
*/
@@ -260,10 +288,11 @@ public class PipesParsingHelper {
* extracted embedded documents.
* <p>
* This method:
- * 1. Configures UnpackConfig with zipEmbeddedFiles=true
- * 2. The pipes child process extracts embedded files and creates a zip
- * 3. The zip is emitted to the configured file-system emitter
- * 4. Returns the path to the zip file for streaming
+ * 1. Spools input to the dedicated temp directory
+ * 2. Configures UnpackConfig with zipEmbeddedFiles=true
+ * 3. The pipes child process extracts embedded files and creates a zip
+ * 4. The zip is emitted to the configured file-system emitter
+ * 5. Returns the path to the zip file for streaming
* <p>
* The caller is responsible for deleting the zip file after streaming.
*
@@ -277,42 +306,47 @@ public class PipesParsingHelper {
public UnpackResult parseUnpack(TikaInputStream tis, Metadata metadata,
ParseContext parseContext, boolean
saveAll) throws IOException {
String requestId = UUID.randomUUID().toString();
+ Path tempFile = null;
- // Get the backing file path from the spooled TikaInputStream
- Path inputFile = tis.getPath();
- LOG.debug("parseUnpack: using file {} ({} bytes), requestId={}",
- inputFile, Files.size(inputFile), requestId);
-
- // Set parse mode to UNPACK
- parseContext.set(ParseMode.class, ParseMode.UNPACK);
-
- // Configure UnpackConfig - use existing or create new
- UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);
- if (unpackConfig == null) {
- unpackConfig = new UnpackConfig();
- }
+ try {
+ // Spool input to our dedicated temp directory with proper suffix
+ String suffix = getSuffix(metadata);
+ tempFile = Files.createTempFile(inputTempDirectory,
"tika-unpack-", suffix);
+ Files.copy(tis, tempFile,
java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+
+ String relativeName = tempFile.getFileName().toString();
+ LOG.debug("parseUnpack: spooled to {} ({} bytes), requestId={}",
+ relativeName, Files.size(tempFile), requestId);
+
+ // Set parse mode to UNPACK
+ parseContext.set(ParseMode.class, ParseMode.UNPACK);
+
+ // Configure UnpackConfig - use existing or create new
+ UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);
+ if (unpackConfig == null) {
+ unpackConfig = new UnpackConfig();
+ }
- // Enable zip creation in the child process
- unpackConfig.setZipEmbeddedFiles(true);
+ // Enable zip creation in the child process
+ unpackConfig.setZipEmbeddedFiles(true);
- // Set suffix strategy to DETECTED so files get their proper
extensions (e.g., .wav, .jpg)
- unpackConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED);
+            // Set suffix strategy to DETECTED so files get their proper extensions (e.g., .wav, .jpg)
+            unpackConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED);
- // Set emitter to our file-system emitter
- unpackConfig.setEmitter(UNPACK_EMITTER_ID);
+ // Set emitter to our file-system emitter
+ unpackConfig.setEmitter(UNPACK_EMITTER_ID);
- // Include original document if saveAll is requested
- if (saveAll) {
- unpackConfig.setIncludeOriginal(true);
- unpackConfig.setIncludeMetadataInZip(true);
- }
+ // Include original document if saveAll is requested
+ if (saveAll) {
+ unpackConfig.setIncludeOriginal(true);
+ unpackConfig.setIncludeMetadataInZip(true);
+ }
- parseContext.set(UnpackConfig.class, unpackConfig);
+ parseContext.set(UnpackConfig.class, unpackConfig);
- // Create FetchEmitTuple - the emitKey will be used to determine the
zip file location
- // The zip file will be written to: emitter.basePath + "/" + emitKey +
"-embedded.zip"
- FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID,
inputFile.toAbsolutePath().toString());
- EmitKey emitKey = new EmitKey(UNPACK_EMITTER_ID, requestId);
+ // Create FetchEmitTuple with relative filename (basePath is
configured in fetcher)
+ FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, relativeName);
+ EmitKey emitKey = new EmitKey(UNPACK_EMITTER_ID, requestId);
FetchEmitTuple tuple = new FetchEmitTuple(
requestId,
@@ -322,70 +356,80 @@ public class PipesParsingHelper {
parseContext
);
- // Execute parse via pipes
- PipesResult result;
- try {
- result = pipesParser.parse(tuple);
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw new TikaServerParseException("Parsing interrupted");
- } catch (PipesException e) {
- throw new TikaServerParseException(e);
- }
+ // Execute parse via pipes
+ PipesResult result;
+ try {
+ result = pipesParser.parse(tuple);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ throw new TikaServerParseException("Parsing interrupted");
+ } catch (PipesException e) {
+ throw new TikaServerParseException(e);
+ }
- // Check for errors
- if (result.isProcessCrash() || result.isFatal() ||
result.isInitializationFailure()) {
- LOG.warn("UNPACK parse failed: {} - {}", result.status(),
result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.status(),
- mapStatusToHttpResponse(result.status()));
- }
+ // Check for errors
+ if (result.isProcessCrash() || result.isFatal() ||
result.isInitializationFailure()) {
+ LOG.warn("UNPACK parse failed: {} - {}", result.status(),
result.message());
+ throw new WebApplicationException(
+ "Parse failed: " + result.status(),
+ mapStatusToHttpResponse(result.status()));
+ }
- if (result.isTaskException()) {
- LOG.warn("UNPACK task exception: {} - {}", result.status(),
result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.message(),
- Response.Status.INTERNAL_SERVER_ERROR);
- }
+ if (result.isTaskException()) {
+ LOG.warn("UNPACK task exception: {} - {}", result.status(),
result.message());
+ throw new WebApplicationException(
+ "Parse failed: " + result.message(),
+ Response.Status.INTERNAL_SERVER_ERROR);
+ }
- // Get metadata list from result
- List<Metadata> metadataList = Collections.emptyList();
- EmitData emitData = result.emitData();
- if (emitData != null && emitData.getMetadataList() != null) {
- metadataList = emitData.getMetadataList();
- }
+ // Get metadata list from result
+ List<Metadata> metadataList = Collections.emptyList();
+ EmitData emitData = result.emitData();
+ if (emitData != null && emitData.getMetadataList() != null) {
+ metadataList = emitData.getMetadataList();
+ }
- // Check for parse exceptions in the container document metadata
- // These should return appropriate HTTP status codes
- if (!metadataList.isEmpty()) {
- Metadata containerMetadata = metadataList.get(0);
- String containerException =
containerMetadata.get(TikaCoreProperties.CONTAINER_EXCEPTION);
- if (containerException != null) {
- // Map exception type to HTTP status
- // 422 (Unprocessable Entity) for parse-related exceptions
- int status = 422; // Default for parse exceptions
- if (containerException.contains("EncryptedDocumentException")
||
- containerException.contains("TikaException") ||
- containerException.contains("NullPointerException") ||
- containerException.contains("IllegalStateException")) {
- status = 422;
+ // Check for parse exceptions in the container document metadata
+ // These should return appropriate HTTP status codes
+ if (!metadataList.isEmpty()) {
+ Metadata containerMetadata = metadataList.get(0);
+ String containerException =
containerMetadata.get(TikaCoreProperties.CONTAINER_EXCEPTION);
+ if (containerException != null) {
+ // Map exception type to HTTP status
+ // 422 (Unprocessable Entity) for parse-related exceptions
+ int status = 422; // Default for parse exceptions
+                if (containerException.contains("EncryptedDocumentException") ||
+                        containerException.contains("TikaException") ||
+                        containerException.contains("NullPointerException") ||
+                        containerException.contains("IllegalStateException")) {
+ status = 422;
+ }
+ // Build response with exception string as body for stack
trace support
+ Response response = Response.status(status)
+ .entity(containerException)
+ .type("text/plain")
+ .build();
+ throw new WebApplicationException(response);
}
- // Build response with exception string as body for stack
trace support
- Response response = Response.status(status)
- .entity(containerException)
- .type("text/plain")
- .build();
- throw new WebApplicationException(response);
}
- }
- // Determine the zip file path
- // Regular format: emitter.basePath + "/" + emitKey + "-embedded.zip"
- // Frictionless format: emitter.basePath + "/" + emitKey +
"-frictionless.zip"
- boolean isFrictionless = unpackConfig.getOutputFormat() ==
UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS;
- Path zipFile = getEmittedZipPath(requestId, isFrictionless);
+ // Determine the zip file path
+ // Regular format: emitter.basePath + "/" + emitKey +
"-embedded.zip"
+ // Frictionless format: emitter.basePath + "/" + emitKey +
"-frictionless.zip"
+ boolean isFrictionless = unpackConfig.getOutputFormat() ==
UnpackConfig.OUTPUT_FORMAT.FRICTIONLESS;
+ Path zipFile = getEmittedZipPath(requestId, isFrictionless);
- return new UnpackResult(zipFile, metadataList);
+ return new UnpackResult(zipFile, metadataList);
+ } finally {
+ // Clean up temp file
+ if (tempFile != null) {
+ try {
+ Files.deleteIfExists(tempFile);
+ } catch (IOException e) {
+ LOG.warn("Failed to delete temp file: {}", tempFile, e);
+ }
+ }
+ }
}
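
Tying the naming convention together: for a given requestId, the server looks for the child's emitted zip at <unpack-emitter basePath>/<requestId>-embedded.zip, or <requestId>-frictionless.zip when the output format is FRICTIONLESS. A rough sketch of what getEmittedZipPath has to resolve (the helper's actual body is not shown in this hunk):

    // unpackEmitterBasePath is the field set in the constructor; requestId doubles as the emit key.
    Path zipFile = unpackEmitterBasePath.resolve(
            requestId + (isFrictionless ? "-frictionless.zip" : "-embedded.zip"));
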
/**
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index d11d21984d..9cbdb7a11d 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -196,7 +196,12 @@ public abstract class CXFTestBase {
this.tika = TikaLoader.load(tmp);
+ // Create input temp directory for pipes-based parsing
+ Path inputTempDirectory =
Files.createTempDirectory("tika-server-test-input-");
+
// Initialize PipesParsingHelper for pipes-based parsing
+ // Merge the fetcher config with basePath pointing to the temp
directory
+ this.pipesConfigPath = mergeFetcherConfig(this.pipesConfigPath,
inputTempDirectory);
TikaJsonConfig tikaJsonConfig =
TikaJsonConfig.load(this.pipesConfigPath);
PipesConfig pipesConfig = tikaJsonConfig.deserialize("pipes",
PipesConfig.class);
if (pipesConfig == null) {
@@ -204,7 +209,8 @@ public abstract class CXFTestBase {
}
pipesConfig.setEmitStrategy(new
EmitStrategyConfig(EmitStrategy.PASSBACK_ALL));
this.pipesParser = PipesParser.load(tikaJsonConfig, pipesConfig,
this.pipesConfigPath);
- PipesParsingHelper pipesParsingHelper = new
PipesParsingHelper(this.pipesParser, pipesConfig, getUnpackEmitterBasePath());
+ PipesParsingHelper pipesParsingHelper = new
PipesParsingHelper(this.pipesParser, pipesConfig,
+ inputTempDirectory, getUnpackEmitterBasePath());
TikaResource.init(tika, new ServerStatus(), pipesParsingHelper);
} finally {
@@ -259,6 +265,37 @@ public abstract class CXFTestBase {
return tempConfig;
}
+ /**
+ * Merges the tika-server-fetcher configuration into the pipes config.
+ * The fetcher is configured with basePath pointing to the input temp
directory.
+ */
+ private Path mergeFetcherConfig(Path configPath, Path inputTempDirectory)
throws IOException {
+ ObjectMapper mapper = new ObjectMapper();
+ com.fasterxml.jackson.databind.node.ObjectNode root =
+ (com.fasterxml.jackson.databind.node.ObjectNode)
mapper.readTree(configPath.toFile());
+
+ // Get or create fetchers section
+ com.fasterxml.jackson.databind.node.ObjectNode fetchers =
+ (com.fasterxml.jackson.databind.node.ObjectNode)
root.get("fetchers");
+ if (fetchers == null) {
+ fetchers = mapper.createObjectNode();
+ root.set("fetchers", fetchers);
+ }
+
+ // Create the tika-server-fetcher with basePath
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherTypeConfig =
mapper.createObjectNode();
+ fetcherTypeConfig.put("basePath",
inputTempDirectory.toAbsolutePath().toString());
+
+ com.fasterxml.jackson.databind.node.ObjectNode fetcherNode =
mapper.createObjectNode();
+ fetcherNode.set("file-system-fetcher", fetcherTypeConfig);
+
+ fetchers.set(PipesParsingHelper.DEFAULT_FETCHER_ID, fetcherNode);
+
+ Path tempConfig = Files.createTempFile("tika-server-pipes-fetcher-",
".json");
+        mapper.writerWithDefaultPrettyPrinter().writeValue(tempConfig.toFile(), root);
+ return tempConfig;
+ }
+
/**
* Creates a default test config with pipes configuration.
* If the tika config contains metadata-filters, they are merged into the
pipes config.