(tika) 01/01: TIKA-4746 -- sweep docs

tallison Mon, 01 Jun 2026 03:34:59 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4746
in repository https://gitbox.apache.org/repos/asf/tika.git


commit cde93bb66d3fd2e4737211b43ab015d3efef06de
Author: tallison <[email protected]>
AuthorDate: Mon Jun 1 06:34:36 2026 -0400

    TIKA-4746 -- sweep docs
---
 docs/modules/ROOT/examples/claude-vlm-basic.json   |  11 +-
 docs/modules/ROOT/examples/claude-vlm-full.json    |  19 +-
 docs/modules/ROOT/examples/gemini-vlm-basic.json   |  11 +-
 docs/modules/ROOT/examples/gemini-vlm-full.json    |  19 +-
 docs/modules/ROOT/examples/openai-vlm-basic.json   |  12 +-
 docs/modules/ROOT/examples/openai-vlm-full.json    |  19 +-
 docs/modules/ROOT/examples/tess4j-basic.json       |  11 +-
 docs/modules/ROOT/examples/tess4j-full.json        |  19 +-
 docs/modules/ROOT/examples/vlm-pdf-parsing.json    |  17 +-
 docs/modules/ROOT/nav.adoc                         |   1 +
 docs/modules/ROOT/pages/advanced/index.adoc        |  11 +
 .../integration-testing/run-uat-script.adoc        |   2 +-
 .../integration-testing/tika-eval-regression.adoc  |   2 +-
 .../ROOT/pages/advanced/language-detection.adoc    |  67 ++---
 .../ROOT/pages/advanced/setting-limits.adoc        |   6 +-
 docs/modules/ROOT/pages/configuration/index.adoc   |  51 +++-
 .../pages/configuration/parsers/tess4j-parser.adoc |   4 +
 .../pages/configuration/parsers/vlm-parsers.adoc   |  12 +
 .../ROOT/pages/developers/serialization.adoc       |  20 +-
 docs/modules/ROOT/pages/maintainers/index.adoc     |   1 +
 .../pages/migration-to-4x/migrating-to-4x.adoc     |  15 +
 .../pages/migration-to-4x/serialization-4x.adoc    |  42 +++
 docs/modules/ROOT/pages/pipes/configuration.adoc   |  58 +++-
 docs/modules/ROOT/pages/pipes/getting-started.adoc |   4 +-
 docs/modules/ROOT/pages/pipes/index.adoc           |  98 -------
 docs/modules/ROOT/pages/pipes/iterators.adoc       |   2 +-
 docs/modules/ROOT/pages/pipes/parse-modes.adoc     |  32 ++-
 .../ROOT/pages/pipes/plugins/filesystem.adoc       |  17 +-
 docs/modules/ROOT/pages/pipes/plugins/index.adoc   |   4 +-
 docs/modules/ROOT/pages/pipes/timeouts.adoc        |  34 ++-
 docs/modules/ROOT/pages/pipes/troubleshooting.adoc |  34 +++
 docs/modules/ROOT/pages/pipes/unpack-config.adoc   |  85 ++++--
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  | 317 +++++++++++++++++----
 docs/modules/ROOT/pages/using-tika/index.adoc      |   1 +
 .../ROOT/pages/using-tika/server/index.adoc        | 201 +++++++++----
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  99 ++++++-
 .../tika-parser-tess4j-module/pom.xml              |   6 +
 .../ocr/tess4j/Tess4JConfigExamplesTest.java       |  72 +++++
 .../resources/config-examples/tess4j-basic.json    |  11 +
 .../resources/config-examples/tess4j-full.json     |  20 ++
 tika-parsers/tika-parsers-ml/tika-vlm/pom.xml      |   6 +
 .../tika/parser/vlm/VLMConfigExamplesTest.java     |  98 +++++++
 .../config-examples}/claude-vlm-basic.json         |   0
 .../config-examples}/claude-vlm-full.json          |   4 +-
 .../config-examples}/gemini-vlm-basic.json         |   0
 .../config-examples}/gemini-vlm-full.json          |   4 +-
 .../config-examples}/openai-vlm-basic.json         |   0
 .../config-examples}/openai-vlm-full.json          |   5 +-
 .../config-examples}/vlm-pdf-parsing.json          |   0
 .../resources/config-examples/tesseract-full.json  |   1 +
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |   2 +-
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    |   6 +-
 .../azblob/AZBlobPipesIteratorFactory.java         |   6 +-
 .../iterator/csv/CSVPipesIteratorFactory.java      |   6 +-
 .../fs/FileSystemPipesIteratorFactory.java         |   6 +-
 .../iterator/gcs/GCSPipesIteratorFactory.java      |   6 +-
 .../iterator/jdbc/JDBCPipesIteratorFactory.java    |   6 +-
 .../json/JsonPipesIteratorFactory.java             |   6 +-
 .../iterator/kafka/KafkaPipesIteratorFactory.java  |   6 +-
 .../pipes/iterator/s3/S3PipesIteratorFactory.java  |   6 +-
 .../iterator/solr/SolrPipesIteratorFactory.java    |   6 +-
 .../apache/tika/plugins/ThreadSafeUnzipper.java    |  13 +
 62 files changed, 1179 insertions(+), 481 deletions(-)

diff --git a/docs/modules/ROOT/examples/claude-vlm-basic.json 
b/docs/modules/ROOT/examples/claude-vlm-basic.json
deleted file mode 100644
index 5931df09ed..0000000000
--- a/docs/modules/ROOT/examples/claude-vlm-basic.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "parsers": [
-    {
-      "claude-vlm-parser": {
-        "apiKey": "sk-ant-your-key-here",
-        "model": "claude-sonnet-4-20250514"
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/claude-vlm-basic.json 
b/docs/modules/ROOT/examples/claude-vlm-basic.json
new file mode 120000
index 0000000000..d20891ef0d
--- /dev/null
+++ b/docs/modules/ROOT/examples/claude-vlm-basic.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json 
b/docs/modules/ROOT/examples/claude-vlm-full.json
deleted file mode 100644
index 9dc7ff67d3..0000000000
--- a/docs/modules/ROOT/examples/claude-vlm-full.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "parsers": [
-    {
-      "claude-vlm-parser": {
-        "baseUrl": "https://api.anthropic.com",
-        "model": "claude-sonnet-4-20250514",
-        "prompt": "Extract all visible text from this image. Return the text 
in markdown format, preserving the original structure (headings, lists, tables, 
paragraphs). Do not describe the image. Only return the extracted text.",
-        "maxTokens": 4096,
-        "timeoutSeconds": 300,
-        "apiKey": "sk-ant-your-key-here",
-        "inlineContent": true,
-        "skipOcr": false,
-        "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json 
b/docs/modules/ROOT/examples/claude-vlm-full.json
new file mode 120000
index 0000000000..5392d5c689
--- /dev/null
+++ b/docs/modules/ROOT/examples/claude-vlm-full.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json 
b/docs/modules/ROOT/examples/gemini-vlm-basic.json
deleted file mode 100644
index a39ee9ed82..0000000000
--- a/docs/modules/ROOT/examples/gemini-vlm-basic.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "parsers": [
-    {
-      "gemini-vlm-parser": {
-        "apiKey": "your-gemini-api-key",
-        "model": "gemini-2.5-flash"
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json 
b/docs/modules/ROOT/examples/gemini-vlm-basic.json
new file mode 120000
index 0000000000..a0354acbab
--- /dev/null
+++ b/docs/modules/ROOT/examples/gemini-vlm-basic.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json 
b/docs/modules/ROOT/examples/gemini-vlm-full.json
deleted file mode 100644
index ab09b993f0..0000000000
--- a/docs/modules/ROOT/examples/gemini-vlm-full.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "parsers": [
-    {
-      "gemini-vlm-parser": {
-        "baseUrl": "https://generativelanguage.googleapis.com",
-        "model": "gemini-2.5-flash",
-        "prompt": "Extract all visible text from this image. Return the text 
in markdown format, preserving the original structure (headings, lists, tables, 
paragraphs). Do not describe the image. Only return the extracted text.",
-        "maxTokens": 4096,
-        "timeoutSeconds": 300,
-        "apiKey": "your-gemini-api-key",
-        "inlineContent": true,
-        "skipOcr": false,
-        "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json 
b/docs/modules/ROOT/examples/gemini-vlm-full.json
new file mode 120000
index 0000000000..94c81ee02e
--- /dev/null
+++ b/docs/modules/ROOT/examples/gemini-vlm-full.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json 
b/docs/modules/ROOT/examples/openai-vlm-basic.json
deleted file mode 100644
index f54d9063ab..0000000000
--- a/docs/modules/ROOT/examples/openai-vlm-basic.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "parsers": [
-    {
-      "openai-vlm-parser": {
-        "baseUrl": "http://127.0.0.1:8000",
-        "model": "jinaai/jina-vlm",
-        "timeoutSeconds": 300
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json 
b/docs/modules/ROOT/examples/openai-vlm-basic.json
new file mode 120000
index 0000000000..2a73403e3e
--- /dev/null
+++ b/docs/modules/ROOT/examples/openai-vlm-basic.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json 
b/docs/modules/ROOT/examples/openai-vlm-full.json
deleted file mode 100644
index 91baafc74e..0000000000
--- a/docs/modules/ROOT/examples/openai-vlm-full.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "parsers": [
-    {
-      "openai-vlm-parser": {
-        "baseUrl": "http://127.0.0.1:8000",
-        "model": "jinaai/jina-vlm",
-        "prompt": "Extract all visible text from this image. Return the text 
in markdown format, preserving the original structure (headings, lists, tables, 
paragraphs). Do not describe the image. Only return the extracted text.",
-        "maxTokens": 4096,
-        "timeoutSeconds": 300,
-        "apiKey": "",
-        "inlineContent": true,
-        "skipOcr": false,
-        "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json 
b/docs/modules/ROOT/examples/openai-vlm-full.json
new file mode 120000
index 0000000000..fe81340fd6
--- /dev/null
+++ b/docs/modules/ROOT/examples/openai-vlm-full.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tess4j-basic.json 
b/docs/modules/ROOT/examples/tess4j-basic.json
deleted file mode 100644
index 3fc74587be..0000000000
--- a/docs/modules/ROOT/examples/tess4j-basic.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "parsers": [
-    {
-      "name": "tess4j-parser",
-      "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
-      "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
-      "poolSize": 4
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/tess4j-basic.json 
b/docs/modules/ROOT/examples/tess4j-basic.json
new file mode 120000
index 0000000000..8be9e0b76a
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-basic.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tess4j-full.json 
b/docs/modules/ROOT/examples/tess4j-full.json
deleted file mode 100644
index c2d5170ecf..0000000000
--- a/docs/modules/ROOT/examples/tess4j-full.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "parsers": [
-    {
-      "name": "tess4j-parser",
-      "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
-      "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
-      "language": "eng",
-      "pageSegMode": 1,
-      "ocrEngineMode": 3,
-      "poolSize": 4,
-      "timeoutSeconds": 120,
-      "dpi": 300,
-      "minFileSizeToOcr": 0,
-      "maxFileSizeToOcr": 2147483647,
-      "skipOcr": false
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/tess4j-full.json 
b/docs/modules/ROOT/examples/tess4j-full.json
new file mode 120000
index 0000000000..9df8ad46f7
--- /dev/null
+++ b/docs/modules/ROOT/examples/tess4j-full.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json 
b/docs/modules/ROOT/examples/vlm-pdf-parsing.json
deleted file mode 100644
index b76c0bbf35..0000000000
--- a/docs/modules/ROOT/examples/vlm-pdf-parsing.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "parsers": [
-    {
-      "default-parser": {
-        "exclude": ["pdf-parser"]
-      }
-    },
-    {
-      "claude-vlm-parser": {
-        "apiKey": "sk-ant-your-key-here",
-        "model": "claude-sonnet-4-20250514",
-        "prompt": "Extract all text from this document. Return the text in 
markdown format, preserving the original structure (headings, lists, tables, 
paragraphs). Do not describe the document. Only return the extracted text."
-      }
-    }
-  ]
-}
diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json 
b/docs/modules/ROOT/examples/vlm-pdf-parsing.json
new file mode 120000
index 0000000000..dd246c4856
--- /dev/null
+++ b/docs/modules/ROOT/examples/vlm-pdf-parsing.json
@@ -0,0 +1 @@
+../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 2fa628fef4..070f535ff8 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -27,6 +27,7 @@
 ** xref:pipes/iterators.adoc[Iterators]
 ** xref:pipes/reporters.adoc[Reporters]
 ** xref:pipes/configuration.adoc[Pipeline Configuration]
+** xref:pipes/shared-server-mode.adoc[Shared Server Mode]
 ** xref:pipes/parse-modes.adoc[Parse Modes]
 ** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]
 ** xref:pipes/timeouts.adoc[Timeouts]
diff --git a/docs/modules/ROOT/pages/advanced/index.adoc 
b/docs/modules/ROOT/pages/advanced/index.adoc
index 72d1252269..abd555de9c 100644
--- a/docs/modules/ROOT/pages/advanced/index.adoc
+++ b/docs/modules/ROOT/pages/advanced/index.adoc
@@ -19,6 +19,17 @@
 
 This section covers advanced usage and internals of Apache Tika.
 
+NOTE: Most pages here are written from a Java-API perspective. Where a topic
+has a JSON-config or CLI equivalent, look first under
+xref:configuration/index.adoc[Configuration] (per-parser options),
+xref:pipes/index.adoc[Tika Pipes] (pipeline + Pipes-mode tuning),
+xref:using-tika/server/index.adoc[Tika Server] (REST + server config), or
+xref:using-tika/cli/index.adoc[Tika CLI] (`tika-app` flags). The
+xref:advanced/setting-limits.adoc[Setting Limits] page is the model — it
+covers Java, JSON, and CLI side by side. Filing issues against specific
+advanced pages where the JSON/CLI equivalent isn't documented yet helps us
+prioritize the gap.
+
 == Topics
 
 * xref:advanced/language-detection.adoc[Language Detection] - Built-in bigram 
language detector, training pipeline, and comparison with OpenNLP
diff --git 
a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc 
b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc
index d8b44453d0..594e9e91ff 100644
--- a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc
+++ b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc
@@ -46,7 +46,7 @@ Coverage includes:
 * `/version`, `/parsers`, `/detectors`, `/mime-types` (introspection)
 * `/detect/stream` (mime detection)
 * `/tika`, `/tika/text`, `/tika/xml`, `/tika/json` (parse)
-* `/meta`, `/meta/{field}` (metadata)
+* `/meta`, `/meta/\{field}` (metadata)
 * `/rmeta`, `/rmeta/text` (recursive metadata)
 * `/unpack/all` (embedded extraction; verifies the response is a valid zip)
 * `/language/stream`
diff --git 
a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
 
b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
index 24460db449..a81f6fabd4 100644
--- 
a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
+++ 
b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc
@@ -200,7 +200,7 @@ Options:
 * `-i` / `--inputDir` — original binary input directory (optional, lets
   tika-eval pair extracts to source files even if A or B failed on some)
 * `-d` / `--db` — H2 database name/path.  A short label is fine —
-  tika-eval will create `{label}.mv.db` and a `{label}-reports/` dir
+  tika-eval will create `\{label}.mv.db` and a `\{label}-reports/` dir
   alongside.  Persist the db if you want to re-run Report later.
 * `-r` / `--report` — automatically run the Report step after Compare,
   and zip the reports directory.
diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc 
b/docs/modules/ROOT/pages/advanced/language-detection.adoc
index b95e06eafb..120c2e320f 100644
--- a/docs/modules/ROOT/pages/advanced/language-detection.adoc
+++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc
@@ -153,56 +153,29 @@ back to the general model transparently.
 
 === Overriding Model Selection
 
-The selection strategy can be overridden at construction time or per-document
-via `ParseContext`:
-
-[source,java]
-----
-// Always use the short-text model (e.g. for a title-only pipeline)
-CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(
-    Map.of("strategy", "SHORT_TEXT"));
-CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg);
-
-// Always use the general model (e.g. for full-document body text)
-CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(
-    Map.of("strategy", "STANDARD"));
-
-// Per-document override via ParseContext
-ParseContext context = new ParseContext();
-context.set(CharSoupDetectorConfig.class, CharSoupDetectorConfig.fromMap(
-    Map.of("strategy", "SHORT_TEXT")));
-detector.reset(context);
-----
-
-The three strategies are:
-
-[cols="1,3"]
-|===
-| Strategy | Behaviour
-
-| `AUTOMATIC` (default)
-| Use length and feature-density gates to choose between models per chunk.
-
-| `SHORT_TEXT`
-| Always use the short-text model (no-op if the binary is absent).
-
-| `STANDARD`
-| Always use the general model regardless of input length.
-|===
-
-The thresholds can also be tuned via `CharSoupDetectorConfig`:
-
-[source,java]
+The automatic gates (200-char length, 200-feature density) and the strategy
+choice (always-short / always-general / automatic) are currently fixed in
+`CharSoupLanguageDetector`. Programmatic knobs are limited to
+`setMaxLength(int)` for the rolling buffer and `setPriors(Map)` for language
+priors.
+
+`CharSoupLanguageDetector` is registered as `@TikaComponent(name =
+"charsoup-language-detector")` and implements `SelfConfiguring`, so once a
+config record is added, declarative tuning will be available via the standard
+`parse-context` JSON section:
+
+[source,json]
 ----
-CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of(
-    "strategy",          "AUTOMATIC",
-    "lengthThreshold",   300,   // chars; default 200
-    "featureThreshold",  300    // n-gram emissions; default 200
-));
+{
+  "parse-context": {
+    "charsoup-language-detector": {
+      /* future tuning fields will go here */
+    }
+  }
+}
 ----
 
-Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring`
-component loading.
+See 
xref:migration-to-4x/serialization-4x.adoc#discovering-friendly-names[Discovering
 the friendly name for a component] for how `SelfConfiguring` components are 
resolved at runtime.
 
 == Training the Models
 
diff --git a/docs/modules/ROOT/pages/advanced/setting-limits.adoc 
b/docs/modules/ROOT/pages/advanced/setting-limits.adoc
index a3af216175..a815ae6841 100644
--- a/docs/modules/ROOT/pages/advanced/setting-limits.adoc
+++ b/docs/modules/ROOT/pages/advanced/setting-limits.adoc
@@ -316,8 +316,10 @@ When the byte limit is reached:
 [source,json]
 ----
 {
-  "parseContext": {
-    "parseMode": "UNPACK",
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
     "unpack-config": {
       "maxUnpackBytes": 104857600
     }
diff --git a/docs/modules/ROOT/pages/configuration/index.adoc 
b/docs/modules/ROOT/pages/configuration/index.adoc
index 85864b4727..068fc71c9f 100644
--- a/docs/modules/ROOT/pages/configuration/index.adoc
+++ b/docs/modules/ROOT/pages/configuration/index.adoc
@@ -16,28 +16,69 @@
 //
 
 = Configuration
+:toc:
 
 This section covers configuring Apache Tika.
 
 == Overview
 
 Tika 4.x uses JSON configuration files. Configuration controls parsers, 
detectors,
-content handlers, and other components.
+content handlers, server behavior, and the Tika Pipes pipeline.
 
 NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the
 xref:migration-to-4x/index.adoc[Migration Guide] for details on converting to 
JSON.
 
+== Top-level JSON structure
+
+A `tika-config.json` is a single JSON object whose keys are the top-level 
sections
+listed below. Every section is optional — omit what you don't need. Defaults 
are
+used wherever a section is missing.
+
+[source,json]
+----
+{
+  "parsers": [ /* parser declarations */ ],
+  "detectors": [ /* detector declarations */ ],
+  "encoding-detectors": [ /* encoding detector declarations */ ],
+  "content-handler-factory": { /* handler type for emitted content */ },
+  "parse-context": {
+    "timeout-limits": { /* progress + total task timeouts */ },
+    "unpack-config": { /* embedded-byte extraction */ }
+    /* other SelfConfiguring components by component name */
+  },
+  "server": { /* tika-server options: enableUnsecureFeatures, cors, ... */ },
+  "pipes": { /* Pipes process management: numClients, parseMode, ... */ },
+  "fetchers": { /* named fetcher instances */ },
+  "emitters": { /* named emitter instances */ },
+  "pipes-iterator": { /* iterator (one per pipeline) */ },
+  "pipes-reporters": { /* per-document status reporters */ },
+  "plugin-roots": "/path/to/plugins"
+}
+----
+
+Per-section documentation:
+
+* `parsers`, `detectors`, `encoding-detectors`, `content-handler-factory`,
+  `parse-context` — covered below under <<_topics,Topics>>.
+* `server` — see xref:using-tika/server/index.adoc[Tika Server].
+* `pipes`, `fetchers`, `emitters`, `pipes-iterator`, `pipes-reporters`,
+  `plugin-roots` — see xref:pipes/configuration.adoc[Pipes Configuration]
+  and xref:pipes/index.adoc[Tika Pipes].
+
 == Topics
 
 === Parser Configuration
 
-* xref:configuration/parsers/pdf-parser.adoc[PDFParser] - PDF parsing options
-* xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - 
OCR options for image-based text extraction
+* xref:configuration/parsers/pdf-parser.adoc[PDFParser] — PDF parsing options
+* xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] — 
OCR options for image-based text extraction
+* xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR Parser] — 
in-process OCR via tess4j JNI bindings
+* xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers] — Claude, Gemini, 
OpenAI, Ollama, vLLM
+* xref:configuration/parsers/external-parser.adoc[External Parser] — wrap 
external tools (ffmpeg, exiftool, etc.)
 
 === Other Configuration
 
-* xref:configuration/digesters.adoc[Digesters] - Computing cryptographic 
hashes of documents
-* xref:configuration/encoding-detectors.adoc[Encoding Detectors] - Configuring 
charset/encoding detection
+* xref:configuration/digesters.adoc[Digesters] — Computing cryptographic 
hashes of documents
+* xref:configuration/encoding-detectors.adoc[Encoding Detectors] — Configuring 
charset/encoding detection
 
 // Add links to specific topics as they are created
 // * xref:json-config.adoc[JSON Configuration Reference]
diff --git a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc 
b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
index fb52b1d6e0..4dccac2c4d 100644
--- a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
+++ b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc
@@ -177,6 +177,10 @@ throwing an exception.
 |`2147483647` (~2 GB)
 |Maximum input file size in bytes. Larger files are skipped.
 
+|`maxImagePixels`
+|`100000000` (100 megapixels)
+|Maximum decoded-image area. Larger images are skipped. Guards against 
decompression-bomb inputs that would blow up memory before OCR even starts.
+
 |`skipOcr`
 |`false`
 |Runtime kill-switch to disable the parser entirely.
diff --git a/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc 
b/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
index b2b8b454b3..c0f9d53d4b 100644
--- a/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
+++ b/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
@@ -209,6 +209,18 @@ text into the parent document's content stream. Mirrors
 |`maxFileSizeToOcr`
 |`52428800` (50 MB)
 |Maximum input file size in bytes.
+
+|`maxImagePixels`
+|`100000000` (100 megapixels)
+|Maximum decoded image area in pixels. Larger images are rejected before being 
sent to the model. Set to `-1` to disable the limit. Guards against 
decompression-bomb inputs and runaway VLM cost on a single huge image.
+
+|`allowRuntimePrompt`
+|`false`
+|When `false` (default), the `prompt` is fixed at initialization time and 
per-request overrides are rejected. When `true`, the prompt can be overridden 
per request via the `ParseContext` `VLMOCRConfig`. Security-relevant: a 
runtime-controllable prompt is effectively a prompt-injection surface for any 
caller that can set the `ParseContext`. Only enable when callers are trusted.
+
+|`completionsPath`
+|`/v1/chat/completions` (OpenAI/vLLM only)
+|HTTP path appended to `baseUrl` for the chat-completions endpoint. Used by 
the OpenAI-compatible parser only. Claude and Gemini hardcode their own API 
paths (`/v1/messages` and `/v1beta/models/\{model}:generateContent` 
respectively) and ignore this field.
 |===
 
 == Markdown-to-XHTML conversion
diff --git a/docs/modules/ROOT/pages/developers/serialization.adoc 
b/docs/modules/ROOT/pages/developers/serialization.adoc
index 6ec426b061..b68435d939 100644
--- a/docs/modules/ROOT/pages/developers/serialization.adoc
+++ b/docs/modules/ROOT/pages/developers/serialization.adoc
@@ -212,7 +212,7 @@ Benefits:
 [source,json]
 ----
 {
-  "parseContext": {
+  "parse-context": {
     "pdf-parser": {
       "ocrStrategy": "AUTO",
       "extractInlineImages": true
@@ -231,7 +231,7 @@ For components that need immediate deserialization (not 
lazy loading):
 [source,json]
 ----
 {
-  "parseContext": {
+  "parse-context": {
     "typed": {
       "handler-config": {
         "type": "XML",
@@ -299,23 +299,27 @@ public class UpperCaseFilter implements MetadataFilter {
 }
 ----
 
-Configure in JSON:
+Configure in JSON. Metadata filters are loaded via `parse-context` (they
+implement the `MetadataFilter` interface, which is a `ParseContext`-keyed
+component):
 
 [source,json]
 ----
 {
-  "metadata-filters": [
-    {"upper-case-filter": {"fieldName": "dc:title"}}
-  ]
+  "parse-context": {
+    "upper-case-filter": {"fieldName": "dc:title"}
+  }
 }
 ----
 
-Or with defaults:
+Or with defaults (an empty object constructs the filter with no overrides):
 
 [source,json]
 ----
 {
-  "metadata-filters": ["upper-case-filter"]
+  "parse-context": {
+    "upper-case-filter": {}
+  }
 }
 ----
 
diff --git a/docs/modules/ROOT/pages/maintainers/index.adoc 
b/docs/modules/ROOT/pages/maintainers/index.adoc
index 18735fea90..0452cd8f6a 100644
--- a/docs/modules/ROOT/pages/maintainers/index.adoc
+++ b/docs/modules/ROOT/pages/maintainers/index.adoc
@@ -16,6 +16,7 @@
 //
 
 = For Maintainers
+:toc:
 
 This section contains documentation for Apache Tika project maintainers and 
committers.
 
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc 
b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index d36fd51c31..2942e18d32 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -25,6 +25,21 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and 
support schedules.
 
 * Java 17 or later (upgraded from Java 11 in 3.x)
 
+== `tika-app` distribution: jar -> zip
+
+In 3.x, `tika-app-<version>.jar` was a self-contained fat jar — you could drop 
it anywhere and run `java -jar tika-app.jar`. In 4.x it is a thin launcher that 
depends on the parsers, the Tika Pipes processor, and other modules living in 
an adjacent `lib/` directory. Running the bare jar by itself will fail with 
`NoClassDefFoundError`.
+
+Download `tika-app-<version>.zip` and run from inside the unzipped directory 
so `lib/` (and `plugins/`) sit alongside the jar:
+
+[source,bash]
+----
+unzip tika-app-<version>.zip
+cd tika-app-<version>
+java -jar tika-app-<version>.jar [option...] [file...]
+----
+
+If you have build scripts or container images that drop in just the jar, 
update them to unpack the zip and run from inside it.
+
 == Configuration: XML to JSON
 
 Tika 4.x uses JSON configuration files instead of XML. The legacy 
`tika-config.xml` format
diff --git a/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc 
b/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc
index e11bdc4959..8afafbe55e 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc
@@ -31,6 +31,48 @@ Jackson dependencies are kept out of core modules to 
maintain flexibility.
 Implementation uses friendly names like `pdf-parser` rather than full class 
names. These friendly
 names are applied to configured items rather than configuration class names.
 
+[#discovering-friendly-names]
+==== Discovering the friendly name for a component
+
+The 4.x JSON config refers to parsers, detectors, fetchers, emitters, and 
other components
+by their friendly name (e.g., `pdf-parser`, `file-system-fetcher`). To map a 
Java class
+to its friendly name (or vice versa), use any of:
+
+. **`tika-app --list-parser-names` / `--list-detector-names`** — emits each
+  registered class with its friendly name as tab-separated 
`class<TAB>friendly-name`:
++
+[source,bash]
+----
+java -jar tika-app.jar --list-parser-names
+# org.apache.tika.parser.pdf.PDFParser     pdf-parser
+# org.apache.tika.parser.html.JSoupParser  jsoup-parser
+# ...
+----
++
+The mapping comes from the `META-INF/tika/parsers.idx` / `detectors.idx` files
+generated at compile time by the `@TikaComponent` annotation processor. The
+underlying lookup is 
`o.a.t.config.loader.ComponentRegistry.getFriendlyName(Class)`.
+. **Per-parser configuration pages** under 
xref:configuration/index.adoc[Configuration]
+  show the friendly name in their page title and JSON examples.
+. **The naming convention** — when `@TikaComponent` has no explicit `name`, the
+  friendly name is derived from the class's simple name via the kebab-case rule
+  in `o.a.t.config.loader.KebabCaseConverter`. Examples:
++
+[cols="2,2"]
+|===
+|Class |Friendly name
+
+|`PDFParser` |`pdf-parser`
+|`TesseractOCRParser` |`tesseract-ocr-parser`
+|`AutoDetectParser` |`auto-detect-parser`
+|`FileSystemFetcher` |`file-system-fetcher`
+|`SolrEmitter` |`solr-emitter`
+|===
+
+NOTE: The `--list-parsers`, `--list-detectors`, and `--list-parser-details` 
commands
+print the hierarchical, human-oriented view (class names with composite parsers
+indented). Use the `--list-*-names` variants when you want a machine-readable 
mapping.
+
 === Custom Class Support
 
 The design permits users to add custom classes through Jackson's polymorphic 
handling:
diff --git a/docs/modules/ROOT/pages/pipes/configuration.adoc 
b/docs/modules/ROOT/pages/pipes/configuration.adoc
index e4e3d0b1c2..7049456f24 100644
--- a/docs/modules/ROOT/pages/pipes/configuration.adoc
+++ b/docs/modules/ROOT/pages/pipes/configuration.adoc
@@ -84,6 +84,14 @@ See also xref:pipes/timeouts.adoc[Timeouts] for the full 
timeout model.
 |`maxWaitForClientMillis`
 |`60000`
 |Maximum time (ms) to wait for an available forked process when all are busy.
+
+|`staleFetcherTimeoutSeconds`
+|`600`
+|How long (seconds) a fetcher-emitter pairing can sit idle in the cache before 
it is eligible for eviction. Increase if your pipeline has long quiet periods 
between tuples that reuse the same fetcher/emitter.
+
+|`staleFetcherDelaySeconds`
+|`60`
+|How often (seconds) the stale-fetcher reaper runs.
 |===
 
 == Parse Behavior
@@ -131,7 +139,53 @@ These settings control how parsed results are batched 
before sending to emitters
 
 |`emitIntermediateResults`
 |`false`
-|Emit partial results as they become available (rather than waiting for the 
full parse to complete).
+|When `false`, only successfully-parsed tuples reach the emitter — files that 
crash, time out, or otherwise fail are dropped from the output. When `true`, 
every tuple is emitted, including failures (the metadata carries the 
exception). Turn this on if you need a complete record of what was attempted 
(audit, retry logic, chaos-monkey tests).
+|===
+
+== Emit Strategy
+
+`emitStrategy` controls whether parsed extracts are emitted directly from the 
forked PipesServer or passed back to the parent process first. The default is 
balanced for typical workloads — tune only if you have a memory or throughput 
problem.
+
+[source,json]
+----
+{
+  "pipes": {
+    "emitStrategy": {
+      "type": "DYNAMIC",
+      "thresholdBytes": 100000
+    }
+  }
+}
+----
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`type`
+|`DYNAMIC`
+|One of `DYNAMIC`, `EMIT_ALL`, `PASSBACK_ALL`. `DYNAMIC` switches per-extract 
based on size (see `thresholdBytes`). `EMIT_ALL` always emits from the forked 
process. `PASSBACK_ALL` always passes extracts back to the parent for emission.
+
+|`thresholdBytes`
+|`100000`
+|Only used when `type` is `DYNAMIC`. Extracts larger than this are emitted 
directly from the forked PipesServer; smaller ones are passed back to the 
parent. Setting `thresholdBytes` with type `EMIT_ALL` or `PASSBACK_ALL` is a 
config error.
+|===
+
+== Distributed Config Store
+
+For multi-host pipelines (e.g., shared-server clusters) you can store 
fetcher/emitter configuration in a distributed backend instead of memory. Most 
users should leave the defaults.
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`configStoreType`
+|`"memory"`
+|Backend for storing fetcher/emitter configurations. `"memory"` (default) is 
in-process; `"ignite"` uses Apache Ignite for shared state across nodes.
+
+|`configStoreParams`
+|`"{}"`
+|JSON object (as a string) with backend-specific parameters. Structure depends 
on `configStoreType`.
 |===
 
 == Shared Server Mode (Experimental)
@@ -162,7 +216,7 @@ include::example$pipes-fs-pipeline.json[]
 
 icon:github[] 
https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View
 source on GitHub]
 
-Tokens (`FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`) are 
substituted by the test harness — replace them with real paths in production 
configs.
+Tokens (`FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`, 
`EMIT_INTERMEDIATE_RESULTS`) are substituted by the test harness — replace them 
with real values in production configs. The first three are paths; 
`EMIT_INTERMEDIATE_RESULTS` is the boolean `emitIntermediateResults` flag.
 
 [#emit-all]
 === Emit-all variant
diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc 
b/docs/modules/ROOT/pages/pipes/getting-started.adoc
index e52e02f1ac..db6955aeb7 100644
--- a/docs/modules/ROOT/pages/pipes/getting-started.adoc
+++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc
@@ -66,13 +66,13 @@ include::example$pipes-fs-pipeline.json[]
 ----
 icon:github[] 
https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View
 source on GitHub]
 
-NOTE: The values shown like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and 
`PLUGINS_PATHS` are placeholders the integration tests substitute at runtime. 
Replace them with real paths in your own config.
+NOTE: The values shown like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, 
`PLUGINS_PATHS`, and `EMIT_INTERMEDIATE_RESULTS` are placeholders the 
integration tests substitute at runtime. Replace them with real paths (or, for 
`EMIT_INTERMEDIATE_RESULTS`, the boolean `true`/`false`) in your own config.
 
 Run it with:
 
 [source,bash]
 ----
-java -jar tika-app.jar --config tika-config.json -i /data/input -o /data/output
+java -jar tika-app.jar --config=tika-config.json -i /data/input -o /data/output
 ----
 
 NOTE: The `-i` and `-o` flags override the `basePath` values in the config 
when used
diff --git a/docs/modules/ROOT/pages/pipes/index.adoc 
b/docs/modules/ROOT/pages/pipes/index.adoc
index 7bd2078238..8037b8857f 100644
--- a/docs/modules/ROOT/pages/pipes/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/index.adoc
@@ -52,104 +52,6 @@ against problematic files.
 * xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] -- extract raw 
bytes from embedded documents
 * xref:pipes/timeouts.adoc[Timeouts] -- two-tier timeout system for handling 
long-running and hung parsers
 
-== Emitters
-
-=== ES Emitter (`es-emitter`)
-
-The ES emitter sends parsed documents to any ES-compatible REST API (ES 7+/8+) 
via
-the `_bulk` endpoint. It uses plain HTTP (Apache HttpClient) — there is no 
dependency
-on the ES Java client, which carries a non-ASL license.
-
-[source,json]
-----
-"emitters": {
-  "my-es": {
-    "es-emitter": {
-      "esUrl": "https://localhost:9200/my-index",
-      "idField": "_id",
-      "attachmentStrategy": "SEPARATE_DOCUMENTS",
-      "updateStrategy": "UPSERT",
-      "embeddedFileFieldName": "embedded",
-      "apiKey": "<base64-encoded id:api_key>"
-    }
-  }
-}
-----
-
-[cols="1,1,3"]
-|===
-|Field |Default |Description
-
-|`esUrl`
-|_required_
-|Full URL including the index name, e.g. `https://localhost:9200/my-index`
-
-|`idField`
-|`_id`
-|Metadata field used as the document `_id`
-
-|`attachmentStrategy`
-|`SEPARATE_DOCUMENTS`
-|How embedded documents are stored. `SEPARATE_DOCUMENTS` gives each embedded
-file its own flat document. `PARENT_CHILD` uses an ES join field so embedded
-files are linked to their container via `relation_type`.
-
-|`updateStrategy`
-|`OVERWRITE`
-|`OVERWRITE` uses a bulk `index` action (full replace).
-`UPSERT` uses a bulk `update` / `doc_as_upsert` action (field-level merge).
-
-|`embeddedFileFieldName`
-|`embedded`
-|Name of the join-field relation used in `PARENT_CHILD` mode.
-
-|`apiKey`
-|_none_
-|Base64-encoded `id:api_key` sent as `Authorization: ApiKey <value>`.
-Takes precedence over `httpClientConfig` basic auth.
-
-|`httpClientConfig`
-|_none_
-|Optional block for `userName`, `password`, `authScheme`, `connectionTimeout`,
-`socketTimeout`, `proxyHost`, `proxyPort`, and `verifySsl` (boolean, default 
`false`).
-|===
-
-[WARNING]
-====
-By default (`verifySsl: false`) TLS certificate verification is disabled — all
-certificates are trusted and hostname verification is skipped.  Set
-`httpClientConfig.verifySsl: true` to enable proper certificate and hostname
-validation using the JVM's default trust store.  When `verifySsl` is `false`,
-do not transmit credentials over plain HTTP in production; prefer HTTPS with
-network-level controls (VPN, private endpoint) until verification is enabled.
-====
-
-=== ES Pipes Reporter (`es-pipes-reporter`)
-
-The ES reporter writes per-document parse status back into the same index,
-so you can query the processing outcome alongside the extracted content.
-
-[source,json]
-----
-"pipes-reporters": {
-  "es-pipes-reporter": {
-    "esUrl": "https://localhost:9200/my-index",
-    "keyPrefix": "tika_",
-    "includeRouting": false
-  }
-}
-----
-
-The reporter adds `<keyPrefix>parse_status`, `<keyPrefix>parse_time_ms`,
-and (when the forked JVM exits abnormally) `<keyPrefix>exit_value` fields
-to each document via an upsert.
-
-=== OpenSearch Emitter
-
-The OpenSearch emitter is configured identically but uses `opensearch-emitter` 
as the
-plugin key and `openSearchUrl` as the URL field. It also ships with an
-`opensearch-pipes-reporter`.
-
 == Advanced Topics
 
 * xref:pipes/shared-server-mode.adoc[Shared Server Mode] - Experimental mode 
for reduced memory usage
diff --git a/docs/modules/ROOT/pages/pipes/iterators.adoc 
b/docs/modules/ROOT/pages/pipes/iterators.adoc
index a3e3bc7292..f58f5a1fbb 100644
--- a/docs/modules/ROOT/pages/pipes/iterators.adoc
+++ b/docs/modules/ROOT/pages/pipes/iterators.adoc
@@ -34,7 +34,7 @@ The iterator runs on its own thread; the pipeline reads 
tuples as fast as the wo
 [#wiring]
 == Wiring an Iterator Into a Pipeline
 
-The iterator lives under the singular top-level `pipes-iterator` key. The 
inner map key is the iterator's component name. `fetcherId` and `emitterId` are 
*flat fields* on the iterator config — they are not wrapped in a `baseConfig` 
block.
+The iterator lives under the singular top-level `pipes-iterator` key. The 
inner map key is the iterator's component name. `fetcherId` and `emitterId` are 
flat fields on the iterator config, alongside the iterator-specific options:
 
 [source,json]
 ----
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc 
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 69ba2204cd..ab81227ac9 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -19,9 +19,10 @@
 :toc:
 :toclevels: 3
 
-Tika Pipes uses `ParseMode` to control how documents are parsed and how 
results are emitted.
-The parse mode is configured in the `pipes` section of the JSON config, or 
overridden per-request
-in the `parseContext` field of a `FetchEmitTuple`.
+Tika Pipes uses parse modes to control how documents are parsed and how 
results are emitted.
+The mode is set as `parseMode` in the `pipes` section of the JSON config, and 
can be overridden
+per-request from Java code by attaching a `ParseMode` to the `ParseContext` on 
the
+`FetchEmitTuple` you submit.
 
 == Available Parse Modes
 
@@ -51,9 +52,24 @@ See <<no-parse-mode>>.
 
 == Content Handler Types
 
-The content handler type determines the format of the extracted text. It is 
set on the
-`ContentHandlerFactory` configured in `parseContext` (or via the CLI 
`--handler` flag), and applies
-to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`).
+The content handler type determines the format of the extracted text. It is 
set in the
+top-level `content-handler-factory` section of the JSON config (or via the CLI 
`--handler` flag),
+and applies to all modes that produce content (`RMETA`, `CONCATENATE`, 
`CONTENT_ONLY`).
+
+[source,json]
+----
+{
+  "content-handler-factory": {
+    "basic-content-handler-factory": {
+      "type": "TEXT"
+    }
+  }
+}
+----
+
+Accepted `type` values: `TEXT`, `HTML`, `XML`, `MARKDOWN`, `BODY`, `IGNORE`. 
The CLI
+`--handler` flag uses single-letter shortcuts (`t`, `h`, `x`, `m`, `b`, `i`) 
that map onto
+these values.
 
 [cols="1,1,2"]
 |===
@@ -147,7 +163,7 @@ only `X-TIKA:content` and `X-TIKA:container_exception`. If 
you set your own
 
 === CLI usage
 
-The `tika-app` batch processor supports `CONTENT_ONLY` via the `--content-only`
+The `tika-app` Pipes processor supports `CONTENT_ONLY` via the `--content-only`
 flag:
 
 [source,bash]
@@ -166,7 +182,7 @@ extracted markdown content. See <<_content_handler_types>> 
for the available han
 [source,json]
 ----
 {
-  "parseContext": {
+  "pipes": {
     "parseMode": "NO_PARSE"
   }
 }
diff --git a/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc 
b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc
index 85fba5889e..034a5d7b93 100644
--- a/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc
+++ b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc
@@ -44,7 +44,7 @@ The File System plugin (`tika-pipes-file-system`) is the most 
common starting po
 
 == Complete Pipeline Example
 
-The example below is the canonical filesystem-to-filesystem integration test 
config. Tokens like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and 
`PLUGINS_PATHS` are placeholders the test harness substitutes; replace them 
with real paths in your own config.
+The example below is the canonical filesystem-to-filesystem integration test 
config. Tokens like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`, 
and `EMIT_INTERMEDIATE_RESULTS` are placeholders the test harness substitutes; 
replace the path tokens with real paths and `EMIT_INTERMEDIATE_RESULTS` with 
the boolean `true` or `false`. See xref:pipes/configuration.adoc[Pipes 
Configuration] for what each setting does.
 
 [source,json,subs=none]
 ----
@@ -208,7 +208,7 @@ Maintains a JSON status file that summarizes pipeline 
progress. The reporter wri
 
 |`statusFile`
 |_required_
-|Path of the JSON status file. The file is created on first write and 
overwritten in place.
+|Path of the JSON status file. Absolute paths are written as given; relative 
paths resolve against the JVM's working directory at startup. Parent 
directories that don't exist are created automatically on first write. Always 
include a parent component (e.g., `./status.json` rather than bare 
`status.json`) — the auto-create step fails on a path with no parent. The file 
is created on first write and overwritten in place.
 
 |`reportUpdateMs`
 |_no default_
@@ -219,14 +219,17 @@ Maintains a JSON status file that summarizes pipeline 
progress. The reporter wri
 
 The reporter serializes an `AsyncStatus` object to JSON, containing:
 
+* `started` — ISO-8601 timestamp of when the reporter was constructed.
+* `lastUpdate` — ISO-8601 timestamp of the most recent write.
 * `asyncStatus` — current pipeline phase (`STARTED`, `COMPLETED`, `CRASHED`).
-* `counts` — map of `RESULT_STATUS` to count (e.g., `PARSE_SUCCESS`, 
`PARSE_EXCEPTION`, `TIMEOUT`, `OOM`).
-* `totalCountResult` — total documents processed and whether the enumeration 
is complete.
-* `timestamp` — when the file was last written.
-* `crashMessage` — populated only on fatal pipeline failure.
+* `statusCounts` — map of `PipesResult.RESULT_STATUS` to count (e.g., 
`PARSE_SUCCESS`, `PARSE_EXCEPTION`, `TIMEOUT`, `OOM`, `EMIT_SUCCESS`, 
`EMIT_EXCEPTION`).
+* `totalCountResult` — total documents discovered by the iterator and whether 
the enumeration is complete.
+* `crashMessage` — empty string under normal operation; populated with a stack 
trace on fatal pipeline failure.
 
 The file is rewritten in full on each tick, not appended.
 
+NOTE: The write is **not** atomic — the reporter opens the target path with 
`Files.newBufferedWriter`, truncates, and streams the JSON. A watcher reading 
concurrently with a write can observe a truncated or partial document. Have the 
watcher treat a parse error as "stale read, try again on the next poll" rather 
than as a real error.
+
 [#watching]
 === Live status for watching applications
 
@@ -237,7 +240,7 @@ The reporter is designed to support external "watchers" — 
UIs, dashboards, or
 "reportUpdateMs": 250
 ----
 
-The watcher polls `statusFile` on its own interval and reads the most recent 
snapshot. Because the file is rewritten in full with the latest status, 
watchers do not need to handle partial reads.
+The watcher polls `statusFile` on its own interval and reads the most recent 
snapshot. Each tick rewrites the file in full, so successive snapshots are 
always coherent — but because the write is not atomic, a watcher reading 
mid-write can see a truncated document. Tolerate JSON parse errors as transient 
and retry on the next poll (see the NOTE under <<_status_file_schema,Status 
file schema>>).
 
 This pattern is used by `tika-gui-v2` to drive its progress UI: the GUI starts 
a pipeline subprocess, points the reporter at a temp file, and polls that file 
every few hundred milliseconds.
 
diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc 
b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
index d5173d2032..9846a4a1ea 100644
--- a/docs/modules/ROOT/pages/pipes/plugins/index.adoc
+++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc
@@ -68,7 +68,7 @@ Many plugins implement more than one (e.g., the S3 plugin 
provides fetcher, emit
 |—
 |✓
 
-|xref:pipes/plugins/solr.adoc[Solr]
+|xref:pipes/plugins/solr.adoc[Apache Solr]
 |—
 |✓
 |✓
@@ -80,7 +80,7 @@ Many plugins implement more than one (e.g., the S3 plugin 
provides fetcher, emit
 |✓
 |✓
 
-|xref:pipes/plugins/kafka.adoc[Kafka]
+|xref:pipes/plugins/kafka.adoc[Apache Kafka]
 |—
 |✓
 |✓
diff --git a/docs/modules/ROOT/pages/pipes/timeouts.adoc 
b/docs/modules/ROOT/pages/pipes/timeouts.adoc
index 77008aa9b9..bbb94abfd5 100644
--- a/docs/modules/ROOT/pages/pipes/timeouts.adoc
+++ b/docs/modules/ROOT/pages/pipes/timeouts.adoc
@@ -154,11 +154,41 @@ For processing many small documents where you want fast 
failure:
 
 == CLI Usage
 
-When using `tika-app` with `--fork`, the `--fork-timeout` flag sets 
`progressTimeoutMillis`:
+=== Standard mode (single file)
+
+For single-document parsing, `--fork` runs the parser in a forked JVM and 
`--fork-timeout` (milliseconds) caps how long it may run:
+
+[source,bash]
+----
+java -jar tika-app.jar --fork --fork-timeout=120000 document.pdf
+----
+
+=== Pipes mode (`-i` / `-o`)
+
+In Pipes mode the parser already runs in forked JVMs — that's what 
`numClients` controls — so `--fork` does not apply. Setting it on the command 
line is silently ignored because tika-app routes `-i`/`-o` straight into the 
async dispatcher before its standard-mode flags are processed.
+
+Set per-parse timeouts in your `tika-config.json` instead:
+
+[source,json]
+----
+{
+  "pipes": {
+    "numClients": 4
+  },
+  "parse-context": {
+    "timeout-limits": {
+      "progressTimeoutMillis": 120000,
+      "totalTaskTimeoutMillis": 3600000
+    }
+  }
+}
+----
+
+Then run:
 
 [source,bash]
 ----
-java -jar tika-app.jar --fork --fork-timeout=120000 -i /input -o /output
+java -jar tika-app.jar --config=tika-config.json -i /input -o /output
 ----
 
 == Living Code Reference
diff --git a/docs/modules/ROOT/pages/pipes/troubleshooting.adoc 
b/docs/modules/ROOT/pages/pipes/troubleshooting.adoc
index d7e53de937..87077e8758 100644
--- a/docs/modules/ROOT/pages/pipes/troubleshooting.adoc
+++ b/docs/modules/ROOT/pages/pipes/troubleshooting.adoc
@@ -192,6 +192,40 @@ response-body bytes for HTTP-style fetchers (configurable 
via
 log catches the thrown exception. Lower `maxErrMsgSize` -- or set it to
 zero -- if your responses can contain sensitive data.
 
+== Logging
+
+Tika uses https://logging.apache.org/log4j/2.x/[Log4j 2] for both tika-app and 
tika-server. Default output goes to `SYSTEM_ERR` with the pattern `%-5p [%t] 
%d{HH:mm:ss,SSS} %c %m%n`. Each forked PipesServer logs with its own line 
prefix so parent and child output stays distinguishable; see 
<<_telling_fork_lines_from_parent_lines,Telling fork lines from parent lines>>.
+
+=== Default log4j2 configuration
+
+Each distribution ships its own `log4j2.xml` bundled inside the jar:
+
+* tika-app: `org/apache/tika/cli/log4j2.xml` (in `tika-app-<version>.jar`).
+* tika-server: `org/apache/tika/server/log4j2.xml` (in the relevant 
`tika-server-*.jar`).
+
+Root level defaults to `INFO`. The bundled configurations are the source of 
truth — pull them out of the jar if you want to see exactly which loggers are 
tuned.
+
+=== Changing the log level
+
+In order of increasing reach:
+
+. **`tika-app` `-v` / `--verbose`** — sets the root logger to `DEBUG` for the 
current invocation only. Cheapest knob if you just want a noisier one-off run.
+. **`tika-server` `logLevel` config field** — set `"server": {"logLevel": 
"debug"}` (or `"info"`) in `tika-config.json`. Applied at server startup.
+. **Custom `log4j2.xml`** — for fine-grained control (per-logger levels, 
custom appenders, JSON output, file rotation), supply your own configuration 
via the standard Log4j 2 system property:
++
+[source,bash]
+----
+java -Dlog4j.configurationFile=/path/to/my-log4j2.xml -jar tika-app.jar ...
+----
++
+Your file overrides the bundled one entirely. Start from a copy of the bundled 
config and tighten or relax loggers from there.
+
+=== Forked-process logging
+
+Forked PipesServer JVMs inherit the parent's log4j2 configuration unless 
`tika.pipes.server.stdio=discard` is set (in which case all child stdout/stderr 
is suppressed at the OS level — see 
<<_configuration_knobs_reference,Configuration knobs reference>>).
+
+To debug a specific fork, leave stdio on `inherit` (the default) and grep 
parent log output for the `pipesClientId=<n>` marker that each fork includes.
+
 == Configuration knobs reference
 
 [cols="2,3"]
diff --git a/docs/modules/ROOT/pages/pipes/unpack-config.adoc 
b/docs/modules/ROOT/pages/pipes/unpack-config.adoc
index f3bc1fe5f4..5ad9615301 100644
--- a/docs/modules/ROOT/pages/pipes/unpack-config.adoc
+++ b/docs/modules/ROOT/pages/pipes/unpack-config.adoc
@@ -15,32 +15,62 @@
 // limitations under the License.
 //
 
-= UnpackConfig: Extracting Embedded Document Bytes
+= unpack-config: Extracting Embedded Document Bytes
 
 When processing container files (ZIP, DOCX, PDF with attachments, etc.), you 
may want to
-extract the raw bytes of embedded documents in addition to parsing them. 
`UnpackConfig`
-controls how embedded bytes are extracted and emitted.
+extract the raw bytes of embedded documents in addition to parsing them. The
+`unpack-config` component (Java: `UnpackConfig`) controls how embedded bytes 
are
+extracted and emitted.
 
 == Quick Start
 
-Use `ParseMode.UNPACK` to automatically extract embedded document bytes:
+To turn on byte extraction for every document the pipeline processes, set
+`parseMode` to `UNPACK` in the `pipes` section of your `tika-config.json`.
+That's the minimum configuration — extraction defaults are fine for most cases.
 
 [source,json]
 ----
 {
-  "id": "doc1",
-  "fetchKey": {"fetcherId": "fsf", "fetchKey": "container.docx"},
-  "emitKey": {"emitterId": "fse", "emitKey": "container.docx"},
-  "parseContext": {
+  "pipes": {
     "parseMode": "UNPACK"
   }
 }
 ----
 
+To tune extraction (size limits, naming, ZIP output, etc.), add an 
`unpack-config`
+block under the top-level `parse-context` section. All the options listed below
+live inside that block:
+
+[source,json]
+----
+{
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
+    "unpack-config": {
+      "maxUnpackBytes": 104857600,
+      "zipEmbeddedFiles": true
+    }
+  }
+}
+----
+
 This extracts both metadata (like `RMETA` mode) and embedded document bytes.
 
+[NOTE]
+====
+You can also set `UnpackConfig` programmatically per request from Java code by
+calling `parseContext.set(UnpackConfig.class, ...)` on the `ParseContext`
+attached to your `FetchEmitTuple`. The JSON `parse-context` section above is 
the
+declarative equivalent.
+====
+
 == Configuration Options
 
+All options below are fields of the `unpack-config` block — nest them inside
+`parse-context.unpack-config` as shown in the Quick Start.
+
 [cols="2,1,2,3"]
 |===
 |Property |Type |Default |Description
@@ -105,7 +135,7 @@ Extract embedded bytes with default naming:
 [source,json]
 ----
 {
-  "parseContext": {
+  "pipes": {
     "parseMode": "UNPACK"
   }
 }
@@ -118,8 +148,10 @@ Collect all embedded files into a ZIP with metadata:
 [source,json]
 ----
 {
-  "parseContext": {
-    "parseMode": "UNPACK",
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
     "unpack-config": {
       "zipEmbeddedFiles": true,
       "includeMetadataInZip": true,
@@ -136,8 +168,10 @@ Control output file naming:
 [source,json]
 ----
 {
-  "parseContext": {
-    "parseMode": "UNPACK",
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
     "unpack-config": {
       "zeroPadName": 8,
       "suffixStrategy": "DETECTED",
@@ -156,8 +190,10 @@ Prevent unbounded extraction from malicious files:
 [source,json]
 ----
 {
-  "parseContext": {
-    "parseMode": "UNPACK",
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
     "unpack-config": {
       "maxUnpackBytes": 104857600
     }
@@ -200,13 +236,15 @@ manifest with file checksums and MIME types, making it 
easy to verify and proces
 
 === Enabling Frictionless Output
 
-Set `outputFormat` to `FRICTIONLESS` in your UnpackConfig:
+Set `outputFormat` to `FRICTIONLESS` in your `unpack-config`:
 
 [source,json]
 ----
 {
-  "parseContext": {
-    "parseMode": "UNPACK",
+  "pipes": {
+    "parseMode": "UNPACK"
+  },
+  "parse-context": {
     "unpack-config": {
       "outputFormat": "FRICTIONLESS",
       "includeFullMetadata": true
@@ -256,13 +294,20 @@ The `datapackage.json` file contains:
 
 === CLI Usage
 
-Extract files in Frictionless format using the CLI:
+Extract files in Frictionless format using the CLI. The `-Z` flag turns on 
recursive
+unpack (the Pipes-mode counterpart of standard-mode `-z`), and `-i`/`-o` are 
the
+Pipes input/output directories:
 
 [source,bash]
 ----
-java -jar tika-app.jar --unpack --unpack-format=FRICTIONLESS -i input.docx -o 
output/
+java -jar tika-app.jar -Z --unpack-format=FRICTIONLESS -i /path/to/input -o 
/path/to/output
 ----
 
+NOTE: `-i` expects a directory of containers to unpack, not a single file. For
+one-off unpacking of a single document, see the standard-mode `-z`/`--extract`
+flag — though as of 4.x that path also routes through the Pipes machinery and
+expects an input directory.
+
 == Code Examples
 
 For working code examples, see:
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc 
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index 594828fc78..a5284985b6 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -20,12 +20,22 @@
 WARNING: The tika-app command line interface is still in flux for 4.x. Options 
and
 behavior may change before the final release.
 
-This section covers using Apache Tika from the command line via `tika-app`.
+This section covers using Apache Tika from the command line via `tika-app`. The
+authoritative option list is `java -jar tika-app.jar --help` — this page 
mirrors
+that output and adds usage context. If the two disagree, `--help` wins; please
+file a ticket.
 
 == Overview
 
 The Tika application (`tika-app`) is a command line utility for extracting
-text content and metadata from all sorts of files.
+text content and metadata from all sorts of files. It operates in three modes:
+
+* **Standard mode** — parse a single file, URL, or stdin and write the result
+  to stdout.
+* **GUI mode** — `--gui` launches a desktop window for drag-and-drop parsing.
+* **Tika Pipes mode** — process many documents from a directory (or S3, GCS,
+  Azure, JDBC, etc.) via the asynchronous Pipes pipeline. Activated by any of
+  the Pipes-only flags listed below.
 
 == Installation
 
@@ -54,7 +64,13 @@ unzipped distribution.
 java -jar tika-app.jar [option...] [file|port...]
 ----
 
-== Command Line Options
+If no file or URL is given (or `-` is given), `tika-app` parses standard input.
+If no arguments are given at all and no stdin is piped in, the GUI launches.
+
+== Standard-mode Options
+
+These options apply to single-document parsing (the default mode). For 
Pipes-mode
+options see <<_tika_pipes_processing,Tika Pipes Processing>> below.
 
 === Help and Information
 
@@ -63,29 +79,36 @@ java -jar tika-app.jar [option...] [file|port...]
 |Option |Description
 
 |`-?` or `--help`
-|Display usage instructions
+|Print the usage message
 
 |`-v` or `--verbose`
-|Enable debug-level output
+|Print debug-level messages
 
 |`-V` or `--version`
-|Show version details
+|Print the Apache Tika version
 |===
 
-=== Operation Modes
+=== GUI
 
 [cols="1,3"]
 |===
 |Option |Description
 
 |`-g` or `--gui`
-|Launch the graphical interface
+|Launch the graphical interface (drag-and-drop parsing)
+|===
 
-|`-s` or `--server`
-|Start the web server
+=== Configuration
 
-|`-f` or `--fork`
-|Enable fork mode for isolated extraction
+[cols="1,3"]
+|===
+|Option |Description
+
+|`--config=<tika-config.json>`
+|TikaConfig file (JSON as of Tika 4.x). Must appear before `-g` or `-f`.
+
+|`--convert-config-xml-to-json=<input.xml>`
+|Convert a legacy 3.x XML config to 4.x JSON format (parsers section only) and 
write to stdout. Redirect to save, e.g. 
`--convert-config-xml-to-json=tika-config.xml > tika-config.json`.
 |===
 
 === Output Formatting
@@ -95,22 +118,150 @@ java -jar tika-app.jar [option...] [file|port...]
 |Option |Description
 
 |`-x` or `--xml`
-|Output XHTML (default)
+|Output XHTML content (default)
 
 |`-h` or `--html`
-|Output HTML
+|Output HTML content
 
 |`-t` or `--text`
-|Output plain text
+|Output plain text content (body)
 
 |`--md`
-|Output Markdown
+|Output Markdown content (body)
+
+|`-T` or `--text-main`
+|Output plain text — main content only, via the boilerpipe handler
+
+|`-A` or `--text-all`
+|Output all text content
 
 |`-m` or `--metadata`
 |Output metadata only
 
 |`-j` or `--json`
-|Output JSON metadata
+|Output metadata in JSON
+
+|`-y` or `--xmp`
+|Output metadata in XMP
+
+|`-J` or `--jsonRecursive`
+|Output metadata and content from all embedded files. Combine with 
`-x`/`-h`/`-t`/`-m` to choose the content type (default: `-x`).
+
+|`-r` or `--pretty-print`
+|For JSON, XML, and XHTML output, add newlines and whitespace for readability.
+
+|`-e<X>` or `--encoding=<X>`
+|Use output encoding `<X>` (e.g. `UTF-8`).
+|===
+
+=== Detection and Language
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-d` or `--detect`
+|Detect the document type and print the media type.
+
+|`-l` or `--language`
+|Detect and print only the language.
+|===
+
+=== Content Options
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-p<X>` or `--password=<X>`
+|Use document password `<X>` (for encrypted PDFs, OOXML, etc.).
+
+|`--digest=<X>`
+|Include a digest of the parsed bytes. Supported: `md2`, `md5`, `sha1`, 
`sha256`, `sha384`, `sha512`, `sha3_256`, `sha3_384`, `sha3_512`. See 
xref:configuration/digesters.adoc[Digesters] for the underlying providers.
+|===
+
+=== Attachment Extraction (single-document)
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-z` or `--extract`
+a|Extract all attachments into the current directory.
+
+WARNING: As of 4.x `-z` routes through the async (Pipes) machinery, which 
expects an input directory, not a single file. Single-file attachment 
extraction is currently broken in this mode — see <<_tika_pipes_processing,Tika 
Pipes Processing>> below for the working `-Z` alternative.
+
+|`--extract-dir=<dir>`
+|Target directory for `-z`.
+
+|`--on-exists=<mode>`
+|Behavior when an output file already exists: `exception` (default), 
`replace`, or `skip`.
+
+|`--maxEmbeddedDepth=<X>`
+|Maximum depth for embedded document extraction.
+
+|`--maxEmbeddedCount=<X>`
+|Maximum number of embedded documents to extract.
+|===
+
+=== Async Mode
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-a` or `--async`
+|Run Tika in async mode. Requires a `tikaConfig` file describing the pipeline. 
Activates Tika Pipes mode — see below.
+|===
+
+=== Listing and Inspection
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`--list-parsers`
+|List the available document parsers.
+
+|`--list-parser-details`
+|List the available parsers and their supported mime types.
+
+|`--list-parser-details-apt`
+|Same as `--list-parser-details` in apt format.
+
+|`--list-detectors`
+|List the available document detectors.
+
+|`--list-met-models`
+|List the available metadata models and their supported keys.
+
+|`--list-supported-types`
+|List all known media types and related information.
+
+|`--compare-file-magic=<dir>`
+|Compare Tika's known media types to the `file(1)` tool's magic directory.
+|===
+
+=== Fork Mode (process isolation)
+
+Fork mode parses the document in a separate JVM, protecting the main process
+from parser crashes, OOM, and timeouts.
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-f` or `--fork`
+|Run parsing in a forked JVM process.
+
+|`--fork-timeout=<ms>`
+|Parse timeout in milliseconds (default: 60000).
+
+|`--fork-jvm-args=<args>`
+|JVM args for the forked process, comma-separated. Example: 
`--fork-jvm-args=-Xmx512m,-Dsome.prop=value`.
+
+|`--fork-plugins-dir=<dir>`
+|Directory containing plugin zips for the forked process.
 |===
 
 == Examples
@@ -129,31 +280,23 @@ java -jar tika-app.jar --text document.pdf
 java -jar tika-app.jar --json document.docx
 ----
 
-=== Pipeline processing
-
-Extract text from a remote document and search for keywords:
+=== Extract Markdown from a file
 
 [source,bash]
 ----
-curl http://example.com/document.doc | java -jar tika-app.jar --text | grep -q 
keyword
+java -jar tika-app.jar --md document.docx
 ----
 
-=== Tika Pipes processing
+=== Reading from stdin
 
-Process many documents by specifying input and output paths. Inputs can be a
-local directory, S3, GCS, Azure, JDBC, and others via Tika Pipes fetchers:
+Extract text from a remote document and search for keywords:
 
 [source,bash]
 ----
-java -jar tika-app.jar -i /path/to/input -o /path/to/output
+curl http://example.com/document.doc | java -jar tika-app.jar --text | grep -q 
keyword
 ----
 
-=== Extract Markdown from a file
-
-[source,bash]
-----
-java -jar tika-app.jar --md document.docx
-----
+`tika-app` reads from standard input when no file argument is given (or when 
`-` is given). For batch processing of many documents, see 
<<_tika_pipes_processing,Tika Pipes Processing>> below.
 
 === Custom configuration
 
@@ -178,13 +321,25 @@ it switches into Pipes mode so you can confirm which path 
is running.
 
 * Two positional arguments are given and the first is an existing directory
   (`tika-app.jar /in /out`).
-* Any of these options are present: `-i`, `-o`, `--input`, `--output`,
-  `--fileList`, `-z`/`-Z`/`--extract`/`--extract-dir`, or `-a`/`--async`.
 * A single `.json` argument is given — it is treated as a Tika Pipes config 
file.
+* Any of these options are present: `-i`, `--input`, `-o`, `--output`,
+  `--fileList`, `-z`, `--extract`, `--extract-dir`, `-Z`, or `-a`/`--async`.
 
-Anything else (single file, URL, stdin, `--gui`, `--server`) stays in standard
+Anything else (single file, URL, stdin, `--gui`) stays in standard
 single-document mode.
 
+NOTE: The activation list includes flags that are also documented as
+standard-mode options (`-z`, `--extract`, `--extract-dir`). Passing one of
+those with a single file routes into Pipes mode and then fails because the
+async dispatcher expects an input directory. To unpack attachments while
+running in Pipes mode, use the Pipes-specific `-Z` instead.
+
+CAUTION: Use the GNU-style double-dash form for long flags. `--input /path`
+works; `-input /path` (single dash plus the long name) does not — `tika-app`
+rejects single-dash long names with an `IllegalArgumentException` pointing
+you at the right form. Single-letter short flags use one dash
+(e.g., `-i`, `-eUTF-8`, `-X512m`).
+
 === Basic Pipes Usage
 
 [source,bash]
@@ -192,41 +347,92 @@ single-document mode.
 java -jar tika-app.jar -i /path/to/input -o /path/to/output
 ----
 
-This processes all files in the input directory and writes JSON metadata 
(RMETA format)
-to the output directory.
+This processes all files in the input directory and writes JSON metadata
+(RMETA format) to the output directory.
 
 === Tika Pipes Options
 
+==== Input and output
+
 [cols="1,3"]
 |===
 |Option |Description
 
-|`-i`
-|Input directory
+|`-i` or `--input=<dir>`
+|Input directory.
+
+|`-o` or `--output=<dir>`
+|Output directory.
+
+|`--fileList=<path>`
+|File list (one path per line, relative to `-i` or absolute).
+
+|`--on-exists=<mode>`
+|Behavior when an output file already exists: `exception` (default), 
`replace`, or `skip`.
+|===
+
+==== Output formatting
 
-|`-o`
-|Output directory
+[cols="1,3"]
+|===
+|Option |Description
 
-|`--handler`
-|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, 
`i`=ignore (default: `t`)
+|`--handler=<X>`
+|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, 
`i`=ignore. Default: `t`.
 
 |`--concatenate`
-|Concatenate content from all embedded documents into a single content field
+|Concatenate content from all embedded documents into a single content field.
 
 |`--content-only`
-|Output only extracted content (no metadata, no JSON wrapper); implies 
`--concatenate`
+|Output only the extracted content (no metadata, no JSON wrapper). Implies 
`--concatenate`.
+|===
+
+==== Execution
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-n` or `--numClients=<N>`
+|Number of parallel forked processes.
+
+|`-X<size>`
+|`-Xmx` size for the forked processes (e.g. `-X512m`).
+
+|`-T` or `--timeoutMs=<ms>`
+|Timeout for each parse in milliseconds.
+|===
+
+==== Configuration
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-c` or `--config=<file>`
+|Tika config file. `--config=<file>` (the standard-mode long form) also works 
in Pipes mode.
+
+|`-p` or `--pluginsDir=<dir>`
+|Plugins directory.
+|===
+
+==== Unpack (recursive attachment extraction)
+
+[cols="1,3"]
+|===
+|Option |Description
 
-|`--on-exists`
-|Behavior when an output file already exists: `exception` (default), `replace` 
or `skip`
+|`-Z`
+|Recursively unpack all attachments. This is the Pipes-mode counterpart to 
standard-mode `-z`.
 
-|`-T` or `--timeoutMs`
-|Timeout for each parse in milliseconds
+|`--unpack-format=<format>`
+|Output format: `REGULAR` (default) or `FRICTIONLESS`.
 
-|`-n` or `--numClients`
-|Number of parallel forked processes
+|`--unpack-mode=<mode>`
+|Output mode: `ZIPPED` (default) or `DIRECTORY`.
 
-|`-p` or `--pluginsDir`
-|Plugins directory
+|`--unpack-include-metadata`
+|Include `metadata.json` in Frictionless output.
 |===
 
 === Tika Pipes Examples
@@ -255,3 +461,10 @@ Use a Tika config file alongside the Pipes options. Both 
`--config=foo.json`
 ----
 java -jar tika-app.jar -i /path/to/input -o /path/to/output 
--config=tika-config.json
 ----
+
+Recursively unpack attachments into the output directory:
+
+[source,bash]
+----
+java -jar tika-app.jar -i /path/to/input -o /path/to/output -Z
+----
diff --git a/docs/modules/ROOT/pages/using-tika/index.adoc 
b/docs/modules/ROOT/pages/using-tika/index.adoc
index eaf944757b..a81c5ee6c5 100644
--- a/docs/modules/ROOT/pages/using-tika/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/index.adoc
@@ -16,6 +16,7 @@
 //
 
 = Getting Started with Apache Tika
+:toc:
 
 Apache Tika can be used in several ways depending on your needs. Choose the 
approach
 that best fits your use case.
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc 
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index 315a1f91a4..ffe142677a 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -24,6 +24,13 @@ This section covers running Apache Tika as a REST server via 
`tika-server`.
 Tika Server provides a RESTful HTTP interface for parsing documents and 
extracting
 content. It can be deployed as a standalone service or in a containerized 
environment.
 
+In Tika 4.x, all parsing happens in forked child processes via the Tika Pipes
+infrastructure — the request-handling process never loads parser libraries 
directly.
+This provides process isolation (a parser crash or OOM cannot take down the 
server)
+at the cost of requiring a Pipes configuration. See
+xref:migration-to-4x/migrating-tika-server-4x.adoc[Migrating Tika Server to 
4.x]
+for the full breaking-change list when upgrading from 3.x.
+
 == Basic Usage
 
 [source,bash]
@@ -31,84 +38,189 @@ content. It can be deployed as a standalone service or in 
a containerized enviro
 java -jar tika-server-standard-X.Y.Z.jar
 ----
 
-The server starts on port 9998 by default.
+The server starts on `localhost:9998` by default.
+
+== Command Line Options
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-h <host>` or `--host <host>`
+|Hostname to bind to. Default `localhost`. Use `*` to bind to all interfaces.
+
+|`-p <port>` or `--port <port>`
+|Listen port. Default `9998`.
+
+|`-c <file>` or `--config <file>`
+|Path to `tika-config.json`. See <<_configuration,Configuration>> below.
+
+|`-a <file>` or `--pluginsConfig <file>`
+|Path to the Tika Pipes plugins configuration file.
+
+|`-i <id>` or `--id <id>`
+|Server ID, surfaced in the `/status` endpoint and in logs.
+
+|`-?` or `--help`
+|Print the usage message.
+|===
+
+NOTE: Other behavior — `enableUnsecureFeatures`, CORS, TLS, timeouts — is 
configured
+in the JSON config file (see <<_configuration,Configuration>>), not via CLI 
flags.
 
 == Endpoints
 
+For the canonical endpoint inventory, including the PUT vs POST split and the
+multipart-config pattern introduced in 4.x, see the
+xref:migration-to-4x/migrating-tika-server-4x.adoc#_new_tika_endpoint_structure[New
 `/tika` Endpoint Structure]
+section of the migration guide. The most-used endpoints are summarized below.
+
 === Content Extraction (`/tika`)
 
-The `/tika` endpoint extracts content from a document as plain text.
+Simple PUT — the entire request body is the document, no metadata:
 
 [source,bash]
 ----
+# Default: raw XHTML
 curl -T document.pdf http://localhost:9998/tika
-----
 
-==== Markdown Output (`/tika/md`)
+# Explicit handler
+curl -T document.pdf http://localhost:9998/tika/text
+curl -T document.docx http://localhost:9998/tika/html
+curl -T document.docx http://localhost:9998/tika/md
+curl -T document.pdf http://localhost:9998/tika/json
+----
 
-The `/tika/md` endpoint extracts content as Markdown, preserving structural 
semantics
-like headings, lists, tables, and emphasis:
+POST with multipart for custom per-request configuration:
 
 [source,bash]
 ----
-curl -T document.docx http://localhost:9998/tika/md
+curl -X POST http://localhost:9998/tika/json \
+  -F "[email protected]" \
+  -F 
"config={\"pdf-parser\":{\"ocrStrategy\":\"no_ocr\"}};type=application/json"
 ----
 
-==== Custom Handler Type
+Valid handler paths under `/tika/`: `text`, `html`, `xml`, `md`, `json`. For
+the JSON variant, you can also nest a handler — `/tika/json/text`,
+`/tika/json/html`, etc. — to choose the content-field format inside the JSON
+envelope; that nested handler accepts the full set (`text`, `html`, `xml`,
+`md`, `markdown`, `body`, `ignore`).
+
+==== `X-Tika-Handler` header
 
-Use the `X-Tika-Handler` header to control the output format. Valid values: 
`text` (default),
-`html`, `xml`, `markdown`, `ignore`.
+For the root `/tika` PUT endpoint you can also pick the handler with a header:
 
 [source,bash]
 ----
 curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika
 ----
 
+Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`.
+
 === Recursive Metadata (`/rmeta`)
 
-The `/rmeta` endpoint returns metadata for the container document and all 
embedded documents
-as a JSON array of metadata objects.
+Returns metadata for the container document and all embedded documents as a 
JSON
+array of metadata objects. The handler controls the content field of each 
entry:
 
 [source,bash]
 ----
-curl -T document.pdf http://localhost:9998/rmeta
+curl -T document.pdf http://localhost:9998/rmeta            # default: text
+curl -T document.pdf http://localhost:9998/rmeta/text
+curl -T document.pdf http://localhost:9998/rmeta/html
+curl -T document.pdf http://localhost:9998/rmeta/xml
+curl -T document.docx http://localhost:9998/rmeta/markdown  # or /md
+curl -T document.pdf http://localhost:9998/rmeta/ignore     # metadata only
 ----
 
-Content handler can be specified in the URL path:
+=== Metadata only (`/meta`)
 
-* `/rmeta/text` - plain text content (default)
-* `/rmeta/html` - HTML content
-* `/rmeta/xml` - XHTML content
-* `/rmeta/markdown` - Markdown content
-* `/rmeta/ignore` - metadata only, no content
+Returns container-document metadata only (no recursive embedded list, no 
content):
 
 [source,bash]
 ----
-curl -T document.docx http://localhost:9998/rmeta/markdown
+curl -T document.pdf http://localhost:9998/meta
+curl -T document.pdf http://localhost:9998/meta/Content-Type   # single field
 ----
 
+=== Other endpoints
+
+* `/version` — server version
+* `/status` — health/status (includes server ID)
+* `/parsers` and `/parsers/details` — registered parsers
+* `/detectors` — registered detectors
+* `/mime-types` — known MIME types
+* `/detect/stream` — type detection only (no parsing)
+* `/language/stream`, `/language/string` — language detection
+* `/translate/all/\{translator}/\{src}/\{dest}` — translation
+* `/pipes`, `/async` — Pipes-based bulk processing
+
+== Configuration
+
+Server behavior beyond host/port is controlled by a JSON config file passed via
+`-c`/`--config`. The `server` section in that file maps to fields on
+`TikaServerConfig`; commonly-set fields include:
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`enableUnsecureFeatures`
+|`false`
+|Enable the `/config` family of endpoints (see 
<<_security_configuration,Security Configuration>>).
+
+|`cors`
+|`""` (off)
+|`*` to allow any origin, or an explicit origin string. Empty disables CORS.
+
+|`returnStackTrace`
+|`false`
+|Include parser stack traces in error responses. Useful in dev, dangerous in 
production (leaks internals).
+
+|`digest`
+|`""` (off)
+|Compute a digest of the parsed bytes. Comma-separated algorithm names: `md5`, 
`sha1`, `sha256`, `sha384`, `sha512`.
+
+|`digestMarkLimit`
+|`20971520` (20 MiB)
+|Max bytes buffered for digest computation.
+
+|`logLevel`
+|_inherited_
+|`debug` or `info` to override the runtime log level.
+
+|`idBase`
+|random UUID
+|Override the auto-generated server ID (the `-i` CLI flag is the same setting).
+|===
+
+For the full Pipes-related sections (`pipes`, `fetchers`, `emitters`, 
`parse-context`)
+that tika-server 4.x requires, see
+xref:migration-to-4x/migrating-tika-server-4x.adoc#_configuration_changes[Configuration
 Changes].
+
 == Topics
 
-* xref:using-tika/server/tls.adoc[TLS/SSL Configuration] - Secure your server 
with TLS and mutual authentication
+* xref:using-tika/server/tls.adoc[TLS/SSL Configuration] — Secure your server 
with TLS and mutual authentication
+* xref:migration-to-4x/migrating-tika-server-4x.adoc[Migrating Tika Server to 
4.x] — Breaking changes from 3.x
 
 == Security Configuration
 
 === Config Endpoint Protection
 
-By default, the `/config` endpoints that expose server configuration are 
disabled for security
-reasons. These endpoints can reveal sensitive information about your server 
configuration,
-including parser settings and system properties (see 
https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271[CVE-2015-3271]).
+By default, the `/config` family of endpoints that expose server configuration 
are
+disabled. These endpoints can reveal sensitive information about your server,
+including parser settings and system properties (see
+https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271[CVE-2015-3271]).
 
-The protected endpoints include:
+Protected endpoints include:
 
-* `/config` - Returns the server's full configuration
-* `/config/parsers` - Returns configured parsers
-* `/config/detectors` - Returns configured detectors
-* `/config/mimeTypes` - Returns MIME type mappings
+* `/tika/config` and `/tika/config/{text,html,xml,md,json}` — POST with 
multipart config
+* `/rmeta/config` — POST with multipart config
+* `/meta/config` — POST with multipart config
 
 === Enabling Config Endpoints
 
-To enable these endpoints:
+The setting is JSON-only — there is no CLI flag. Set `enableUnsecureFeatures` 
in
+your config file's `server` section:
 
 [source,json]
 ----
@@ -119,24 +231,17 @@ To enable these endpoints:
 }
 ----
 
-WARNING: Only enable `enableUnsecureFeatures` if you have secured access to 
Tika Server through
-network controls (firewalls, private subnets), a reverse proxy (nginx, Apache 
httpd), or
-xref:using-tika/server/tls.adoc[2-way TLS authentication]. Exposing config 
endpoints to
-untrusted networks can help attackers identify vulnerabilities and craft 
targeted attacks.
-
-=== Command Line Usage
-
-You can also enable unsecure features via command line:
-
-[source,bash]
-----
-java -jar tika-server-standard-X.Y.Z.jar --enableUnsecureFeatures
-----
+WARNING: Only enable `enableUnsecureFeatures` if you have secured access to 
Tika
+Server through network controls (firewalls, private subnets), a reverse proxy
+(nginx, Apache httpd), or
+xref:using-tika/server/tls.adoc[2-way TLS authentication]. Exposing config 
endpoints
+to untrusted networks can help attackers identify vulnerabilities and craft
+targeted attacks.
 
 === Security Best Practices
 
-1. **Keep config endpoints disabled** in production (default behavior)
-2. **Use network controls** to restrict access to the Tika Server (firewall 
rules, private subnets)
-3. **Consider TLS** for encrypted communication - see 
xref:using-tika/server/tls.adoc[TLS Configuration]
-4. **Run with minimal privileges** - don't run Tika Server as root
-5. **Monitor logs** for unusual access patterns
+1. **Keep config endpoints disabled** in production (default behavior).
+2. **Use network controls** to restrict access (firewall rules, private 
subnets).
+3. **Consider TLS** for encrypted communication — see 
xref:using-tika/server/tls.adoc[TLS Configuration].
+4. **Run with minimal privileges** — don't run Tika Server as root.
+5. **Monitor logs** for unusual access patterns.
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 980f2833c5..3746260c3c 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -38,11 +38,13 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -68,6 +70,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.async.cli.TikaAsyncCLI;
 import org.apache.tika.config.EmbeddedLimits;
 import org.apache.tika.config.TimeoutLimits;
+import org.apache.tika.config.loader.ComponentRegistry;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.Detector;
@@ -470,9 +473,15 @@ public class TikaCLI {
         } else if (arg.equals("--list-parser") || 
arg.equals("--list-parsers")) {
             pipeMode = false;
             displayParsers(false, false);
+        } else if (arg.equals("--list-parser-names")) {
+            pipeMode = false;
+            displayParserNames();
         } else if (arg.equals("--list-detector") || 
arg.equals("--list-detectors")) {
             pipeMode = false;
             displayDetectors();
+        } else if (arg.equals("--list-detector-names")) {
+            pipeMode = false;
+            displayDetectorNames();
         } else if (arg.equals("--list-parser-detail") || 
arg.equals("--list-parser-details")) {
             pipeMode = false;
             displayParsers(true, false);
@@ -560,13 +569,28 @@ public class TikaCLI {
             maxEmbeddedCount = 
Integer.parseInt(arg.substring("--maxEmbeddedCount=".length()));
         } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
             prettyPrint = true;
-        } else if (arg.equals("-p") || arg.equals("--port") || 
arg.equals("-s") || arg.equals("--server")) {
-            throw new IllegalArgumentException("As of Tika 2.0, the server 
option is no longer supported in tika-app.\n" + "See 
https://wiki.apache.org/tika/TikaJAXRS for usage.");
         } else if (arg.startsWith("-c")) {
             networkURI = new URI(arg.substring("-c".length()));
         } else if (arg.startsWith("--client=")) {
             networkURI = new URI(arg.substring("--client=".length()));
         } else {
+            // Any arg that reaches here is either "-" (stdin), an existing
+            // file, a URL, or an unknown/typo'd flag. The default fallthrough
+            // lets typos like "-input /path" hit new URL("-input") and
+            // surface as a confusing MalformedURLException. Catch 
dash-prefixed
+            // args that aren't the stdin marker or an existing file and emit
+            // an actionable error before that happens.
+            if (arg.startsWith("-") && !arg.equals("-") && !new 
File(arg).exists()) {
+                String hint = " Run with --help for the full option list.";
+                // Heuristic: single-dash + multi-letter (e.g. "-input") is
+                // usually a long-form-with-one-dash typo. Single-dash + one
+                // letter (e.g. "-s") or "--<unknown>" is just an unknown flag.
+                if (arg.length() > 2 && !arg.startsWith("--")) {
+                    hint = " Long-form flags require two dashes (try '-"
+                            + arg + "' instead of '" + arg + "')." + hint;
+                }
+                throw new IllegalArgumentException("Unknown option '" + arg + 
"'." + hint);
+            }
             pipeMode = false;
             configure();
 
@@ -815,7 +839,8 @@ public class TikaCLI {
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
         out.println("           --digest=X      Include digest X (md2, md5, 
sha1,");
-        out.println("                               sha256, sha384, sha512");
+        out.println("                               sha256, sha384, sha512,");
+        out.println("                               sha3_256, sha3_384, 
sha3_512)");
         out.println("    -eX or --encoding=X    Use output encoding X");
         out.println("    -pX or --password=X    Use document password X");
         out.println("    -z  or --extract       Extract all attachements into 
current directory");
@@ -837,12 +862,17 @@ public class TikaCLI {
         out.println();
         out.println("    --list-parsers");
         out.println("         List the available document parsers");
+        out.println("    --list-parser-names");
+        out.println("         List parsers as tab-separated 
class-name<TAB>friendly-name");
+        out.println("         (friendly names are the kebab-case keys used in 
JSON config)");
         out.println("    --list-parser-details");
         out.println("         List the available document parsers and their 
supported mime types");
         out.println("    --list-parser-details-apt");
         out.println("         List the available document parsers and their 
supported mime types in apt format.");
         out.println("    --list-detectors");
         out.println("         List the available document detectors");
+        out.println("    --list-detector-names");
+        out.println("         List detectors as tab-separated 
class-name<TAB>friendly-name");
         out.println("    --list-met-models");
         out.println("         List the available metadata models, and their 
supported keys");
         out.println("    --list-supported-types");
@@ -878,8 +908,8 @@ public class TikaCLI {
         out.println("         java -jar tika-app.jar <inputDirectory> 
<outputDirectory>");
         out.println();
         out.println("Tika Pipes Options:");
-        out.println("    -i                         Input directory");
-        out.println("    -o                         Output directory");
+        out.println("    -i, --input=<dir>          Input directory");
+        out.println("    -o, --output=<dir>         Output directory");
         out.println("    -n, --numClients           Number of forked 
processes");
         out.println("    -X                         -Xmx in the forked 
processes");
         out.println("    -T, --timeoutMs            Timeout for each parse in 
milliseconds");
@@ -1056,6 +1086,65 @@ public class TikaCLI {
         }
     }
 
+    /**
+     * Lists the parser classes reachable from the {@code parser} field, one
+     * per line, via {@link #printNames(Set, String)} using the "parsers" index.
+     * {@code configure()} is invoked first.
+     */
+    private void displayParserNames() throws TikaException, IOException, SAXException {
+        configure();
+        Set<Class<?>> parserClasses = new LinkedHashSet<>();
+        collectParserClasses(parser, parserClasses);
+        printNames(parserClasses, "parsers");
+    }
+
+    /**
+     * Lists the detector classes reachable from the {@code detector} field,
+     * one per line, via {@link #printNames(Set, String)} using the
+     * "detectors" index. {@code configure()} is invoked first.
+     */
+    private void displayDetectorNames() throws TikaException, IOException, SAXException {
+        configure();
+        Set<Class<?>> detectorClasses = new LinkedHashSet<>();
+        collectDetectorClasses(detector, detectorClasses);
+        printNames(detectorClasses, "detectors");
+    }
+
+    /**
+     * Recursively gathers the concrete parser classes beneath {@code p}.
+     * A {@link ParserDecorator} is unwrapped one level, a
+     * {@link CompositeParser} is descended into, and any other parser
+     * contributes its own class to {@code out}.
+     */
+    private void collectParserClasses(Parser p, Set<Class<?>> out) {
+        Parser target = p instanceof ParserDecorator
+                ? ((ParserDecorator) p).getWrappedParser()
+                : p;
+        if (!(target instanceof CompositeParser)) {
+            out.add(target.getClass());
+            return;
+        }
+        for (Parser child : ((CompositeParser) target).getParsers().values()) {
+            collectParserClasses(child, out);
+        }
+    }
+
+    /**
+     * Recursively gathers the concrete detector classes beneath {@code d}:
+     * a {@link CompositeDetector} is descended into, and any other detector
+     * contributes its own class to {@code out}.
+     */
+    private void collectDetectorClasses(Detector d, Set<Class<?>> out) {
+        if (!(d instanceof CompositeDetector)) {
+            out.add(d.getClass());
+            return;
+        }
+        for (Detector child : ((CompositeDetector) d).getDetectors()) {
+            collectDetectorClasses(child, out);
+        }
+    }
+
+    /**
+     * Prints one line per class to standard output: the fully qualified class
+     * name, a tab, and the friendly name the registry reports for it, or
+     * {@code (not registered)} when the registry returns {@code null}.
+     *
+     * @param classes       component classes to list; printed sorted by class name
+     * @param indexFileName index name handed to {@link ComponentRegistry}
+     * @throws TikaException if the {@link ComponentRegistry} cannot be created
+     */
+    private void printNames(Set<Class<?>> classes, String indexFileName) throws TikaException {
+        // Look up friendly names via ComponentRegistry, which reads them from
+        // the META-INF/tika/<indexFileName>.idx files generated at compile
+        // time by the @TikaComponent annotation processor. (The annotation
+        // itself has CLASS retention, so reflection on the class can't see it
+        // at runtime — the .idx file is the authoritative source.)
+        ComponentRegistry registry = new ComponentRegistry(indexFileName,
+                Thread.currentThread().getContextClassLoader());
+        // Sort by class name for stable output; tab-separated so downstream
+        // scripts can `cut -f2` to get the JSON-config names.
+        List<Class<?>> sorted = new ArrayList<>(classes);
+        sorted.sort(Comparator.comparing(Class::getName));
+        for (Class<?> cls : sorted) {
+            String fname = registry.getFriendlyName(cls);
+            System.out.println(cls.getName() + "\t" + (fname != null ? fname : "(not registered)"));
+        }
+    }
+
     private String indent(int indent) {
         return "                     ".substring(0, indent);
     }
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
index fed6ba3af7..ccaf17bc91 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml
@@ -51,6 +51,12 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java
new file mode 100644
index 0000000000..d7c5e1e4aa
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr.tess4j;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Validates the Tess4J configuration examples used in the docs.
+ *
+ * <p>The JSON files under {@code src/test/resources/config-examples/} are
+ * symlinked from {@code docs/modules/ROOT/examples/}, so any change that
+ * keeps these tests passing also keeps the published docs correct.
+ *
+ * <p>If you change a tess4j example JSON in the docs tree, this test will fail
+ * unless the JSON still loads against {@link Tess4JConfig} / {@link 
Tess4JParser}.
+ * That's the point: documentation drift is caught at build time.
+ */
+public class Tess4JConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+
+    @TempDir
+    Path tempDir;
+
+    /**
+     * Reads the named example from the classpath, writes it to a temporary
+     * {@code tika-config.json}, loads it with {@link TikaLoader}, and asserts
+     * that a non-null parser is returned.
+     */
+    private Parser loadAndValidate(String resourceName) throws Exception {
+        String exampleJson;
+        try (InputStream stream = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(stream, "Resource not found: " + resourceName);
+            exampleJson = new String(stream.readAllBytes(), StandardCharsets.UTF_8);
+        }
+        Path target = tempDir.resolve("tika-config.json");
+        Files.writeString(target, exampleJson, StandardCharsets.UTF_8);
+        Parser loaded = TikaLoader.load(target).loadParsers();
+        assertNotNull(loaded, "Parser should not be null for: " + resourceName);
+        return loaded;
+    }
+
+    /** The minimal documented Tess4J example must load. */
+    @Test
+    public void testTess4JBasicConfig() throws Exception {
+        loadAndValidate("tess4j-basic.json");
+    }
+
+    /** The fully populated documented Tess4J example must load. */
+    @Test
+    public void testTess4JFullConfig() throws Exception {
+        loadAndValidate("tess4j-full.json");
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json
new file mode 100644
index 0000000000..f75ee7a74f
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json
@@ -0,0 +1,11 @@
+{
+  "parsers": [
+    {
+      "tess4j-parser": {
+        "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+        "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+        "poolSize": 4
+      }
+    }
+  ]
+}
diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json
 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json
new file mode 100644
index 0000000000..8ad9c9d661
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json
@@ -0,0 +1,20 @@
+{
+  "parsers": [
+    {
+      "tess4j-parser": {
+        "dataPath": "/usr/share/tesseract-ocr/5/tessdata",
+        "nativeLibPath": "/usr/lib/x86_64-linux-gnu",
+        "language": "eng",
+        "pageSegMode": 1,
+        "ocrEngineMode": 3,
+        "poolSize": 4,
+        "timeoutSeconds": 120,
+        "dpi": 300,
+        "minFileSizeToOcr": 0,
+        "maxFileSizeToOcr": 2147483647,
+        "maxImagePixels": 100000000,
+        "skipOcr": false
+      }
+    }
+  ]
+}
diff --git a/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml 
b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
index c2cdeac4ee..81f31363c5 100644
--- a/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
+++ b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml
@@ -76,6 +76,12 @@
       <artifactId>junit-jupiter</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git 
a/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java
 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java
new file mode 100644
index 0000000000..7b3da8a5f0
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.vlm;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Validates the VLM (OpenAI / Claude / Gemini) configuration examples used
+ * in the docs.
+ *
+ * <p>The JSON files under {@code src/test/resources/config-examples/} are
+ * symlinked from {@code docs/modules/ROOT/examples/}, so any change that
+ * keeps these tests passing also keeps the published docs correct.
+ *
+ * <p>The tests only validate that the JSON deserializes and the parser
+ * constructs — no HTTP call is made to any model endpoint, so they're safe
+ * to run without network access or API keys.
+ */
+public class VLMConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+
+    @TempDir
+    Path tempDir;
+
+    private Parser loadAndValidate(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            String json = new String(is.readAllBytes(), StandardCharsets.UTF_8);
+            Path configFile = tempDir.resolve("tika-config.json");
+            Files.writeString(configFile, json, StandardCharsets.UTF_8);
+            TikaLoader loader = TikaLoader.load(configFile);
+            Parser parser = loader.loadParsers();
+            assertNotNull(parser, "Parser should not be null for: " + resourceName);
+            return parser;
+        }
+    }
+
+    @Test
+    public void testOpenAIVLMBasic() throws Exception {
+        loadAndValidate("openai-vlm-basic.json");
+    }
+
+    @Test
+    public void testOpenAIVLMFull() throws Exception {
+        loadAndValidate("openai-vlm-full.json");
+    }
+
+    @Test
+    public void testClaudeVLMBasic() throws Exception {
+        loadAndValidate("claude-vlm-basic.json");
+    }
+
+    @Test
+    public void testClaudeVLMFull() throws Exception {
+        loadAndValidate("claude-vlm-full.json");
+    }
+
+    @Test
+    public void testGeminiVLMBasic() throws Exception {
+        loadAndValidate("gemini-vlm-basic.json");
+    }
+
+    @Test
+    public void testGeminiVLMFull() throws Exception {
+        loadAndValidate("gemini-vlm-full.json");
+    }
+
+    @Test
+    public void testVLMForPdfParsing() throws Exception {
+        loadAndValidate("vlm-pdf-parsing.json");
+    }
+}
diff --git a/docs/modules/ROOT/examples/claude-vlm-basic.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json
similarity index 100%
copy from docs/modules/ROOT/examples/claude-vlm-basic.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json
diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json
similarity index 84%
copy from docs/modules/ROOT/examples/claude-vlm-full.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json
index 9dc7ff67d3..682540197b 100644
--- a/docs/modules/ROOT/examples/claude-vlm-full.json
+++ 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json
@@ -11,7 +11,9 @@
         "inlineContent": true,
         "skipOcr": false,
         "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
+        "maxFileSizeToOcr": 52428800,
+        "maxImagePixels": 100000000,
+        "allowRuntimePrompt": false
       }
     }
   ]
diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json
similarity index 100%
copy from docs/modules/ROOT/examples/gemini-vlm-basic.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json
diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json
similarity index 84%
copy from docs/modules/ROOT/examples/gemini-vlm-full.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json
index ab09b993f0..8773b52eac 100644
--- a/docs/modules/ROOT/examples/gemini-vlm-full.json
+++ 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json
@@ -11,7 +11,9 @@
         "inlineContent": true,
         "skipOcr": false,
         "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
+        "maxFileSizeToOcr": 52428800,
+        "maxImagePixels": 100000000,
+        "allowRuntimePrompt": false
       }
     }
   ]
diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json
similarity index 100%
copy from docs/modules/ROOT/examples/openai-vlm-basic.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json
diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json
similarity index 77%
copy from docs/modules/ROOT/examples/openai-vlm-full.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json
index 91baafc74e..9c80fb77b4 100644
--- a/docs/modules/ROOT/examples/openai-vlm-full.json
+++ 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json
@@ -3,6 +3,7 @@
     {
       "openai-vlm-parser": {
         "baseUrl": "http://127.0.0.1:8000",
+        "completionsPath": "/v1/chat/completions",
         "model": "jinaai/jina-vlm",
         "prompt": "Extract all visible text from this image. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the image. Only return the extracted text.",
         "maxTokens": 4096,
@@ -11,7 +12,9 @@
         "inlineContent": true,
         "skipOcr": false,
         "minFileSizeToOcr": 0,
-        "maxFileSizeToOcr": 52428800
+        "maxFileSizeToOcr": 52428800,
+        "maxImagePixels": 100000000,
+        "allowRuntimePrompt": false
       }
     }
   ]
diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json
similarity index 100%
copy from docs/modules/ROOT/examples/vlm-pdf-parsing.json
copy to 
tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
index 4e3e75aeae..96282dbe14 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
@@ -23,6 +23,7 @@
         "outputType": "TXT",
         "pageSeparator": "",
         "pageSegMode": "1",
+        "preloadLangs": false,
         "preserveInterwordSpacing": false,
         "resize": 200,
         "skipOcr": false,
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index f006d43222..72f9f0f42f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -440,7 +440,7 @@ public class TesseractOCRConfig implements Serializable {
 
     /**
      * @param resize the resize to set. Valid range of values is 100-900.
-     *               Default value is 900.
+     *               Default value is 200 (see the {@code resize} field initializer).
      */
     public void setResize(int resize) {
         for (int i = 1; i < 10; i++) {
diff --git 
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
 
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 845b5b1940..fb19447111 100644
--- 
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ 
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -57,7 +57,11 @@ import org.apache.tika.utils.StringUtils;
 public class TikaAsyncCLI {
 
     private static final long TIMEOUT_MS = 600_000;
-    private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class);
+    // Use the user-facing "tika.pipes" name rather than the FQ class name so
+    // the internal TikaAsyncCLI detail doesn't leak into user-visible logs.
+    // tika-app users invoke the Pipes processor via -i/-o flags and shouldn't
+    // need to know about the underlying async CLI class.
+    private static final Logger LOG = LoggerFactory.getLogger("tika.pipes");
 
     private static Options getOptions() {
         Options options = new Options();
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java
index 16f2f2b3e0..67afe6e5c6 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java
@@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "endpoint": "https://account.blob.core.windows.net",
  *     "container": "my-container",
  *     "prefix": "documents/",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java
index c052ce03a6..bfd9dda160 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java
@@ -34,10 +34,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "csvPath": "/path/to/files.csv",
  *     "fetchKeyColumn": "path",
  *     "emitKeyColumn": "id",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java
index 91df67eb42..6a6ab42033 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java
@@ -33,10 +33,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *   "file-system-pipes-iterator": {
  *     "basePath": "/path/to/files",
  *     "countTotal": true,
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java
index cba8c18336..748d0f467c 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java
@@ -34,10 +34,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "projectId": "my-project",
  *     "bucket": "my-bucket",
  *     "prefix": "documents/",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java
index d805cf468c..a4571ba37b 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java
@@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "select": "select id, path from documents",
  *     "fetchKeyColumn": "path",
  *     "idColumn": "id",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java
index b6f6c683c0..ee950154b9 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java
@@ -32,10 +32,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  * "pipes-iterator": {
  *   "json-pipes-iterator": {
  *     "jsonPath": "/path/to/files.json",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java
index 4698e20337..ff7336292e 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java
@@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "bootstrapServers": "localhost:9092",
  *     "groupId": "my-group",
  *     "autoOffsetReset": "earliest",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java
index 05aa62132d..33b52cae6c 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java
@@ -36,10 +36,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "prefix": "documents/",
  *     "credentialsProvider": "profile",
  *     "profile": "default",
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java
index f822e2c68f..c863afe1ac 100644
--- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java
@@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig;
  *     "solrUrls": ["http://localhost:8983/solr"],
  *     "idField": "id",
  *     "rows": 5000,
- *     "baseConfig": {
- *       "fetcherId": "my-fetcher",
- *       "emitterId": "my-emitter"
- *     }
+ *     "fetcherId": "my-fetcher",
+ *     "emitterId": "my-emitter"
  *   }
  * }
  * </pre>
diff --git 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java
 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java
index 7287a4cbd1..40bd04b137 100644
--- 
a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java
+++ 
b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java
@@ -71,6 +71,19 @@ public class ThreadSafeUnzipper {
             return;
         }
 
+        // Destination exists but has no completion marker. Possible causes:
+        // a previous extraction was killed mid-stream, the marker was deleted
+        // out from under us, or something other than our extractor put files
+        // there. Without this cleanup the subsequent Files.move() below will
+        // fail with DirectoryNotEmptyException on every run until a human
+        // manually removes the directory. Treat the half-extracted state as
+        // garbage and rebuild.
+        if (Files.exists(destination)) {
+            LOG.warn("destination {} exists without a completion marker; "
+                    + "treating as stale partial extraction and removing", destination);
+            deleteRecursively(destination);
+        }
+
         // Extract to a unique temp directory
         Path tempDir = destination.resolveSibling(
                 destination.getFileName() + ".tmp." + UUID.randomUUID());

(tika) 01/01: TIKA-4746 -- sweep docs

Reply via email to