(tika) branch main updated: update-4x-docs (#2802)

tallison Tue, 05 May 2026 06:10:18 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 9ac71296c0 update-4x-docs (#2802)
9ac71296c0 is described below

commit 9ac71296c0db1ddda986b883a5469c3accc80f67
Author: Tim Allison <[email protected]>
AuthorDate: Tue May 5 09:10:01 2026 -0400

    update-4x-docs (#2802)
---
 docs/.gitignore                                    |   3 +
 docs/build-docs.sh                                 |  53 ------
 .../ROOT/examples/migration-full-example.json      |   2 +-
 docs/modules/ROOT/examples/pdf-parser-basic.json   |   2 +-
 docs/modules/ROOT/examples/pdf-parser-full.json    |   2 +-
 docs/modules/ROOT/examples/tesseract-basic.json    |   2 +-
 docs/modules/ROOT/examples/tesseract-full.json     |   2 +-
 docs/modules/ROOT/nav.adoc                         |   1 -
 .../pages/advanced/charset-detection-design.adoc   |   2 +-
 .../ROOT/pages/advanced/junk-detection-build.adoc  |  16 +-
 .../ROOT/pages/advanced/language-detection.adoc    |  19 ---
 .../pages/configuration/encoding-detectors.adoc    | 183 +++++++++++----------
 .../configuration/parsers/external-parser.adoc     |   8 +-
 .../pages/maintainers/release-guides/tika.adoc     | 123 +++++++++++++-
 docs/modules/ROOT/pages/maintainers/site.adoc      |  36 ++--
 .../pages/migration-to-4x/design-notes-4x.adoc     |   2 +-
 docs/modules/ROOT/pages/migration-to-4x/index.adoc |   2 +
 docs/modules/ROOT/pages/using-tika/grpc/index.adoc |  22 +++
 docs/pom.xml                                       |  82 +++++++--
 docs/publish-docs.sh                               |  51 ++++++
 20 files changed, 397 insertions(+), 216 deletions(-)

diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000000..120d89fc35
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+# Generated by maven-antrun-plugin from antora-playbook.yml at build time.
+# Contains the current git-commit stamp injected for the docs home page.
+antora-playbook-stamped.yml
diff --git a/docs/build-docs.sh b/docs/build-docs.sh
deleted file mode 100755
index 030ca1199d..0000000000
--- a/docs/build-docs.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Builds the Antora docs site with the current git commit stamped on the home 
page.
-# Usage: ./build-docs.sh
-# Output: target/site/
-#
-# To publish to the tika-site SVN repo:
-#   ./build-docs.sh --publish /path/to/tika-site/publish
-
-set -euo pipefail
-cd "$(dirname "$0")"
-
-COMMIT=$(git rev-parse --short HEAD)
-DATE=$(date -u +%Y-%m-%d)
-
-# Inject commit into playbook, build, restore
-sed -i "/tika-stable-version/a\\    git-commit: '${COMMIT} (${DATE})'" 
antora-playbook.yml
-trap 'git checkout antora-playbook.yml' EXIT
-
-# Pass remaining args to Maven (filter out our --publish flag)
-PUBLISH_DIR=""
-MVN_ARGS=()
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --publish)
-            PUBLISH_DIR="$2"
-            shift 2
-            ;;
-        *)
-            MVN_ARGS+=("$1")
-            shift
-            ;;
-    esac
-done
-
-../mvnw antora:antora "${MVN_ARGS[@]}"
-
-echo "Site built at: target/site/"
-echo "Commit: ${COMMIT} (${DATE})"
-
-if [[ -n "${PUBLISH_DIR}" ]]; then
-    # Flatten: skip the 'tika/' component directory so URLs are 
/docs/4.0.0-SNAPSHOT/
-    # Copy UI assets one level above docs/ since HTML uses ../../_/ relative 
paths
-    DOCS_DIR="${PUBLISH_DIR}/docs"
-    mkdir -p "${DOCS_DIR}"
-    cp -r target/site/tika/* "${DOCS_DIR}/"
-    cp -r target/site/_/ "${PUBLISH_DIR}/_/"
-    # Fix the root redirect to match flattened layout
-    sed 's|tika/||g' target/site/index.html > "${DOCS_DIR}/index.html"
-    sed 's|/docs/tika/|/docs/|g' target/site/sitemap.xml > 
"${DOCS_DIR}/sitemap.xml"
-    cp target/site/404.html "${DOCS_DIR}/"
-    cp target/site/search-index.js "${DOCS_DIR}/"
-    echo "Published to: ${DOCS_DIR}/"
-fi
diff --git a/docs/modules/ROOT/examples/migration-full-example.json 
b/docs/modules/ROOT/examples/migration-full-example.json
index 05f93d7f23..7ce787b42d 120000
--- a/docs/modules/ROOT/examples/migration-full-example.json
+++ b/docs/modules/ROOT/examples/migration-full-example.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/migration-full-example.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pdf-parser-basic.json 
b/docs/modules/ROOT/examples/pdf-parser-basic.json
index b1a2ae805f..a0fa8b34ca 120000
--- a/docs/modules/ROOT/examples/pdf-parser-basic.json
+++ b/docs/modules/ROOT/examples/pdf-parser-basic.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pdf-parser-full.json 
b/docs/modules/ROOT/examples/pdf-parser-full.json
index 922388d57b..eeaa9e50c2 120000
--- a/docs/modules/ROOT/examples/pdf-parser-full.json
+++ b/docs/modules/ROOT/examples/pdf-parser-full.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tesseract-basic.json 
b/docs/modules/ROOT/examples/tesseract-basic.json
index 1a508253c6..2844915265 120000
--- a/docs/modules/ROOT/examples/tesseract-basic.json
+++ b/docs/modules/ROOT/examples/tesseract-basic.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tesseract-full.json 
b/docs/modules/ROOT/examples/tesseract-full.json
index c5d2e0c823..f1e5bc8bde 120000
--- a/docs/modules/ROOT/examples/tesseract-full.json
+++ b/docs/modules/ROOT/examples/tesseract-full.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 16429e45bf..979555022a 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -46,7 +46,6 @@
 ** xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies]
 ** xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler 
Requirements]
 * xref:advanced/index.adoc[Advanced]
-** xref:advanced/charset-detection-design.adoc[Charset Detection Pipeline]
 ** xref:advanced/language-detection.adoc[Language Detection]
 ** xref:advanced/language-detection-build.adoc[Building the Language Detector]
 ** xref:advanced/junk-detection.adoc[Text Quality Scoring (Junk Detection)]
diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc 
b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
index 7c45d7ea23..fc870e5d51 100644
--- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
+++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
@@ -510,4 +510,4 @@ with feature hashing.  The move to NB was driven by:
 == See also
 
 * Configuration: xref:../configuration/encoding-detectors.adoc[Configuring 
Encoding Detectors]
-* Language detection: xref:language-detection.adoc[Language Detection]
+* Language detection: xref:advanced/language-detection.adoc[Language Detection]
diff --git a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc 
b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
index 046099899f..c5397a7464 100644
--- a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
+++ b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
@@ -58,11 +58,11 @@ Source data lives in one directory per language (ISO 639 
code), each containing
 up to two files:
 
 `sentences_wikipedia.txt`::
-  Line-numbered Wikipedia sentences: `{lineNum}{TAB}{text}`.
+  Line-numbered Wikipedia sentences: `\{lineNum}\{TAB}\{text}`.
   One sentence per line.
 
 `sentences_madlad.txt`::
-  Line-numbered MADLAD-400 documents: `{lineNum}{TAB}{text}`.
+  Line-numbered MADLAD-400 documents: `\{lineNum}\{TAB}\{text}`.
   Documents contain literal two-character `\n` escape sequences as
   sub-sentence separators.  The tool splits on these before processing.
 
@@ -108,16 +108,16 @@ per script:
 |===
 | File | Split | Purpose
 
-| `{script}.train.gz`
+| `\{script}.train.gz`
 | 80%
 | Bigram count accumulation in `TrainJunkModel`.
 
-| `{script}.dev.gz`
+| `\{script}.dev.gz`
 | 10%
 | Calibration (mu/sigma estimation) in `TrainJunkModel`.
   Also used for iterative evaluation during development.
 
-| `{script}.test.gz`
+| `\{script}.test.gz`
 | 10%
 | **Held out completely.**  Use only for final reported evaluation numbers.
   Never use to make model or threshold decisions.
@@ -146,7 +146,7 @@ Key options:
 
 | `--output-dir`
 | `~/datasets/madlad/junkdetect`
-| Where to write `{script}.train.gz`, `.dev.gz`, `.test.gz`, and 
`manifest.tsv`.
+| Where to write `\{script}.train.gz`, `.dev.gz`, `.test.gz`, and 
`manifest.tsv`.
 
 | `--total-budget-bytes`
 | `50000000`
@@ -178,7 +178,7 @@ then calibrates z-score statistics from the `.dev.gz` file.
 
 === Bigram table training
 
-[source]
+[source,subs="-attributes"]
 ----
 for each sentence in {script}.train.gz:
     utf8 = sentence.getBytes(UTF-8)
@@ -197,7 +197,7 @@ a small but nonzero probability for novel byte sequences.
 
 === Calibration
 
-For each sentence in `{script}.dev.gz`:
+For each sentence in `\{script}.dev.gz`:
 
 [source]
 ----
diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc 
b/docs/modules/ROOT/pages/advanced/language-detection.adoc
index 9fdbe3c551..b95e06eafb 100644
--- a/docs/modules/ROOT/pages/advanced/language-detection.adoc
+++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc
@@ -204,25 +204,6 @@ CharSoupDetectorConfig cfg = 
CharSoupDetectorConfig.fromMap(Map.of(
 Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring`
 component loading.
 
-== Generative Language Model
-
-In addition to the discriminative models above, Tika ships a
-**generative character n-gram model** 
(`langdetect-generative-v4-20260320.bin`) that
-answers a complementary question: _how language-like is this text?_
-
-The generative model is used for:
-
-* **Charset detection tiebreaking** — when the discriminative model cannot
-  distinguish candidate charsets, the generative model picks the one that
-  produces the most language-like decoded text.
-* **Text quality scoring** — the `tika-eval:languageness` metadata field
-  provides a z-score indicating how normal or garbled the extracted text is.
-* **Training data filtering** — flagging bot-generated or mixed-language
-  sentences in training corpora.
-
-For full details, see
-xref:advanced/generative-language-model.adoc[Generative Language Model].
-
 == Training the Models
 
 Training is fully reproducible from source. For step-by-step instructions,
diff --git a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc 
b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
index 8205f2145f..9ec15dc5a6 100644
--- a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
+++ b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
@@ -18,52 +18,44 @@
 = Configuring Encoding Detectors
 
 Tika uses a chain of _encoding detectors_ to determine the character encoding
-of plain text and HTML content.  `DefaultEncodingDetector` loads detectors
-via the Java service-provider interface (SPI) and runs them in registration
-order.  See xref:../advanced/charset-detection-design.adoc[Charset Detection 
Pipeline]
-for design details.
+of plain text and HTML content.  `DefaultEncodingDetector` loads detectors via
+the Java service-provider interface (SPI) and runs them in registration order;
+the first non-null result wins.
+
+The default chain is `html-encoding-detector`, `universal-encoding-detector`,
+and `icu4j-encoding-detector`.
 
 == Default Detection Chain
 
-With the stock dependencies on the classpath:
+With the stock dependencies on the classpath (the modules
+`tika-encoding-detector-html`, `tika-encoding-detector-universal`, and
+`tika-encoding-detector-icu4j`):
 
-[cols="1,2,2"]
+[cols="1,2,3"]
 |===
 |Step |Detector |Returns non-null when…
 
 |1
-|`bom-detector`
-|A UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE byte-order mark is present.
-Emits DECLARATIVE.
+|`html-encoding-detector`
+|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag is
+found.  Fast lenient regex matcher with a curated subset of WHATWG label
+aliases.
 
 |2
-|`metadata-charset-detector`
-|A `charset=` parameter is present in the `Content-Type` metadata field
-(populated from an HTTP response header or similar).  Emits DECLARATIVE.
+|`universal-encoding-detector`
+|A state-machine structural prober (juniversalchardet fork) recognises the
+byte pattern as a known encoding (UTF-8, GB18030, Big5, EUC-JP, several
+ISO-8859 variants, etc.).
 
 |3
-|`mojibuster-encoding-detector`
-|A structural UTF-32 check, structural UTF-16 specialist, UTF-8 grammar
-gate, and 33-class byte-bigram Naive Bayes classifier.  STRUCTURAL for
-structural hits; STATISTICAL for NB predictions.
-
-|4
-|`html-encoding-detector`
-|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag
-is found (fast lenient regex matcher, curated WHATWG label aliases).
-Emits DECLARATIVE.
-
-|5
-|`junk-filter-encoding-detector`
-|`MetaEncodingDetector` — strips HTML/XML markup, decodes the probe under
-each candidate charset, and picks the cleanest decoding via a script-aware
-text-quality model.  Honours a declaration when its decoding is
-byte-identical to at least one other candidate's.  Always runs last.
+|`icu4j-encoding-detector`
+|ICU4J's `CharsetDetector` returns a match.  Catches additional single-byte
+encodings (Windows code pages, IBM/EBCDIC variants, etc.).
 |===
 
-NOTE: `junk-filter-encoding-detector` is supplied by `tika-ml-junkdetect`
-and SPI-loads when the module jar is on the classpath.  Omit it for plain
-first-match-wins — see <<opting-out-of-arbitration>>.
+The chain is first-match-wins: the first detector to return a non-null result
+decides the encoding.  A declared charset (e.g. from a `<meta charset>` tag)
+therefore takes precedence over the later structural or statistical detectors.
 
 == Available Detectors
 
@@ -74,128 +66,141 @@ referenced by their SPI name in JSON configuration.
 |===
 |Name |Module |Description
 
-|`bom-detector`
-|`tika-core`
-|Reads the first 4 bytes for BOM signatures.  In the default chain.
-
-|`metadata-charset-detector`
-|`tika-core`
-|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from
-the `Metadata` object.  Applies WHATWG label normalization
-(ISO-8859-1 and US-ASCII → windows-1252).  In the default chain.
-
-|`mojibuster-encoding-detector`
-|`tika-encoding-detector-mojibuster`
-|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32
-and UTF-16 and a UTF-8 grammar gate.  33 classes including CJK
-multi-byte, EBCDIC variants, DOS code pages, Cyrillic, Windows single-byte,
-ISO-8859-3/16, Mac, and UTF-8.  In the default chain.  See
-xref:../advanced/charset-detection-design.adoc[the design doc].
-
 |`html-encoding-detector`
 |`tika-encoding-detector-html`
 |Fast lenient regex matcher for `<meta charset>` / `http-equiv` tags, with a
-curated subset of WHATWG label aliases.  In the default chain.
+curated subset of WHATWG label aliases.  Auto-registered (in default chain).
+
+|`universal-encoding-detector`
+|`tika-encoding-detector-universal`
+|State-machine structural prober (juniversalchardet fork).  Auto-registered
+(in default chain).
+
+|`icu4j-encoding-detector`
+|`tika-encoding-detector-icu4j`
+|Wraps ICU4J's `CharsetDetector`.  Auto-registered (in default chain).
 
 |`standard-html-encoding-detector`
 |`tika-encoding-detector-html`
 |Spec-strict WHATWG prescan algorithm.  Not in the default chain — opt in
 explicitly if you need strict WHATWG tokenisation (e.g. ignoring charset
-declarations inside comments or other contexts the lenient regex may match).
+declarations inside HTML comments or other contexts the lenient regex may
+match).
+
+|`mojibuster-encoding-detector`
+|`tika-encoding-detector-mojibuster`
+|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32
+and UTF-16 and a UTF-8 grammar gate.  Not in the default chain — opt in
+explicitly.
 
 |`junk-filter-encoding-detector`
 |`tika-ml-junkdetect`
-|Text-quality arbitrator (`MetaEncodingDetector`).  In the default chain
-when the module jar is on the classpath; always runs last.
+|Text-quality arbitrator (`MetaEncodingDetector`) that picks among other
+detectors' candidates by decode quality.  Not in the default chain — opt in
+explicitly.
 
-|`icu4j-encoding-detector`
-|`tika-encoding-detector-icu4j`
-|Wraps ICU4J `CharsetDetector`.  Legacy — the NB pipeline supersedes it
-for most cases.  Not auto-registered.
+|`bom-detector`
+|`tika-core`
+|Reads the first 4 bytes for BOM signatures.  Helper component, used
+internally by `AutoDetectReader`.  Not normally added to the SPI chain.
 
-|`universal-encoding-detector`
-|`tika-encoding-detector-universal`
-|State-machine structural prober (juniversalchardet fork).  Not
-auto-registered; opt in if you specifically need it.
+|`metadata-charset-detector`
+|`tika-core`
+|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from
+the `Metadata` object.  Helper component, used by parsers that consult
+`Content-Type` directly.  Not normally added to the SPI chain.
 |===
 
 == Configuration Examples
 
 === Exclude a detector from the default chain
 
+Use `default-encoding-detector` with an `exclude` list to drop one or more
+auto-registered detectors:
+
 [source,json]
 ----
 {
   "encoding-detectors": [
     {
       "default-encoding-detector": {
-        "exclude": ["bom-detector"]
+        "exclude": ["icu4j-encoding-detector"]
       }
     }
   ]
 }
 ----
 
-[[opting-out-of-arbitration]]
-=== Strict first-match-wins (no JunkFilter arbitration)
+=== Specify the chain explicitly
 
-Omit JunkFilter to get plain first-match-wins (each base detector's top
-result wins in registration order):
+To replace the SPI-discovered chain with an explicit ordered list:
 
 [source,json]
 ----
 {
   "encoding-detectors": [
-    {"bom-detector": {}},
-    {"metadata-charset-detector": {}},
     {"html-encoding-detector": {}},
-    {"mojibuster-encoding-detector": {}}
+    {"universal-encoding-detector": {}}
   ]
 }
 ----
 
-Trade-off: lying declarations propagate unfiltered, and Mojibuster's
-statistical guess wins ties without any text-quality cross-check.
-See
-xref:../advanced/charset-detection-design.adoc#opting-out-of-arbitration[the 
design doc].
-
 === Configure the HTML detector's read limit
 
-`html-encoding-detector` reads up to 65 536 bytes by default when
-scanning for the `<meta charset>` tag.  Raise it if your documents embed
-large `<script>` blocks before the meta tag (TIKA-2485):
+`html-encoding-detector` reads up to 65 536 bytes by default when scanning
+for the `<meta charset>` tag.  Raise it if your documents embed large
+`<script>` blocks before the meta tag (TIKA-2485):
 
 [source,json]
 ----
 {
   "encoding-detectors": [
-    {"bom-detector": {}},
-    {"metadata-charset-detector": {}},
     {
       "html-encoding-detector": {
         "markLimit": 131072
       }
     },
-    {"mojibuster-encoding-detector": {}},
-    {"junk-filter-encoding-detector": {}}
+    {"universal-encoding-detector": {}},
+    {"icu4j-encoding-detector": {}}
   ]
 }
 ----
 
-=== Legacy chain (ICU4J + juniversalchardet)
+=== Use the spec-strict WHATWG HTML detector
 
-Not recommended — the NB pipeline is strictly better on accuracy and
-latency — but available for regression testing or comparison:
+If your input HTML has charset declarations inside comments (or other
+contexts where the lenient regex would false-match), opt in to the
+spec-strict prescan:
+
+[source,json]
+----
+{
+  "encoding-detectors": [
+    {"standard-html-encoding-detector": {}},
+    {"universal-encoding-detector": {}},
+    {"icu4j-encoding-detector": {}}
+  ]
+}
+----
+
+=== Add the Mojibuster + JunkFilter chain (opt-in)
+
+The byte-bigram NB classifier (`mojibuster-encoding-detector`) and the
+text-quality arbitrator (`junk-filter-encoding-detector`) are available as
+opt-in components.  They require the `tika-encoding-detector-mojibuster`
+and `tika-ml-junkdetect` modules on the classpath:
 
 [source,json]
 ----
 {
   "encoding-detectors": [
-    {"bom-detector": {}},
-    {"metadata-charset-detector": {}},
     {"html-encoding-detector": {}},
-    {"icu4j-encoding-detector": {}},
-    {"universal-encoding-detector": {}}
+    {"mojibuster-encoding-detector": {}},
+    {"junk-filter-encoding-detector": {}}
   ]
 }
 ----
+
+`junk-filter-encoding-detector` is a `MetaEncodingDetector` — it collects
+candidates from the other detectors and picks the cleanest decoding via a
+script-aware text-quality model.  It must run last.
diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc 
b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
index 281bd6d12d..1bda565f56 100644
--- a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
+++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
@@ -37,7 +37,7 @@ independent handler (any Tika parser):
 
 * **`stdoutHandler`** -- processes stdout
 * **`stderrHandler`** -- processes stderr
-* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}` 
is used)
+* **`outputFileHandler`** -- processes the output file (when `$\{OUTPUT_FILE}` 
is used)
 
 Handlers extract metadata, content, or both. `regex-capture-parser` is the
 most common choice for extracting metadata via regex patterns.
@@ -46,8 +46,8 @@ most common choice for extracting metadata via regex patterns.
 
 The `contentSource` field controls which stream provides the XHTML text 
content:
 
-* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command
-* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command
+* `"stdout"` -- default when no `$\{OUTPUT_FILE}` in the command
+* `"outputFile"` -- default when `$\{OUTPUT_FILE}` is in the command
 * `"stderr"` -- use stderr as the content source
 * `"none"` -- metadata-only mode, no text content extracted
 
@@ -63,7 +63,7 @@ configured, the raw bytes are written as text.
 
 |`commandLine`
 |`List<String>`
-|The command and arguments to run. Use `${INPUT_FILE}` and `${OUTPUT_FILE}` 
tokens for file paths.
+|The command and arguments to run. Use `$\{INPUT_FILE}` and `$\{OUTPUT_FILE}` 
tokens for file paths.
 
 |`checkCommandLine`
 |`List<String>`
diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc 
b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
index a967c80421..7d65fe8637 100644
--- a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
+++ b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
@@ -27,6 +27,8 @@ Before starting the release process, ensure you have:
 * A valid GPG key published to a public keyserver
 * Maven credentials configured in `~/.m2/settings.xml`
 * Access to Apache's Nexus repository manager
+* SVN client (`svn`) — release candidates are uploaded to `dist.apache.org` via 
SVN, not scp
+* Internet access on first build — the Antora docs build downloads Node.js 
into `~/.cache/tika-antora/` on first run (~100 MB, one-time per machine; 
reused across clean builds)
 
 == Pre-Release Checks
 
@@ -125,9 +127,16 @@ Execute the Maven release perform goal:
 
 [source,bash]
 ----
-mvn release:perform
+mvn release:perform -Darguments="-DskipITs"
 ----
 
+`-DskipITs` skips integration tests during the inner build.  Tests already
+ran in `release:prepare`'s `verify` phase; re-running them during perform is
+belt-and-suspenders, and some pipes/elasticsearch chaos-monkey tests are
+timing-sensitive enough to flake on a tagged build.
+
+If `release:perform` fails partway through, see <<troubleshooting>>.
+
 Ensure you have valid Maven credentials in `~/.m2/settings.xml`:
 
 [source,xml]
@@ -152,24 +161,40 @@ Ensure you have valid Maven credentials in 
`~/.m2/settings.xml`:
 
 === Step 10: Upload Distribution Artifacts
 
-Upload artifacts to `dist.apache.org`:
+The release-plugin's antrun task assembles a dist directory at
+`target/checkout/target/X.Y.Z/` containing the source zip, app jar, server
+tarballs, and parser-package jars (each with `.asc` and `.sha512`).
+
+WARNING: At the end of `release:perform` you will see an echo telling you
+to `scp -r ... people.apache.org:public_html/tika/`.  *Ignore that.*  It
+is stale — the current ASF release distribution channel is the SVN repo
+under `dist.apache.org`, not `people.apache.org`.
+
+Check out the dist dev SVN repo and copy the prepared dist directory in:
 
 [source,bash]
 ----
 svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev
+cp -r target/checkout/target/X.Y.Z tika-dist-dev/
 cd tika-dist-dev
+svn add X.Y.Z
+svn commit -m "Stage Apache Tika X.Y.Z RC<n>"
 ----
 
-Upload the following files with their signatures (.asc) and checksums 
(.sha512):
+Verify the directory contains all expected artifacts (each with `.asc` and
+`.sha512`):
 
 * `tika-X.Y.Z-src.zip`
 * `tika-app-X.Y.Z.jar`
-* `tika-server-standard-X.Y.Z.jar`
+* `tika-server-standard-X.Y.Z.jar` (and `-bin.tgz`, `-bin.zip`)
+* `tika-parser-scientific-package-X.Y.Z.jar`
+* `tika-parser-sqlite3-package-X.Y.Z.jar`
+* `tika-parser-nlp-package-X.Y.Z.jar`
 
 Also:
 
-* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt`
-* Ensure the `KEYS` file contains all contributor signatures
+* `CHANGES.txt` (already in the dist directory; rename it to `CHANGES-X.Y.Z.txt` 
if it has not already been renamed)
+* Ensure the `KEYS` file at the parent directory contains your GPG key
 
 === Step 11: Call the Vote
 
@@ -214,6 +239,92 @@ svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \
        -m "Release Apache Tika X.Y.Z"
 ----
 
+[[troubleshooting]]
+== Troubleshooting `release:perform`
+
+The `release:perform` build can fail mid-way for reasons unrelated to the
+release itself.  This section captures the recoveries learned during recent
+releases.  Once these get fixed in the build (tracked in the
+to-fix-before-beta punch list), this section can be slimmed down.
+
+=== `tika-docs` assembly fails: "archive cannot be empty"
+
+[source]
+----
+[ERROR] Failed to create assembly: Error creating assembly archive docs:
+        archive cannot be empty
+----
+
+Cause: the Antora plugin is not auto-bound to the `package` phase, so
+`target/site/` is empty when `maven-assembly-plugin` runs.
+
+Recovery (resume from `tika-docs`):
+
+[source,bash]
+----
+cd target/checkout
+mvn deploy -Papache-release -rf :tika-docs -DskipITs
+----
+
+If the Antora binding (the recommended fix in the to-fix-before-beta punch
+list) hasn't yet landed, you may need to manually build the site first:
+
+[source,bash]
+----
+cd target/checkout/docs
+mvn antora:antora
+cd ..
+mvn deploy -Papache-release -rf :tika-docs -DskipITs
+----
+
+=== Antrun error from a child module: "Could not find file ... -src.zip"
+
+[source]
+----
+Could not find file .../docs/target/X.Y.Z/tika-X.Y.Z-src.zip
+to generate checksum for.
+----
+
+Cause: the root-pom antrun execution lacks `<inherited>false</inherited>`,
+so it fires from each child module on a resumed deploy with `$\{basedir}`
+pointing at the wrong directory.
+
+Recovery (run the antrun once at the root):
+
+[source,bash]
+----
+cd target/checkout
+mvn deploy --non-recursive -Papache-release -Dmaven.deploy.skip=true
+----
+
+`--non-recursive` runs only the root pom; `-Dmaven.deploy.skip=true`
+prevents re-uploading the root pom artifact (already uploaded earlier).
+The antrun fires in the correct basedir and `target/X.Y.Z/` gets
+populated.
+
+=== Nexus staging repository: only one repo when I expected two
+
+If `release:perform` fails partway and you re-run it, you may see only one
+open staging repository on `repository.apache.org` even though both
+invocations uploaded artifacts.  This is normal: while the staging repo is
+*open*, redeploys overwrite earlier artifacts.  Confirm by checking the
+`Last Modified` timestamp on a representative artifact (e.g.
+`tika-core-X.Y.Z.jar`) — it should match the most recent run.
+
+When in doubt, drop the staging repo and run `release:perform` cleanly
+from scratch.  It costs ~1 hour but yields a guaranteed single-build set
+of artifacts.
+
+=== gRPC distribution zip is huge (~600+ MB)
+
+The `tika-grpc-X.Y.Z.zip` artifact bundles every pipes plugin with its full
+transitive closure (microsoft-graph, gcs, az-blob, s3, kafka, etc.) plus
+multi-platform native libs (rocksdbjni, netty natives).  Several hundred MB
+of that is duplication of dependencies already in the root `lib/`
+directory.  This is a known issue tracked for cleanup before beta — see
+the to-fix-before-4.0.0-beta punch list.  The release can ship as-is; the
+zip is correct, just bloated.
+
 == Post-Release
 
 === Update Unreleased Modules
diff --git a/docs/modules/ROOT/pages/maintainers/site.adoc 
b/docs/modules/ROOT/pages/maintainers/site.adoc
index 2a86d9231d..ce751cf4a1 100644
--- a/docs/modules/ROOT/pages/maintainers/site.adoc
+++ b/docs/modules/ROOT/pages/maintainers/site.adoc
@@ -28,31 +28,27 @@ The site supports multiple versions through Git branches 
and includes client-sid
 
 * Maven 3.9+
 * Git
+* Internet access on first build — the Antora plugin downloads Node.js into 
`~/.cache/tika-antora/` (~100 MB, one-time per machine; reused across clean 
builds and across worktrees)
 
 == Building the Site Locally
 
-To build the documentation locally:
+The `docs` module is only included in the reactor under the `apache-release` 
profile. Build the site from the repo root:
 
 [source,bash]
 ----
-cd docs
-mvn antora:antora
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
 ----
 
-The generated site will be at `docs/target/site/`.
+The generated site will be at `docs/target/site/`. The current git commit and 
date are stamped automatically onto the home page (a generated copy of the 
playbook lives at `docs/antora-playbook-stamped.yml` — gitignored).
 
-To stamp the build with the current commit hash (shown on the home page),
-add `git-commit` to the attributes in `antora-playbook.yml`:
+To skip the stamping or override the playbook:
 
-[source,yaml]
+[source,bash]
 ----
-asciidoc:
-  attributes:
-    git-commit: 'abc1234'
+# build directly with the unstamped playbook
+cd docs && mvn antora:antora -Dplaybook=antora-playbook.yml
 ----
 
-Or pass it on the command line when you have a playbook that supports CLI 
attributes.
-
 === Previewing the Site
 
 **Option 1: Python HTTP server (recommended)**
@@ -113,12 +109,13 @@ The playbook (`antora-playbook.yml`) is configured to 
build all `docs/*` branche
 
 === Publishing to the Site
 
-Use `build-docs.sh` with the `--publish` flag to build and copy to the site 
SVN checkout:
+Build the docs with Maven, then run `publish-docs.sh` to copy the output to a 
tika-site SVN checkout (with URL flattening so `/docs/tika/X.Y.Z/...` becomes 
`/docs/X.Y.Z/...`):
 
 [source,bash]
 ----
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
 cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
 
 # Then in the SVN checkout:
 cd /path/to/tika-site
@@ -126,8 +123,9 @@ svn add publish/docs publish/_ --force
 svn commit -m "Publish 4.0.0-SNAPSHOT docs"
 ----
 
-This builds the Antora site, stamps the git commit on the home page, and copies
-the output to the site with the correct directory layout:
+The Maven `package` step builds the Antora site (stamping the current git
+commit and date on the home page); `publish-docs.sh` copies the output to
+the site checkout with the correct directory layout:
 
 * `publish/docs/4.0.0-SNAPSHOT/` -- the documentation pages
 * `publish/_/` -- CSS, JS, fonts (shared across versions)
@@ -151,8 +149,9 @@ git commit -am "Set docs version to 4.0.0"
 git push origin docs/4.0.0
 
 # 4. Build and publish
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
 cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
 
 # 5. Commit to SVN
 cd /path/to/tika-site
@@ -177,8 +176,9 @@ git commit -am "Fix PDF parser example"
 git push origin docs/4.0.0
 
 # 4. Rebuild and republish
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
 cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
 cd /path/to/tika-site
 svn commit -m "Update 4.0.0 docs"
 ----
diff --git a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc 
b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
index d7ea0fb5fa..d8e027d496 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
@@ -121,7 +121,7 @@ mvn clean spotless:apply install
 
 * Implement flexible component loading without `@TikaComponent` requirements
 * Enable friendly name usage throughout the codebase
-* Resolve gRPC issues
 * Fix external renderer byte-passing in open containers
 * Simplify and strengthen serialization code
 * Consider relocating `TikaConfig` and `ForkParser` to legacy module
+* Complete CLI integration for `tika-app` config dump 
(`--dump-minimal-config`, `--dump-current-config`, `--dump-static-config`, 
`--dump-static-full-config` are stubbed but not yet wired)
diff --git a/docs/modules/ROOT/pages/migration-to-4x/index.adoc 
b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
index eebf29f3db..39675318c5 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/index.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
@@ -31,6 +31,8 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and 
support schedules.
 
 * xref:migration-to-4x/design-notes-4x.adoc[Design Notes] - Architectural 
decisions and design rationale
 * xref:migration-to-4x/serialization-4x.adoc[Serialization] - JSON 
serialization design and implementation details
+* xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies] - Chunk 
emission to Elasticsearch/OpenSearch from the inference pipeline
+* xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler 
Requirements] - Why inference filters require MARKDOWN content handler
 
 == TODOs / Missing Features in 4.x
 
diff --git a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc 
b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
index 2f1eb24adb..7782e1cd63 100644
--- a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
@@ -24,6 +24,28 @@ This section covers using Apache Tika via gRPC.
 Tika gRPC provides a high-performance gRPC interface for parsing documents.
 This is useful for microservices architectures and polyglot environments.
 
+The service definition lives in `tika-grpc/src/main/proto/tika.proto`.  Clients
+register a fetcher (`SaveFetcher`) and then submit `FetchAndParseRequest`
+messages, each of which returns a `FetchAndParseReply` with extracted
+metadata and content.
+
+== Per-Request `ParseContext`
+
+`FetchAndParseRequest.parse_context_json` lets the caller override the
+server's default `ParseContext` on a per-request basis.  Keys are
+parse-context component names; values are their JSON configs.
+
+[source,json]
+----
+{
+  "basic-content-handler-factory": {"type": "HTML"},
+  "timeout-limits": {"progressTimeoutMillis": 30000}
+}
+----
+
+See `META-INF/tika/parse-context.idx` (generated at build time from
+`@TikaComponent` annotations) for the available component names.
+
 == Topics
 
 // Add links to specific topics as they are created
diff --git a/docs/pom.xml b/docs/pom.xml
index 3759e2b7c4..84d570e554 100644
--- a/docs/pom.xml
+++ b/docs/pom.xml
@@ -64,34 +64,64 @@ under the License.
                     </filesets>
                 </configuration>
             </plugin>
-            <!-- Maven Assembly plugin to create tar.gz -->
+            <!-- Stamp the current git commit and build date onto a
+                 sibling copy of antora-playbook.yml so the docs home
+                 page shows which commit produced the site. The copy
+                 lives next to the original so Antora's relative-path
+                 resolution (e.g. ./supplemental-ui) still works. The
+                 generated file is gitignored. -->
             <plugin>
-                <artifactId>maven-assembly-plugin</artifactId>
+                <artifactId>maven-antrun-plugin</artifactId>
                 <executions>
                     <execution>
-                        <id>make-docs-archive</id>
-                        <phase>package</phase>
+                        <id>stamp-antora-playbook</id>
+                        <phase>generate-resources</phase>
                         <goals>
-                            <goal>single</goal>
+                            <goal>run</goal>
                         </goals>
                         <configuration>
-                            <descriptors>
-                                <descriptor>src/assembly/docs.xml</descriptor>
-                            </descriptors>
-                            
<finalName>${project.artifactId}-${project.version}</finalName>
+                            <target>
+                                <exec executable="git" 
outputproperty="git.commit.short" failonerror="true">
+                                    <arg value="rev-parse"/>
+                                    <arg value="--short"/>
+                                    <arg value="HEAD"/>
+                                </exec>
+                                <tstamp>
+                                    <format property="build.date" 
pattern="yyyy-MM-dd"/>
+                                </tstamp>
+                                <copy 
file="${project.basedir}/antora-playbook.yml"
+                                      
tofile="${project.basedir}/antora-playbook-stamped.yml"
+                                      overwrite="true"/>
+                                <replaceregexp 
file="${project.basedir}/antora-playbook-stamped.yml"
+                                               match="(tika-stable-version:.*)"
+                                               replace="\1${line.separator}    
git-commit: '${git.commit.short} (${build.date})'"
+                                               byline="true"/>
+                            </target>
                         </configuration>
                     </execution>
                 </executions>
             </plugin>
 
-            <!-- Antora plugin for building the documentation site -->
+            <!-- Antora plugin for building the documentation site.
+                 Bound to prepare-package so target/site/ exists before
+                 the assembly runs.
+
+                 nodeInstallDirectory is moved out of target/ so 'mvn
+                 clean' does not nuke the downloaded Node.js. First
+                 build per machine downloads ~100MB once; subsequent
+                 builds reuse it. The plugin's nodeExecutable param
+                 cannot reuse a system Node on distros where npm and
+                 node packages aren't co-located (Debian/Ubuntu), so
+                 caching the plugin-managed install is the practical
+                 alternative. -->
             <plugin>
                 <groupId>org.antora</groupId>
                 <artifactId>antora-maven-plugin</artifactId>
                 <version>${antora.version}</version>
                 <extensions>true</extensions>
                 <configuration>
-                    <playbook>antora-playbook.yml</playbook>
+                    
<nodeInstallDirectory>${user.home}/.cache/tika-antora</nodeInstallDirectory>
+                    <playbook>antora-playbook-stamped.yml</playbook>
                     <packages>
                         <package>@antora/[email protected]</package>
                     </packages>
@@ -100,6 +130,36 @@ under the License.
                         <option>--log-failure-level=fatal</option>
                     </options>
                 </configuration>
+                <executions>
+                    <execution>
+                        <id>build-antora-site</id>
+                        <phase>prepare-package</phase>
+                        <goals>
+                            <goal>antora</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+
+            <!-- Maven Assembly plugin to package the built site into a
+                 tar.gz for release distribution. Runs after Antora. -->
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>make-docs-archive</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                        <configuration>
+                            <descriptors>
+                                <descriptor>src/assembly/docs.xml</descriptor>
+                            </descriptors>
+                            
<finalName>${project.artifactId}-${project.version}</finalName>
+                        </configuration>
+                    </execution>
+                </executions>
             </plugin>
         </plugins>
     </build>
diff --git a/docs/publish-docs.sh b/docs/publish-docs.sh
new file mode 100755
index 0000000000..fcbed4e995
--- /dev/null
+++ b/docs/publish-docs.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copies the already-built Antora site into a tika-site SVN checkout,
+# flattening URLs from /docs/tika/X.Y.Z/... to /docs/X.Y.Z/... so they
+# match the website layout.
+#
+# Usage:
+#   ./publish-docs.sh /path/to/tika-site/publish
+#
+# Prerequisite: build with the apache-release profile to populate target/site/.
+
+set -euo pipefail
+cd "$(dirname "$0")"
+
+PUBLISH_DIR="${1:?usage: publish-docs.sh <tika-site-publish-dir>}"
+DOCS_DIR="${PUBLISH_DIR}/docs"
+
+if [[ ! -d target/site ]]; then
+    echo "target/site/ not found." >&2
+    echo "Build the docs first: cd .. && ./mvnw package -Papache-release -pl :tika-docs -DskipTests" >&2
+    exit 1
+fi
+
+mkdir -p "${DOCS_DIR}" "${PUBLISH_DIR}/_"
+
+# Strip the 'tika/' component dir prefix so URLs are /docs/X.Y.Z/...
+cp -r target/site/tika/* "${DOCS_DIR}/"
+# UI assets one level above docs/, since HTML uses ../../_/ relative paths.
+# Copy the contents ('/.') so republishing into an existing _/ does not nest _/_/.
+cp -r target/site/_/. "${PUBLISH_DIR}/_/"
+# Fix the root redirect and sitemap to match the flattened layout
+sed 's|tika/||g' target/site/index.html > "${DOCS_DIR}/index.html"
+sed 's|/docs/tika/|/docs/|g' target/site/sitemap.xml > "${DOCS_DIR}/sitemap.xml"
+cp target/site/404.html target/site/search-index.js "${DOCS_DIR}/"
+
+echo "Published to: ${DOCS_DIR}/"

(tika) branch main updated: update-4x-docs (#2802)

Reply via email to