This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9ac71296c0 update-4x-docs (#2802)
9ac71296c0 is described below
commit 9ac71296c0db1ddda986b883a5469c3accc80f67
Author: Tim Allison <[email protected]>
AuthorDate: Tue May 5 09:10:01 2026 -0400
update-4x-docs (#2802)
---
docs/.gitignore | 3 +
docs/build-docs.sh | 53 ------
.../ROOT/examples/migration-full-example.json | 2 +-
docs/modules/ROOT/examples/pdf-parser-basic.json | 2 +-
docs/modules/ROOT/examples/pdf-parser-full.json | 2 +-
docs/modules/ROOT/examples/tesseract-basic.json | 2 +-
docs/modules/ROOT/examples/tesseract-full.json | 2 +-
docs/modules/ROOT/nav.adoc | 1 -
.../pages/advanced/charset-detection-design.adoc | 2 +-
.../ROOT/pages/advanced/junk-detection-build.adoc | 16 +-
.../ROOT/pages/advanced/language-detection.adoc | 19 ---
.../pages/configuration/encoding-detectors.adoc | 183 +++++++++++----------
.../configuration/parsers/external-parser.adoc | 8 +-
.../pages/maintainers/release-guides/tika.adoc | 123 +++++++++++++-
docs/modules/ROOT/pages/maintainers/site.adoc | 36 ++--
.../pages/migration-to-4x/design-notes-4x.adoc | 2 +-
docs/modules/ROOT/pages/migration-to-4x/index.adoc | 2 +
docs/modules/ROOT/pages/using-tika/grpc/index.adoc | 22 +++
docs/pom.xml | 82 +++++++--
docs/publish-docs.sh | 51 ++++++
20 files changed, 397 insertions(+), 216 deletions(-)
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000000..120d89fc35
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,3 @@
+# Generated by maven-antrun-plugin from antora-playbook.yml at build time.
+# Contains the current git-commit stamp injected for the docs home page.
+antora-playbook-stamped.yml
diff --git a/docs/build-docs.sh b/docs/build-docs.sh
deleted file mode 100755
index 030ca1199d..0000000000
--- a/docs/build-docs.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Builds the Antora docs site with the current git commit stamped on the home page.
-# Usage: ./build-docs.sh
-# Output: target/site/
-#
-# To publish to the tika-site SVN repo:
-# ./build-docs.sh --publish /path/to/tika-site/publish
-
-set -euo pipefail
-cd "$(dirname "$0")"
-
-COMMIT=$(git rev-parse --short HEAD)
-DATE=$(date -u +%Y-%m-%d)
-
-# Inject commit into playbook, build, restore
-sed -i "/tika-stable-version/a\\ git-commit: '${COMMIT} (${DATE})'" antora-playbook.yml
-trap 'git checkout antora-playbook.yml' EXIT
-
-# Pass remaining args to Maven (filter out our --publish flag)
-PUBLISH_DIR=""
-MVN_ARGS=()
-while [[ $# -gt 0 ]]; do
- case $1 in
- --publish)
- PUBLISH_DIR="$2"
- shift 2
- ;;
- *)
- MVN_ARGS+=("$1")
- shift
- ;;
- esac
-done
-
-../mvnw antora:antora "${MVN_ARGS[@]}"
-
-echo "Site built at: target/site/"
-echo "Commit: ${COMMIT} (${DATE})"
-
-if [[ -n "${PUBLISH_DIR}" ]]; then
- # Flatten: skip the 'tika/' component directory so URLs are /docs/4.0.0-SNAPSHOT/
- # Copy UI assets one level above docs/ since HTML uses ../../_/ relative paths
- DOCS_DIR="${PUBLISH_DIR}/docs"
- mkdir -p "${DOCS_DIR}"
- cp -r target/site/tika/* "${DOCS_DIR}/"
- cp -r target/site/_/ "${PUBLISH_DIR}/_/"
- # Fix the root redirect to match flattened layout
- sed 's|tika/||g' target/site/index.html > "${DOCS_DIR}/index.html"
- sed 's|/docs/tika/|/docs/|g' target/site/sitemap.xml > "${DOCS_DIR}/sitemap.xml"
- cp target/site/404.html "${DOCS_DIR}/"
- cp target/site/search-index.js "${DOCS_DIR}/"
- echo "Published to: ${DOCS_DIR}/"
-fi
diff --git a/docs/modules/ROOT/examples/migration-full-example.json
b/docs/modules/ROOT/examples/migration-full-example.json
index 05f93d7f23..7ce787b42d 120000
--- a/docs/modules/ROOT/examples/migration-full-example.json
+++ b/docs/modules/ROOT/examples/migration-full-example.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/migration-full-example.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pdf-parser-basic.json
b/docs/modules/ROOT/examples/pdf-parser-basic.json
index b1a2ae805f..a0fa8b34ca 120000
--- a/docs/modules/ROOT/examples/pdf-parser-basic.json
+++ b/docs/modules/ROOT/examples/pdf-parser-basic.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pdf-parser-full.json
b/docs/modules/ROOT/examples/pdf-parser-full.json
index 922388d57b..eeaa9e50c2 120000
--- a/docs/modules/ROOT/examples/pdf-parser-full.json
+++ b/docs/modules/ROOT/examples/pdf-parser-full.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tesseract-basic.json
b/docs/modules/ROOT/examples/tesseract-basic.json
index 1a508253c6..2844915265 120000
--- a/docs/modules/ROOT/examples/tesseract-basic.json
+++ b/docs/modules/ROOT/examples/tesseract-basic.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-basic.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/tesseract-full.json
b/docs/modules/ROOT/examples/tesseract-full.json
index c5d2e0c823..f1e5bc8bde 120000
--- a/docs/modules/ROOT/examples/tesseract-full.json
+++ b/docs/modules/ROOT/examples/tesseract-full.json
@@ -1 +1 @@
-../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json
\ No newline at end of file
+../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 16429e45bf..979555022a 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -46,7 +46,6 @@
** xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies]
** xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler
Requirements]
* xref:advanced/index.adoc[Advanced]
-** xref:advanced/charset-detection-design.adoc[Charset Detection Pipeline]
** xref:advanced/language-detection.adoc[Language Detection]
** xref:advanced/language-detection-build.adoc[Building the Language Detector]
** xref:advanced/junk-detection.adoc[Text Quality Scoring (Junk Detection)]
diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
index 7c45d7ea23..fc870e5d51 100644
--- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
+++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
@@ -510,4 +510,4 @@ with feature hashing. The move to NB was driven by:
== See also
* Configuration: xref:../configuration/encoding-detectors.adoc[Configuring
Encoding Detectors]
-* Language detection: xref:language-detection.adoc[Language Detection]
+* Language detection: xref:advanced/language-detection.adoc[Language Detection]
diff --git a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
index 046099899f..c5397a7464 100644
--- a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
+++ b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc
@@ -58,11 +58,11 @@ Source data lives in one directory per language (ISO 639
code), each containing
up to two files:
`sentences_wikipedia.txt`::
- Line-numbered Wikipedia sentences: `{lineNum}{TAB}{text}`.
+ Line-numbered Wikipedia sentences: `\{lineNum}\{TAB}\{text}`.
One sentence per line.
`sentences_madlad.txt`::
- Line-numbered MADLAD-400 documents: `{lineNum}{TAB}{text}`.
+ Line-numbered MADLAD-400 documents: `\{lineNum}\{TAB}\{text}`.
Documents contain literal two-character `\n` escape sequences as
sub-sentence separators. The tool splits on these before processing.
@@ -108,16 +108,16 @@ per script:
|===
| File | Split | Purpose
-| `{script}.train.gz`
+| `\{script}.train.gz`
| 80%
| Bigram count accumulation in `TrainJunkModel`.
-| `{script}.dev.gz`
+| `\{script}.dev.gz`
| 10%
| Calibration (mu/sigma estimation) in `TrainJunkModel`.
Also used for iterative evaluation during development.
-| `{script}.test.gz`
+| `\{script}.test.gz`
| 10%
| **Held out completely.** Use only for final reported evaluation numbers.
Never use to make model or threshold decisions.
@@ -146,7 +146,7 @@ Key options:
| `--output-dir`
| `~/datasets/madlad/junkdetect`
-| Where to write `{script}.train.gz`, `.dev.gz`, `.test.gz`, and
`manifest.tsv`.
+| Where to write `\{script}.train.gz`, `.dev.gz`, `.test.gz`, and
`manifest.tsv`.
| `--total-budget-bytes`
| `50000000`
@@ -178,7 +178,7 @@ then calibrates z-score statistics from the `.dev.gz` file.
=== Bigram table training
-[source]
+[source,subs="-attributes"]
----
for each sentence in {script}.train.gz:
utf8 = sentence.getBytes(UTF-8)
@@ -197,7 +197,7 @@ a small but nonzero probability for novel byte sequences.
=== Calibration
-For each sentence in `{script}.dev.gz`:
+For each sentence in `\{script}.dev.gz`:
[source]
----
diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc
b/docs/modules/ROOT/pages/advanced/language-detection.adoc
index 9fdbe3c551..b95e06eafb 100644
--- a/docs/modules/ROOT/pages/advanced/language-detection.adoc
+++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc
@@ -204,25 +204,6 @@ CharSoupDetectorConfig cfg =
CharSoupDetectorConfig.fromMap(Map.of(
Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring`
component loading.
-== Generative Language Model
-
-In addition to the discriminative models above, Tika ships a
-**generative character n-gram model**
(`langdetect-generative-v4-20260320.bin`) that
-answers a complementary question: _how language-like is this text?_
-
-The generative model is used for:
-
-* **Charset detection tiebreaking** — when the discriminative model cannot
- distinguish candidate charsets, the generative model picks the one that
- produces the most language-like decoded text.
-* **Text quality scoring** — the `tika-eval:languageness` metadata field
- provides a z-score indicating how normal or garbled the extracted text is.
-* **Training data filtering** — flagging bot-generated or mixed-language
- sentences in training corpora.
-
-For full details, see
-xref:advanced/generative-language-model.adoc[Generative Language Model].
-
== Training the Models
Training is fully reproducible from source. For step-by-step instructions,
diff --git a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
index 8205f2145f..9ec15dc5a6 100644
--- a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
+++ b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
@@ -18,52 +18,44 @@
= Configuring Encoding Detectors
Tika uses a chain of _encoding detectors_ to determine the character encoding
-of plain text and HTML content. `DefaultEncodingDetector` loads detectors
-via the Java service-provider interface (SPI) and runs them in registration
-order. See xref:../advanced/charset-detection-design.adoc[Charset Detection
Pipeline]
-for design details.
+of plain text and HTML content. `DefaultEncodingDetector` loads detectors via
+the Java service-provider interface (SPI) and runs them in registration order;
+the first non-null result wins.
+
+The default chain is `html-encoding-detector`, `universal-encoding-detector`,
+and `icu4j-encoding-detector`.
== Default Detection Chain
-With the stock dependencies on the classpath:
+With the stock dependencies on the classpath (the modules
+`tika-encoding-detector-html`, `tika-encoding-detector-universal`, and
+`tika-encoding-detector-icu4j`):
-[cols="1,2,2"]
+[cols="1,2,3"]
|===
|Step |Detector |Returns non-null when…
|1
-|`bom-detector`
-|A UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE byte-order mark is present.
-Emits DECLARATIVE.
+|`html-encoding-detector`
+|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag is
+found. Fast lenient regex matcher with a curated subset of WHATWG label
+aliases.
|2
-|`metadata-charset-detector`
-|A `charset=` parameter is present in the `Content-Type` metadata field
-(populated from an HTTP response header or similar). Emits DECLARATIVE.
+|`universal-encoding-detector`
+|A state-machine structural prober (juniversalchardet fork) recognises the
+byte pattern as a known encoding (UTF-8, GB18030, Big5, EUC-JP, several
+ISO-8859 variants, etc.).
|3
-|`mojibuster-encoding-detector`
-|A structural UTF-32 check, structural UTF-16 specialist, UTF-8 grammar
-gate, and 33-class byte-bigram Naive Bayes classifier. STRUCTURAL for
-structural hits; STATISTICAL for NB predictions.
-
-|4
-|`html-encoding-detector`
-|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag
-is found (fast lenient regex matcher, curated WHATWG label aliases).
-Emits DECLARATIVE.
-
-|5
-|`junk-filter-encoding-detector`
-|`MetaEncodingDetector` — strips HTML/XML markup, decodes the probe under
-each candidate charset, and picks the cleanest decoding via a script-aware
-text-quality model. Honours a declaration when its decoding is
-byte-identical to at least one other candidate's. Always runs last.
+|`icu4j-encoding-detector`
+|ICU4J's `CharsetDetector` returns a match. Catches additional single-byte
+encodings (Windows code pages, IBM/EBCDIC variants, etc.).
|===
-NOTE: `junk-filter-encoding-detector` is supplied by `tika-ml-junkdetect`
-and SPI-loads when the module jar is on the classpath. Omit it for plain
-first-match-wins — see <<opting-out-of-arbitration>>.
+The chain is permissive — first-match-wins. A declared charset
+(e.g. from a `<meta charset>` tag) wins over later structural or statistical
+detectors.
== Available Detectors
@@ -74,128 +66,141 @@ referenced by their SPI name in JSON configuration.
|===
|Name |Module |Description
-|`bom-detector`
-|`tika-core`
-|Reads the first 4 bytes for BOM signatures. In the default chain.
-
-|`metadata-charset-detector`
-|`tika-core`
-|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from
-the `Metadata` object. Applies WHATWG label normalization
-(ISO-8859-1 and US-ASCII → windows-1252). In the default chain.
-
-|`mojibuster-encoding-detector`
-|`tika-encoding-detector-mojibuster`
-|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32
-and UTF-16 and a UTF-8 grammar gate. 33 classes including CJK
-multi-byte, EBCDIC variants, DOS code pages, Cyrillic, Windows single-byte,
-ISO-8859-3/16, Mac, and UTF-8. In the default chain. See
-xref:../advanced/charset-detection-design.adoc[the design doc].
-
|`html-encoding-detector`
|`tika-encoding-detector-html`
|Fast lenient regex matcher for `<meta charset>` / `http-equiv` tags, with a
-curated subset of WHATWG label aliases. In the default chain.
+curated subset of WHATWG label aliases. Auto-registered (in default chain).
+
+|`universal-encoding-detector`
+|`tika-encoding-detector-universal`
+|State-machine structural prober (juniversalchardet fork). Auto-registered
+(in default chain).
+
+|`icu4j-encoding-detector`
+|`tika-encoding-detector-icu4j`
+|Wraps ICU4J's `CharsetDetector`. Auto-registered (in default chain).
|`standard-html-encoding-detector`
|`tika-encoding-detector-html`
|Spec-strict WHATWG prescan algorithm. Not in the default chain — opt in
explicitly if you need strict WHATWG tokenisation (e.g. ignoring charset
-declarations inside comments or other contexts the lenient regex may match).
+declarations inside HTML comments or other contexts the lenient regex may
+match).
+
+|`mojibuster-encoding-detector`
+|`tika-encoding-detector-mojibuster`
+|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32
+and UTF-16 and a UTF-8 grammar gate. Not in the default chain — opt in
+explicitly.
|`junk-filter-encoding-detector`
|`tika-ml-junkdetect`
-|Text-quality arbitrator (`MetaEncodingDetector`). In the default chain
-when the module jar is on the classpath; always runs last.
+|Text-quality arbitrator (`MetaEncodingDetector`) that picks among other
+detectors' candidates by decode quality. Not in the default chain — opt in
+explicitly.
-|`icu4j-encoding-detector`
-|`tika-encoding-detector-icu4j`
-|Wraps ICU4J `CharsetDetector`. Legacy — the NB pipeline supersedes it
-for most cases. Not auto-registered.
+|`bom-detector`
+|`tika-core`
+|Reads the first 4 bytes for BOM signatures. Helper component, used
+internally by `AutoDetectReader`. Not normally added to the SPI chain.
-|`universal-encoding-detector`
-|`tika-encoding-detector-universal`
-|State-machine structural prober (juniversalchardet fork). Not
-auto-registered; opt in if you specifically need it.
+|`metadata-charset-detector`
+|`tika-core`
+|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from
+the `Metadata` object. Helper component, used by parsers that consult
+`Content-Type` directly. Not normally added to the SPI chain.
|===
== Configuration Examples
=== Exclude a detector from the default chain
+Use `default-encoding-detector` with an `exclude` list to drop one or more
+auto-registered detectors:
+
[source,json]
----
{
"encoding-detectors": [
{
"default-encoding-detector": {
- "exclude": ["bom-detector"]
+ "exclude": ["icu4j-encoding-detector"]
}
}
]
}
----
-[[opting-out-of-arbitration]]
-=== Strict first-match-wins (no JunkFilter arbitration)
+=== Specify the chain explicitly
-Omit JunkFilter to get plain first-match-wins (each base detector's top
-result wins in registration order):
+To replace the SPI-discovered chain with an explicit ordered list:
[source,json]
----
{
"encoding-detectors": [
- {"bom-detector": {}},
- {"metadata-charset-detector": {}},
{"html-encoding-detector": {}},
- {"mojibuster-encoding-detector": {}}
+ {"universal-encoding-detector": {}}
]
}
----
-Trade-off: lying declarations propagate unfiltered, and Mojibuster's
-statistical guess wins ties without any text-quality cross-check.
-See
-xref:../advanced/charset-detection-design.adoc#opting-out-of-arbitration[the
design doc].
-
=== Configure the HTML detector's read limit
-`html-encoding-detector` reads up to 65 536 bytes by default when
-scanning for the `<meta charset>` tag. Raise it if your documents embed
-large `<script>` blocks before the meta tag (TIKA-2485):
+`html-encoding-detector` reads up to 65 536 bytes by default when scanning
+for the `<meta charset>` tag. Raise it if your documents embed large
+`<script>` blocks before the meta tag (TIKA-2485):
[source,json]
----
{
"encoding-detectors": [
- {"bom-detector": {}},
- {"metadata-charset-detector": {}},
{
"html-encoding-detector": {
"markLimit": 131072
}
},
- {"mojibuster-encoding-detector": {}},
- {"junk-filter-encoding-detector": {}}
+ {"universal-encoding-detector": {}},
+ {"icu4j-encoding-detector": {}}
]
}
----
-=== Legacy chain (ICU4J + juniversalchardet)
+=== Use the spec-strict WHATWG HTML detector
-Not recommended — the NB pipeline is strictly better on accuracy and
-latency — but available for regression testing or comparison:
+If your input HTML has charset declarations inside comments (or other
+contexts where the lenient regex would false-match), opt in to the
+spec-strict prescan:
+
+[source,json]
+----
+{
+ "encoding-detectors": [
+ {"standard-html-encoding-detector": {}},
+ {"universal-encoding-detector": {}},
+ {"icu4j-encoding-detector": {}}
+ ]
+}
+----
+
+=== Add the Mojibuster + JunkFilter chain (opt-in)
+
+The byte-bigram NB classifier (`mojibuster-encoding-detector`) and the
+text-quality arbitrator (`junk-filter-encoding-detector`) are available as
+opt-in components. They require the `tika-encoding-detector-mojibuster`
+and `tika-ml-junkdetect` modules on the classpath:
[source,json]
----
{
"encoding-detectors": [
- {"bom-detector": {}},
- {"metadata-charset-detector": {}},
{"html-encoding-detector": {}},
- {"icu4j-encoding-detector": {}},
- {"universal-encoding-detector": {}}
+ {"mojibuster-encoding-detector": {}},
+ {"junk-filter-encoding-detector": {}}
]
}
----
+
+`junk-filter-encoding-detector` is a `MetaEncodingDetector` — it collects
+candidates from the other detectors and picks the cleanest decoding via a
+script-aware text-quality model. It must run last.
diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
index 281bd6d12d..1bda565f56 100644
--- a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
+++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
@@ -37,7 +37,7 @@ independent handler (any Tika parser):
* **`stdoutHandler`** -- processes stdout
* **`stderrHandler`** -- processes stderr
-* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}`
is used)
+* **`outputFileHandler`** -- processes the output file (when `$\{OUTPUT_FILE}`
is used)
Handlers extract metadata, content, or both. `regex-capture-parser` is the
most common choice for extracting metadata via regex patterns.
@@ -46,8 +46,8 @@ most common choice for extracting metadata via regex patterns.
The `contentSource` field controls which stream provides the XHTML text
content:
-* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command
-* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command
+* `"stdout"` -- default when no `$\{OUTPUT_FILE}` in the command
+* `"outputFile"` -- default when `$\{OUTPUT_FILE}` is in the command
* `"stderr"` -- use stderr as the content source
* `"none"` -- metadata-only mode, no text content extracted
@@ -63,7 +63,7 @@ configured, the raw bytes are written as text.
|`commandLine`
|`List<String>`
-|The command and arguments to run. Use `${INPUT_FILE}` and `${OUTPUT_FILE}`
tokens for file paths.
+|The command and arguments to run. Use `$\{INPUT_FILE}` and `$\{OUTPUT_FILE}`
tokens for file paths.
|`checkCommandLine`
|`List<String>`
diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
index a967c80421..7d65fe8637 100644
--- a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
+++ b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc
@@ -27,6 +27,8 @@ Before starting the release process, ensure you have:
* A valid GPG key published to a public keyserver
* Maven credentials configured in `~/.m2/settings.xml`
* Access to Apache's Nexus repository manager
+* SVN client (`svn`) — release candidates are uploaded to `dist.apache.org` via SVN, not scp
+* Internet access on first build — the Antora docs build downloads Node.js
into `~/.cache/tika-antora/` on first run (~100 MB, one-time per machine;
reused across clean builds)
== Pre-Release Checks
@@ -125,9 +127,16 @@ Execute the Maven release perform goal:
[source,bash]
----
-mvn release:perform
+mvn release:perform -Darguments="-DskipITs"
----
+`-DskipITs` skips integration tests during the inner build. Tests already
+ran in `release:prepare`'s `verify` phase; re-running them during perform is
+belt-and-suspenders, and some pipes/elasticsearch chaos-monkey tests are
+timing-sensitive enough to flake on a tagged build.
+
+If `release:perform` fails partway through, see <<troubleshooting>>.
+
Ensure you have valid Maven credentials in `~/.m2/settings.xml`:
[source,xml]
@@ -152,24 +161,40 @@ Ensure you have valid Maven credentials in
`~/.m2/settings.xml`:
=== Step 10: Upload Distribution Artifacts
-Upload artifacts to `dist.apache.org`:
+The release-plugin's antrun task assembles a dist directory at
+`target/checkout/target/X.Y.Z/` containing the source zip, app jar, server
+tarballs, and parser-package jars (each with `.asc` and `.sha512`).
+
+WARNING: At the end of `release:perform` you will see an echo telling you
+to `scp -r ... people.apache.org:public_html/tika/`. *Ignore that.* It
+is stale — the current ASF release distribution channel is the SVN repo
+under `dist.apache.org`, not `people.apache.org`.
+
+Check out the dist dev SVN repo and copy the prepared dist directory in:
[source,bash]
----
svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev
+cp -r target/checkout/target/X.Y.Z tika-dist-dev/
cd tika-dist-dev
+svn add X.Y.Z
+svn commit -m "Stage Apache Tika X.Y.Z RC<n>"
----
-Upload the following files with their signatures (.asc) and checksums
(.sha512):
+Verify the directory contains all expected artifacts (each with `.asc` and
+`.sha512`):
* `tika-X.Y.Z-src.zip`
* `tika-app-X.Y.Z.jar`
-* `tika-server-standard-X.Y.Z.jar`
+* `tika-server-standard-X.Y.Z.jar` (and `-bin.tgz`, `-bin.zip`)
+* `tika-parser-scientific-package-X.Y.Z.jar`
+* `tika-parser-sqlite3-package-X.Y.Z.jar`
+* `tika-parser-nlp-package-X.Y.Z.jar`
Also:
-* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt`
-* Ensure the `KEYS` file contains all contributor signatures
+* `CHANGES.txt` is already in the dist directory; rename it to `CHANGES-X.Y.Z.txt` if your local copy has not already been renamed
+* Ensure the `KEYS` file in the parent directory contains your GPG key
=== Step 11: Call the Vote
@@ -214,6 +239,92 @@ svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \
-m "Release Apache Tika X.Y.Z"
----
+[[troubleshooting]]
+== Troubleshooting `release:perform`
+
+The `release:perform` build can fail mid-way for reasons unrelated to the
+release itself. This section captures the recoveries learned during recent
+releases. Once these get fixed in the build (tracked in the
+to-fix-before-beta punch list), this section can be slimmed down.
+
+=== `tika-docs` assembly fails: "archive cannot be empty"
+
+[source]
+----
+[ERROR] Failed to create assembly: Error creating assembly archive docs:
+ archive cannot be empty
+----
+
+Cause: the Antora plugin is not auto-bound to the `package` phase, so
+`target/site/` is empty when `maven-assembly-plugin` runs.
+
+Recovery (resume from `tika-docs`):
+
+[source,bash]
+----
+cd target/checkout
+mvn deploy -Papache-release -rf :tika-docs -DskipITs
+----
+
+If the antora binding (the recommended fix in the to-fix-before-beta punch
+list) hasn't yet landed, you may need to manually build the site first:
+
+[source,bash]
+----
+cd target/checkout/docs
+mvn antora:antora
+cd ..
+mvn deploy -Papache-release -rf :tika-docs -DskipITs
+----
+
+=== Antrun error from a child module: "Could not find file ... -src.zip"
+
+[source]
+----
+Could not find file .../docs/target/X.Y.Z/tika-X.Y.Z-src.zip
+to generate checksum for.
+----
+
+Cause: the root-pom antrun execution lacks `<inherited>false</inherited>`,
+so it fires from each child module on a resumed deploy with `$\{basedir}`
+pointing at the wrong directory.
+
+Recovery (run the antrun once at the root):
+
+[source,bash]
+----
+cd target/checkout
+mvn deploy --non-recursive -Papache-release -Dmaven.deploy.skip=true
+----
+
+`--non-recursive` runs only the root pom; `-Dmaven.deploy.skip=true`
+prevents re-uploading the root pom artifact (already uploaded earlier).
+The antrun fires in the correct basedir and `target/X.Y.Z/` gets
+populated.
+
+=== Nexus staging repository: only one repo when I expected two
+
+If `release:perform` fails partway and you re-run it, you may see only one
+open staging repository on `repository.apache.org` even though both
+invocations uploaded artifacts. This is normal: while the staging repo is
+*open*, redeploys overwrite earlier artifacts. Confirm by checking the
+`Last Modified` timestamp on a representative artifact (e.g.
+`tika-core-X.Y.Z.jar`) — it should match the most recent run.
+
+When in doubt, drop the staging repo and run `release:perform` cleanly
+from scratch. It costs ~1 hour but yields a guaranteed single-build set
+of artifacts.
+
+=== gRPC distribution zip is huge (~600+ MB)
+
+The `tika-grpc-X.Y.Z.zip` artifact bundles every pipes plugin with its full
+transitive closure (microsoft-graph, gcs, az-blob, s3, kafka, etc.) plus
+multi-platform native libs (rocksdbjni, netty natives). Several hundred MB
+of that is duplication of dependencies already in the root `lib/`
+directory. This is a known issue tracked for cleanup before beta — see
+the to-fix-before-4.0.0-beta punch list. The release can ship as-is; the
+zip is correct, just bloated.
+
== Post-Release
=== Update Unreleased Modules
diff --git a/docs/modules/ROOT/pages/maintainers/site.adoc
b/docs/modules/ROOT/pages/maintainers/site.adoc
index 2a86d9231d..ce751cf4a1 100644
--- a/docs/modules/ROOT/pages/maintainers/site.adoc
+++ b/docs/modules/ROOT/pages/maintainers/site.adoc
@@ -28,31 +28,27 @@ The site supports multiple versions through Git branches
and includes client-sid
* Maven 3.9+
* Git
+* Internet access on first build — the Antora plugin downloads Node.js into
`~/.cache/tika-antora/` (~100 MB, one-time per machine; reused across clean
builds and across worktrees)
== Building the Site Locally
-To build the documentation locally:
+The `docs` module is only included in the reactor under the `apache-release`
profile. Build the site from the repo root:
[source,bash]
----
-cd docs
-mvn antora:antora
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
----
-The generated site will be at `docs/target/site/`.
+The generated site will be at `docs/target/site/`. The current git commit and
date are stamped automatically onto the home page (a generated copy of the
playbook lives at `docs/antora-playbook-stamped.yml` — gitignored).
-To stamp the build with the current commit hash (shown on the home page),
-add `git-commit` to the attributes in `antora-playbook.yml`:
+To skip the stamping or override the playbook:
-[source,yaml]
+[source,bash]
----
-asciidoc:
- attributes:
- git-commit: 'abc1234'
+# build directly with the unstamped playbook
+cd docs && mvn antora:antora -Dplaybook=antora-playbook.yml
----
-Or pass it on the command line when you have a playbook that supports CLI
attributes.
-
=== Previewing the Site
**Option 1: Python HTTP server (recommended)**
@@ -113,12 +109,13 @@ The playbook (`antora-playbook.yml`) is configured to
build all `docs/*` branche
=== Publishing to the Site
-Use `build-docs.sh` with the `--publish` flag to build and copy to the site
SVN checkout:
+Build the docs with Maven, then run `publish-docs.sh` to copy the output to a
tika-site SVN checkout (with URL flattening so `/docs/tika/X.Y.Z/...` becomes
`/docs/X.Y.Z/...`):
[source,bash]
----
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
# Then in the SVN checkout:
cd /path/to/tika-site
@@ -126,8 +123,9 @@ svn add publish/docs publish/_ --force
svn commit -m "Publish 4.0.0-SNAPSHOT docs"
----
-This builds the Antora site, stamps the git commit on the home page, and copies
-the output to the site with the correct directory layout:
+The Maven `package` step builds the Antora site (stamping the current git
+commit and date on the home page); `publish-docs.sh` copies the output to
+the site checkout with the correct directory layout:
* `publish/docs/4.0.0-SNAPSHOT/` -- the documentation pages
* `publish/_/` -- CSS, JS, fonts (shared across versions)
@@ -151,8 +149,9 @@ git commit -am "Set docs version to 4.0.0"
git push origin docs/4.0.0
# 4. Build and publish
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
# 5. Commit to SVN
cd /path/to/tika-site
@@ -177,8 +176,9 @@ git commit -am "Fix PDF parser example"
git push origin docs/4.0.0
# 4. Rebuild and republish
+./mvnw package -Papache-release -pl :tika-docs -DskipTests
cd docs
-./build-docs.sh --publish /path/to/tika-site/publish
+./publish-docs.sh /path/to/tika-site/publish
cd /path/to/tika-site
svn commit -m "Update 4.0.0 docs"
----
diff --git a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
index d7ea0fb5fa..d8e027d496 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc
@@ -121,7 +121,7 @@ mvn clean spotless:apply install
* Implement flexible component loading without `@TikaComponent` requirements
* Enable friendly name usage throughout the codebase
-* Resolve gRPC issues
* Fix external renderer byte-passing in open containers
* Simplify and strengthen serialization code
* Consider relocating `TikaConfig` and `ForkParser` to legacy module
+* Complete CLI integration for `tika-app` config dump
(`--dump-minimal-config`, `--dump-current-config`, `--dump-static-config`,
`--dump-static-full-config` are stubbed but not yet wired)
diff --git a/docs/modules/ROOT/pages/migration-to-4x/index.adoc
b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
index eebf29f3db..39675318c5 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/index.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/index.adoc
@@ -31,6 +31,8 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and
support schedules.
* xref:migration-to-4x/design-notes-4x.adoc[Design Notes] - Architectural
decisions and design rationale
* xref:migration-to-4x/serialization-4x.adoc[Serialization] - JSON
serialization design and implementation details
+* xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies] - Chunk
emission to Elasticsearch/OpenSearch from the inference pipeline
+* xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler
Requirements] - Why inference filters require MARKDOWN content handler
== TODOs / Missing Features in 4.x
diff --git a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
index 2f1eb24adb..7782e1cd63 100644
--- a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc
@@ -24,6 +24,28 @@ This section covers using Apache Tika via gRPC.
Tika gRPC provides a high-performance gRPC interface for parsing documents.
This is useful for microservices architectures and polyglot environments.
+The service definition lives in `tika-grpc/src/main/proto/tika.proto`. Clients
+register a fetcher (`SaveFetcher`) and then submit `FetchAndParseRequest`
+messages, each of which returns a `FetchAndParseReply` with extracted
+metadata and content.
+
+== Per-Request `ParseContext`
+
+`FetchAndParseRequest.parse_context_json` lets the caller override the
+server's default `ParseContext` on a per-request basis. Keys are
+parse-context component names; values are their JSON configs.
+
+[source,json]
+----
+{
+ "basic-content-handler-factory": {"type": "HTML"},
+ "timeout-limits": {"progressTimeoutMillis": 30000}
+}
+----
+
+See `META-INF/tika/parse-context.idx` (generated at build time from
+`@TikaComponent` annotations) for the available component names.
+
== Topics
// Add links to specific topics as they are created
diff --git a/docs/pom.xml b/docs/pom.xml
index 3759e2b7c4..84d570e554 100644
--- a/docs/pom.xml
+++ b/docs/pom.xml
@@ -64,34 +64,64 @@ under the License.
</filesets>
</configuration>
</plugin>
- <!-- Maven Assembly plugin to create tar.gz -->
+ <!-- Stamp the current git commit and build date onto a
+ sibling copy of antora-playbook.yml so the docs home
+ page shows which commit produced the site. The copy
+ lives next to the original so Antora's relative-path
+ resolution (e.g. ./supplemental-ui) still works. The
+ generated file is gitignored. -->
<plugin>
- <artifactId>maven-assembly-plugin</artifactId>
+ <artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
- <id>make-docs-archive</id>
- <phase>package</phase>
+ <id>stamp-antora-playbook</id>
+ <phase>generate-resources</phase>
<goals>
- <goal>single</goal>
+ <goal>run</goal>
</goals>
<configuration>
- <descriptors>
- <descriptor>src/assembly/docs.xml</descriptor>
- </descriptors>
-
<finalName>${project.artifactId}-${project.version}</finalName>
+ <target>
+ <exec executable="git"
outputproperty="git.commit.short" failonerror="true">
+ <arg value="rev-parse"/>
+ <arg value="--short"/>
+ <arg value="HEAD"/>
+ </exec>
+ <tstamp>
+ <format property="build.date"
pattern="yyyy-MM-dd"/>
+ </tstamp>
+ <copy
file="${project.basedir}/antora-playbook.yml"
+
tofile="${project.basedir}/antora-playbook-stamped.yml"
+ overwrite="true"/>
+ <replaceregexp
file="${project.basedir}/antora-playbook-stamped.yml"
+ match="(tika-stable-version:.*)"
+ replace="\1${line.separator}
git-commit: '${git.commit.short} (${build.date})'"
+ byline="true"/>
+ </target>
</configuration>
</execution>
</executions>
</plugin>
- <!-- Antora plugin for building the documentation site -->
+ <!-- Antora plugin for building the documentation site.
+ Bound to prepare-package so target/site/ exists before
+ the assembly runs.
+
+ nodeInstallDirectory is moved out of target/ so 'mvn
+ clean' does not nuke the downloaded Node.js. First
+ build per machine downloads ~100MB once; subsequent
+ builds reuse it. The plugin's nodeExecutable param
+ cannot reuse a system Node on distros where npm and
+ node packages aren't co-located (Debian/Ubuntu), so
+ caching the plugin-managed install is the practical
+ alternative. -->
<plugin>
<groupId>org.antora</groupId>
<artifactId>antora-maven-plugin</artifactId>
<version>${antora.version}</version>
<extensions>true</extensions>
<configuration>
- <playbook>antora-playbook.yml</playbook>
+
<nodeInstallDirectory>${user.home}/.cache/tika-antora</nodeInstallDirectory>
+ <playbook>antora-playbook-stamped.yml</playbook>
<packages>
<package>@antora/[email protected]</package>
</packages>
@@ -100,6 +130,36 @@ under the License.
<option>--log-failure-level=fatal</option>
</options>
</configuration>
+ <executions>
+ <execution>
+ <id>build-antora-site</id>
+ <phase>prepare-package</phase>
+ <goals>
+ <goal>antora</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Maven Assembly plugin to package the built site into a
+ tar.gz for release distribution. Runs after Antora. -->
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>make-docs-archive</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>src/assembly/docs.xml</descriptor>
+ </descriptors>
+
<finalName>${project.artifactId}-${project.version}</finalName>
+ </configuration>
+ </execution>
+ </executions>
</plugin>
</plugins>
</build>
diff --git a/docs/publish-docs.sh b/docs/publish-docs.sh
new file mode 100755
index 0000000000..fcbed4e995
--- /dev/null
+++ b/docs/publish-docs.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copies the already-built Antora site into a tika-site SVN checkout,
+# flattening URLs from /docs/tika/X.Y.Z/... to /docs/X.Y.Z/... so they
+# match the website layout.
+#
+# Usage:
+# ./publish-docs.sh /path/to/tika-site/publish
+#
+# Prerequisite: run 'mvn package -pl docs' first to populate target/site/.
+
+set -euo pipefail
+cd "$(dirname "$0")"
+
+PUBLISH_DIR="${1:?usage: publish-docs.sh <tika-site-publish-dir>}"
+DOCS_DIR="${PUBLISH_DIR}/docs"
+
+if [[ ! -d target/site ]]; then
+  echo "target/site/ not found." >&2
+  echo "Build the docs first: cd .. && ./mvnw package -pl docs" >&2
+  exit 1
+fi
+
+mkdir -p "${DOCS_DIR}" "${PUBLISH_DIR}/_"
+# Strip the 'tika/' component dir prefix so URLs are /docs/X.Y.Z/...
+cp -r target/site/tika/* "${DOCS_DIR}/"
+# UI assets one level above docs/, since HTML uses ../../_/ relative paths.
+# Copy the contents ('_/.'): 'cp -r _/ dst/_/' nests as _/_/ once dst/_ exists.
+cp -r target/site/_/. "${PUBLISH_DIR}/_/"
+# Fix the root redirect and sitemap to match the flattened layout
+sed 's|tika/||g' target/site/index.html > "${DOCS_DIR}/index.html"
+sed 's|/docs/tika/|/docs/|g' target/site/sitemap.xml > "${DOCS_DIR}/sitemap.xml"
+cp target/site/404.html "${DOCS_DIR}/"
+cp target/site/search-index.js "${DOCS_DIR}/"
+
+echo "Published to: ${DOCS_DIR}/"