This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch update-4x-docs in repository https://gitbox.apache.org/repos/asf/tika.git
commit 475d03c12c492c3e6eb2d2e898eec8a0855dcb34 Author: tallison <[email protected]> AuthorDate: Tue May 5 08:58:33 2026 -0400 update 4.x docs and docs process --- docs/build-docs.sh | 53 ------ .../ROOT/examples/migration-full-example.json | 2 +- docs/modules/ROOT/examples/pdf-parser-basic.json | 2 +- docs/modules/ROOT/examples/pdf-parser-full.json | 2 +- docs/modules/ROOT/examples/tesseract-basic.json | 2 +- docs/modules/ROOT/examples/tesseract-full.json | 2 +- docs/modules/ROOT/nav.adoc | 1 - .../pages/advanced/charset-detection-design.adoc | 2 +- .../ROOT/pages/advanced/junk-detection-build.adoc | 16 +- .../ROOT/pages/advanced/language-detection.adoc | 19 --- .../pages/configuration/encoding-detectors.adoc | 183 +++++++++++---------- .../configuration/parsers/external-parser.adoc | 8 +- .../pages/maintainers/release-guides/tika.adoc | 123 +++++++++++++- docs/modules/ROOT/pages/maintainers/site.adoc | 36 ++-- .../pages/migration-to-4x/design-notes-4x.adoc | 2 +- docs/modules/ROOT/pages/migration-to-4x/index.adoc | 2 + docs/modules/ROOT/pages/using-tika/grpc/index.adoc | 22 +++ docs/pom.xml | 82 +++++++-- 18 files changed, 343 insertions(+), 216 deletions(-) diff --git a/docs/build-docs.sh b/docs/build-docs.sh deleted file mode 100755 index 030ca1199d..0000000000 --- a/docs/build-docs.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Builds the Antora docs site with the current git commit stamped on the home page. -# Usage: ./build-docs.sh -# Output: target/site/ -# -# To publish to the tika-site SVN repo: -# ./build-docs.sh --publish /path/to/tika-site/publish - -set -euo pipefail -cd "$(dirname "$0")" - -COMMIT=$(git rev-parse --short HEAD) -DATE=$(date -u +%Y-%m-%d) - -# Inject commit into playbook, build, restore -sed -i "/tika-stable-version/a\\ git-commit: '${COMMIT} (${DATE})'" antora-playbook.yml -trap 'git checkout antora-playbook.yml' EXIT - -# Pass remaining args to Maven (filter out our --publish flag) -PUBLISH_DIR="" -MVN_ARGS=() -while [[ $# -gt 0 ]]; do - case $1 in - --publish) - PUBLISH_DIR="$2" - shift 2 - ;; - *) - MVN_ARGS+=("$1") - shift - ;; - esac -done - -../mvnw antora:antora "${MVN_ARGS[@]}" - -echo "Site built at: target/site/" -echo "Commit: ${COMMIT} (${DATE})" - -if [[ -n "${PUBLISH_DIR}" ]]; then - # Flatten: skip the 'tika/' component directory so URLs are /docs/4.0.0-SNAPSHOT/ - # Copy UI assets one level above docs/ since HTML uses ../../_/ relative paths - DOCS_DIR="${PUBLISH_DIR}/docs" - mkdir -p "${DOCS_DIR}" - cp -r target/site/tika/* "${DOCS_DIR}/" - cp -r target/site/_/ "${PUBLISH_DIR}/_/" - # Fix the root redirect to match flattened layout - sed 's|tika/||g' target/site/index.html > "${DOCS_DIR}/index.html" - sed 's|/docs/tika/|/docs/|g' target/site/sitemap.xml > "${DOCS_DIR}/sitemap.xml" - cp target/site/404.html "${DOCS_DIR}/" - cp target/site/search-index.js "${DOCS_DIR}/" - echo "Published to: ${DOCS_DIR}/" -fi diff --git a/docs/modules/ROOT/examples/migration-full-example.json b/docs/modules/ROOT/examples/migration-full-example.json index 05f93d7f23..7ce787b42d 120000 --- a/docs/modules/ROOT/examples/migration-full-example.json +++ b/docs/modules/ROOT/examples/migration-full-example.json @@ -1 +1 @@ -../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json \ No newline at end of file +../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/migration-full-example.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pdf-parser-basic.json b/docs/modules/ROOT/examples/pdf-parser-basic.json index b1a2ae805f..a0fa8b34ca 120000 --- a/docs/modules/ROOT/examples/pdf-parser-basic.json +++ b/docs/modules/ROOT/examples/pdf-parser-basic.json @@ -1 +1 @@ -../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json \ No newline at end of file +../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pdf-parser-full.json b/docs/modules/ROOT/examples/pdf-parser-full.json index 922388d57b..eeaa9e50c2 120000 --- a/docs/modules/ROOT/examples/pdf-parser-full.json +++ b/docs/modules/ROOT/examples/pdf-parser-full.json @@ -1 +1 @@ -../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json \ No newline at end of file +../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/tesseract-basic.json b/docs/modules/ROOT/examples/tesseract-basic.json index 1a508253c6..2844915265 120000 --- a/docs/modules/ROOT/examples/tesseract-basic.json +++ b/docs/modules/ROOT/examples/tesseract-basic.json @@ -1 +1 @@ -../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json \ No newline at end of file +../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/tesseract-full.json b/docs/modules/ROOT/examples/tesseract-full.json index c5d2e0c823..f1e5bc8bde 120000 --- a/docs/modules/ROOT/examples/tesseract-full.json +++ b/docs/modules/ROOT/examples/tesseract-full.json @@ -1 +1 @@ -../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json \ No newline at end of file +../../../../tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 16429e45bf..979555022a 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -46,7 +46,6 @@ ** xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies] ** xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler Requirements] * xref:advanced/index.adoc[Advanced] -** xref:advanced/charset-detection-design.adoc[Charset Detection Pipeline] ** xref:advanced/language-detection.adoc[Language Detection] ** xref:advanced/language-detection-build.adoc[Building the Language Detector] ** xref:advanced/junk-detection.adoc[Text Quality Scoring (Junk Detection)] diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc index 7c45d7ea23..fc870e5d51 100644 --- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc +++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc @@ -510,4 +510,4 @@ with feature hashing. The move to NB was driven by: == See also * Configuration: xref:../configuration/encoding-detectors.adoc[Configuring Encoding Detectors] -* Language detection: xref:language-detection.adoc[Language Detection] +* Language detection: xref:advanced/language-detection.adoc[Language Detection] diff --git a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc index 046099899f..c5397a7464 100644 --- a/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc +++ b/docs/modules/ROOT/pages/advanced/junk-detection-build.adoc @@ -58,11 +58,11 @@ Source data lives in one directory per language (ISO 639 code), each containing up to two files: `sentences_wikipedia.txt`:: - Line-numbered Wikipedia sentences: `{lineNum}{TAB}{text}`. + Line-numbered Wikipedia sentences: `\{lineNum}\{TAB}\{text}`. One sentence per line. `sentences_madlad.txt`:: - Line-numbered MADLAD-400 documents: `{lineNum}{TAB}{text}`. + Line-numbered MADLAD-400 documents: `\{lineNum}\{TAB}\{text}`. Documents contain literal two-character `\n` escape sequences as sub-sentence separators. The tool splits on these before processing. @@ -108,16 +108,16 @@ per script: |=== | File | Split | Purpose -| `{script}.train.gz` +| `\{script}.train.gz` | 80% | Bigram count accumulation in `TrainJunkModel`. -| `{script}.dev.gz` +| `\{script}.dev.gz` | 10% | Calibration (mu/sigma estimation) in `TrainJunkModel`. Also used for iterative evaluation during development. -| `{script}.test.gz` +| `\{script}.test.gz` | 10% | **Held out completely.** Use only for final reported evaluation numbers. Never use to make model or threshold decisions. @@ -146,7 +146,7 @@ Key options: | `--output-dir` | `~/datasets/madlad/junkdetect` -| Where to write `{script}.train.gz`, `.dev.gz`, `.test.gz`, and `manifest.tsv`. +| Where to write `\{script}.train.gz`, `.dev.gz`, `.test.gz`, and `manifest.tsv`. | `--total-budget-bytes` | `50000000` @@ -178,7 +178,7 @@ then calibrates z-score statistics from the `.dev.gz` file. === Bigram table training -[source] +[source,subs="-attributes"] ---- for each sentence in {script}.train.gz: utf8 = sentence.getBytes(UTF-8) @@ -197,7 +197,7 @@ a small but nonzero probability for novel byte sequences. === Calibration -For each sentence in `{script}.dev.gz`: +For each sentence in `\{script}.dev.gz`: [source] ---- diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc b/docs/modules/ROOT/pages/advanced/language-detection.adoc index 9fdbe3c551..b95e06eafb 100644 --- a/docs/modules/ROOT/pages/advanced/language-detection.adoc +++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc @@ -204,25 +204,6 @@ CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of( Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring` component loading. -== Generative Language Model - -In addition to the discriminative models above, Tika ships a -**generative character n-gram model** (`langdetect-generative-v4-20260320.bin`) that -answers a complementary question: _how language-like is this text?_ - -The generative model is used for: - -* **Charset detection tiebreaking** — when the discriminative model cannot - distinguish candidate charsets, the generative model picks the one that - produces the most language-like decoded text. -* **Text quality scoring** — the `tika-eval:languageness` metadata field - provides a z-score indicating how normal or garbled the extracted text is. -* **Training data filtering** — flagging bot-generated or mixed-language - sentences in training corpora. - -For full details, see -xref:advanced/generative-language-model.adoc[Generative Language Model]. - == Training the Models Training is fully reproducible from source. For step-by-step instructions, diff --git a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc index 8205f2145f..9ec15dc5a6 100644 --- a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc +++ b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc @@ -18,52 +18,44 @@ = Configuring Encoding Detectors Tika uses a chain of _encoding detectors_ to determine the character encoding -of plain text and HTML content. `DefaultEncodingDetector` loads detectors -via the Java service-provider interface (SPI) and runs them in registration -order. See xref:../advanced/charset-detection-design.adoc[Charset Detection Pipeline] -for design details. +of plain text and HTML content. `DefaultEncodingDetector` loads detectors via +the Java service-provider interface (SPI) and runs them in registration order; +the first non-null result wins. + +The default chain is `html-encoding-detector`, `universal-encoding-detector`, +and `icu4j-encoding-detector`. == Default Detection Chain -With the stock dependencies on the classpath: +With the stock dependencies on the classpath (the modules +`tika-encoding-detector-html`, `tika-encoding-detector-universal`, and +`tika-encoding-detector-icu4j`): -[cols="1,2,2"] +[cols="1,2,3"] |=== |Step |Detector |Returns non-null when… |1 -|`bom-detector` -|A UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE byte-order mark is present. -Emits DECLARATIVE. +|`html-encoding-detector` +|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag is +found. Fast lenient regex matcher with a curated subset of WHATWG label +aliases. |2 -|`metadata-charset-detector` -|A `charset=` parameter is present in the `Content-Type` metadata field -(populated from an HTTP response header or similar). Emits DECLARATIVE. +|`universal-encoding-detector` +|A state-machine structural prober (juniversalchardet fork) recognises the +byte pattern as a known encoding (UTF-8, GB18030, Big5, EUC-JP, several +ISO-8859 variants, etc.). |3 -|`mojibuster-encoding-detector` -|A structural UTF-32 check, structural UTF-16 specialist, UTF-8 grammar -gate, and 33-class byte-bigram Naive Bayes classifier. STRUCTURAL for -structural hits; STATISTICAL for NB predictions. - -|4 -|`html-encoding-detector` -|An HTML `<meta charset="…">` or `<meta http-equiv="Content-Type">` tag -is found (fast lenient regex matcher, curated WHATWG label aliases). -Emits DECLARATIVE. - -|5 -|`junk-filter-encoding-detector` -|`MetaEncodingDetector` — strips HTML/XML markup, decodes the probe under -each candidate charset, and picks the cleanest decoding via a script-aware -text-quality model. Honours a declaration when its decoding is -byte-identical to at least one other candidate's. Always runs last. +|`icu4j-encoding-detector` +|ICU4J's `CharsetDetector` returns a match. Catches additional single-byte +encodings (Windows code pages, IBM/EBCDIC variants, etc.). |=== -NOTE: `junk-filter-encoding-detector` is supplied by `tika-ml-junkdetect` -and SPI-loads when the module jar is on the classpath. Omit it for plain -first-match-wins — see <<opting-out-of-arbitration>>. +The chain is permissive — first-match-wins. A declared charset +(e.g. from a `<meta charset>` tag) wins over later structural or statistical +detectors. == Available Detectors @@ -74,128 +66,141 @@ referenced by their SPI name in JSON configuration. |=== |Name |Module |Description -|`bom-detector` -|`tika-core` -|Reads the first 4 bytes for BOM signatures. In the default chain. - -|`metadata-charset-detector` -|`tika-core` -|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from -the `Metadata` object. Applies WHATWG label normalization -(ISO-8859-1 and US-ASCII → windows-1252). In the default chain. - -|`mojibuster-encoding-detector` -|`tika-encoding-detector-mojibuster` -|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32 -and UTF-16 and a UTF-8 grammar gate. 33 classes including CJK -multi-byte, EBCDIC variants, DOS code pages, Cyrillic, Windows single-byte, -ISO-8859-3/16, Mac, and UTF-8. In the default chain. See -xref:../advanced/charset-detection-design.adoc[the design doc]. - |`html-encoding-detector` |`tika-encoding-detector-html` |Fast lenient regex matcher for `<meta charset>` / `http-equiv` tags, with a -curated subset of WHATWG label aliases. In the default chain. +curated subset of WHATWG label aliases. Auto-registered (in default chain). + +|`universal-encoding-detector` +|`tika-encoding-detector-universal` +|State-machine structural prober (juniversalchardet fork). Auto-registered +(in default chain). + +|`icu4j-encoding-detector` +|`tika-encoding-detector-icu4j` +|Wraps ICU4J's `CharsetDetector`. Auto-registered (in default chain). |`standard-html-encoding-detector` |`tika-encoding-detector-html` |Spec-strict WHATWG prescan algorithm. Not in the default chain — opt in explicitly if you need strict WHATWG tokenisation (e.g. ignoring charset -declarations inside comments or other contexts the lenient regex may match). +declarations inside HTML comments or other contexts the lenient regex may +match). + +|`mojibuster-encoding-detector` +|`tika-encoding-detector-mojibuster` +|Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32 +and UTF-16 and a UTF-8 grammar gate. Not in the default chain — opt in +explicitly. |`junk-filter-encoding-detector` |`tika-ml-junkdetect` -|Text-quality arbitrator (`MetaEncodingDetector`). In the default chain -when the module jar is on the classpath; always runs last. +|Text-quality arbitrator (`MetaEncodingDetector`) that picks among other +detectors' candidates by decode quality. Not in the default chain — opt in +explicitly. -|`icu4j-encoding-detector` -|`tika-encoding-detector-icu4j` -|Wraps ICU4J `CharsetDetector`. Legacy — the NB pipeline supersedes it -for most cases. Not auto-registered. +|`bom-detector` +|`tika-core` +|Reads the first 4 bytes for BOM signatures. Helper component, used +internally by `AutoDetectReader`. Not normally added to the SPI chain. -|`universal-encoding-detector` -|`tika-encoding-detector-universal` -|State-machine structural prober (juniversalchardet fork). Not -auto-registered; opt in if you specifically need it. +|`metadata-charset-detector` +|`tika-core` +|Reads declarative hints (`Content-Type` charset, `Content-Encoding`) from +the `Metadata` object. Helper component, used by parsers that consult +`Content-Type` directly. Not normally added to the SPI chain. |=== == Configuration Examples === Exclude a detector from the default chain +Use `default-encoding-detector` with an `exclude` list to drop one or more +auto-registered detectors: + [source,json] ---- { "encoding-detectors": [ { "default-encoding-detector": { - "exclude": ["bom-detector"] + "exclude": ["icu4j-encoding-detector"] } } ] } ---- -[[opting-out-of-arbitration]] -=== Strict first-match-wins (no JunkFilter arbitration) +=== Specify the chain explicitly -Omit JunkFilter to get plain first-match-wins (each base detector's top -result wins in registration order): +To replace the SPI-discovered chain with an explicit ordered list: [source,json] ---- { "encoding-detectors": [ - {"bom-detector": {}}, - {"metadata-charset-detector": {}}, {"html-encoding-detector": {}}, - {"mojibuster-encoding-detector": {}} + {"universal-encoding-detector": {}} ] } ---- -Trade-off: lying declarations propagate unfiltered, and Mojibuster's -statistical guess wins ties without any text-quality cross-check. -See -xref:../advanced/charset-detection-design.adoc#opting-out-of-arbitration[the design doc]. - === Configure the HTML detector's read limit -`html-encoding-detector` reads up to 65 536 bytes by default when -scanning for the `<meta charset>` tag. Raise it if your documents embed -large `<script>` blocks before the meta tag (TIKA-2485): +`html-encoding-detector` reads up to 65 536 bytes by default when scanning +for the `<meta charset>` tag. Raise it if your documents embed large +`<script>` blocks before the meta tag (TIKA-2485): [source,json] ---- { "encoding-detectors": [ - {"bom-detector": {}}, - {"metadata-charset-detector": {}}, { "html-encoding-detector": { "markLimit": 131072 } }, - {"mojibuster-encoding-detector": {}}, - {"junk-filter-encoding-detector": {}} + {"universal-encoding-detector": {}}, + {"icu4j-encoding-detector": {}} ] } ---- -=== Legacy chain (ICU4J + juniversalchardet) +=== Use the spec-strict WHATWG HTML detector -Not recommended — the NB pipeline is strictly better on accuracy and -latency — but available for regression testing or comparison: +If your input HTML has charset declarations inside comments (or other +contexts where the lenient regex would false-match), opt in to the +spec-strict prescan: + +[source,json] +---- +{ + "encoding-detectors": [ + {"standard-html-encoding-detector": {}}, + {"universal-encoding-detector": {}}, + {"icu4j-encoding-detector": {}} + ] +} +---- + +=== Add the Mojibuster + JunkFilter chain (opt-in) + +The byte-bigram NB classifier (`mojibuster-encoding-detector`) and the +text-quality arbitrator (`junk-filter-encoding-detector`) are available as +opt-in components. They require the `tika-encoding-detector-mojibuster` +and `tika-ml-junkdetect` modules on the classpath: [source,json] ---- { "encoding-detectors": [ - {"bom-detector": {}}, - {"metadata-charset-detector": {}}, {"html-encoding-detector": {}}, - {"icu4j-encoding-detector": {}}, - {"universal-encoding-detector": {}} + {"mojibuster-encoding-detector": {}}, + {"junk-filter-encoding-detector": {}} ] } ---- + +`junk-filter-encoding-detector` is a `MetaEncodingDetector` — it collects +candidates from the other detectors and picks the cleanest decoding via a +script-aware text-quality model. It must run last. diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc index 281bd6d12d..1bda565f56 100644 --- a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc +++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc @@ -37,7 +37,7 @@ independent handler (any Tika parser): * **`stdoutHandler`** -- processes stdout * **`stderrHandler`** -- processes stderr -* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}` is used) +* **`outputFileHandler`** -- processes the output file (when `$\{OUTPUT_FILE}` is used) Handlers extract metadata, content, or both. `regex-capture-parser` is the most common choice for extracting metadata via regex patterns. @@ -46,8 +46,8 @@ most common choice for extracting metadata via regex patterns. The `contentSource` field controls which stream provides the XHTML text content: -* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command -* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command +* `"stdout"` -- default when no `$\{OUTPUT_FILE}` in the command +* `"outputFile"` -- default when `$\{OUTPUT_FILE}` is in the command * `"stderr"` -- use stderr as the content source * `"none"` -- metadata-only mode, no text content extracted @@ -63,7 +63,7 @@ configured, the raw bytes are written as text. |`commandLine` |`List<String>` -|The command and arguments to run. Use `${INPUT_FILE}` and `${OUTPUT_FILE}` tokens for file paths. +|The command and arguments to run. Use `$\{INPUT_FILE}` and `$\{OUTPUT_FILE}` tokens for file paths. |`checkCommandLine` |`List<String>` diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc index a967c80421..7d65fe8637 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/tika.adoc @@ -27,6 +27,8 @@ Before starting the release process, ensure you have: * A valid GPG key published to a public keyserver * Maven credentials configured in `~/.m2/settings.xml` * Access to Apache's Nexus repository manager +* SVN client (`svn`) — release candidates upload to `dist.apache.org` via SVN, not scp +* Internet access on first build — the Antora docs build downloads Node.js into `~/.cache/tika-antora/` on first run (~100 MB, one-time per machine; reused across clean builds) == Pre-Release Checks @@ -125,9 +127,16 @@ Execute the Maven release perform goal: [source,bash] ---- -mvn release:perform +mvn release:perform -Darguments="-DskipITs" ---- +`-DskipITs` skips integration tests during the inner build. Tests already +ran in `release:prepare`'s `verify` phase; re-running them during perform is +belt-and-suspenders, and some pipes/elasticsearch chaos-monkey tests are +timing-sensitive enough to flake on a tagged build. + +If `release:perform` fails partway through, see <<troubleshooting>>. + Ensure you have valid Maven credentials in `~/.m2/settings.xml`: [source,xml] @@ -152,24 +161,40 @@ Ensure you have valid Maven credentials in `~/.m2/settings.xml`: === Step 10: Upload Distribution Artifacts -Upload artifacts to `dist.apache.org`: +The release-plugin's antrun task assembles a dist directory at +`target/checkout/target/X.Y.Z/` containing the source zip, app jar, server +tarballs, and parser-package jars (each with `.asc` and `.sha512`). + +WARNING: At the end of `release:perform` you will see an echo telling you +to `scp -r ... people.apache.org:public_html/tika/`. *Ignore that.* It +is stale — the current ASF release distribution channel is the SVN repo +under `dist.apache.org`, not `people.apache.org`. + +Check out the dist dev SVN repo and copy the prepared dist directory in: [source,bash] ---- svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev +cp -r target/checkout/target/X.Y.Z tika-dist-dev/ cd tika-dist-dev +svn add X.Y.Z +svn commit -m "Stage Apache Tika X.Y.Z RC<n>" ---- -Upload the following files with their signatures (.asc) and checksums (.sha512): +Verify the directory contains all expected artifacts (each with `.asc` and +`.sha512`): * `tika-X.Y.Z-src.zip` * `tika-app-X.Y.Z.jar` -* `tika-server-standard-X.Y.Z.jar` +* `tika-server-standard-X.Y.Z.jar` (and `-bin.tgz`, `-bin.zip`) +* `tika-parser-scientific-package-X.Y.Z.jar` +* `tika-parser-sqlite3-package-X.Y.Z.jar` +* `tika-parser-nlp-package-X.Y.Z.jar` Also: -* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt` -* Ensure the `KEYS` file contains all contributor signatures +* `CHANGES.txt` (already in the dist directory; rename to `CHANGES-X.Y.Z.txt` if your local copy hasn't been) +* Ensure the `KEYS` file at the parent directory contains your GPG key === Step 11: Call the Vote @@ -214,6 +239,92 @@ svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \ -m "Release Apache Tika X.Y.Z" ---- +[[troubleshooting]] +== Troubleshooting `release:perform` + +The `release:perform` build can fail mid-way for reasons unrelated to the +release itself. This section captures the recoveries learned during recent +releases. Once these get fixed in the build (tracked in the to-fix-before- +beta punch list), this section can be slimmed down. + +=== `tika-docs` assembly fails: "archive cannot be empty" + +[source] +---- +[ERROR] Failed to create assembly: Error creating assembly archive docs: + archive cannot be empty +---- + +Cause: the Antora plugin is not auto-bound to the `package` phase, so +`target/site/` is empty when `maven-assembly-plugin` runs. + +Recovery (resume from `tika-docs`): + +[source,bash] +---- +cd target/checkout +mvn deploy -Papache-release -rf :tika-docs -DskipITs +---- + +If the antora binding (the recommended fix in the to-fix-before-beta punch +list) hasn't yet landed, you may need to manually build the site first: + +[source,bash] +---- +cd target/checkout/docs +mvn antora:antora +cd .. +mvn deploy -Papache-release -rf :tika-docs -DskipITs +---- + +=== Antrun error from a child module: "Could not find file ... -src.zip" + +[source] +---- +Could not find file .../docs/target/X.Y.Z/tika-X.Y.Z-src.zip +to generate checksum for. +---- + +Cause: the root-pom antrun execution lacks `<inherited>false</inherited>`, +so it fires from each child module on a resumed deploy with `$\{basedir}` +pointing at the wrong directory. + +Recovery (run the antrun once at the root): + +[source,bash] +---- +cd target/checkout +mvn deploy --non-recursive -Papache-release -Dmaven.deploy.skip=true +---- + +`--non-recursive` runs only the root pom; `-Dmaven.deploy.skip=true` +prevents re-uploading the root pom artifact (already uploaded earlier). +The antrun fires in the correct basedir and `target/X.Y.Z/` gets +populated. + +=== Nexus staging repository: only one repo when I expected two + +If `release:perform` fails partway and you re-run it, you may see only one +open staging repository on `repository.apache.org` even though both +invocations uploaded artifacts. This is normal: while the staging repo is +*open*, redeploys overwrite earlier artifacts. Confirm by checking the +`Last Modified` timestamp on a representative artifact (e.g. +`tika-core-X.Y.Z.jar`) — it should match the most recent run. + +When in doubt, drop the staging repo and run `release:perform` cleanly +from scratch. It costs ~1 hour but yields a guaranteed single-build set +of artifacts. + +=== gRPC distribution zip is huge (~600+ MB) + +The `tika-grpc-X.Y.Z.zip` artifact bundles every pipes plugin with its full +transitive closure (microsoft-graph, gcs, az-blob, s3, kafka, etc.) plus +multi-platform native libs (rocksdbjni, netty natives). Several hundred MB +of that is duplication of dependencies already in the root `lib/` +directory. This is a known issue tracked for cleanup before beta — see +the to-fix-before-4.0.0-beta punch list. The release can ship as-is; the +zip is correct, just bloated. + == Post-Release === Update Unreleased Modules diff --git a/docs/modules/ROOT/pages/maintainers/site.adoc b/docs/modules/ROOT/pages/maintainers/site.adoc index 2a86d9231d..ce751cf4a1 100644 --- a/docs/modules/ROOT/pages/maintainers/site.adoc +++ b/docs/modules/ROOT/pages/maintainers/site.adoc @@ -28,31 +28,27 @@ The site supports multiple versions through Git branches and includes client-sid * Maven 3.9+ * Git +* Internet access on first build — the Antora plugin downloads Node.js into `~/.cache/tika-antora/` (~100 MB, one-time per machine; reused across clean builds and across worktrees) == Building the Site Locally -To build the documentation locally: +The `docs` module is only included in the reactor under the `apache-release` profile. Build the site from the repo root: [source,bash] ---- -cd docs -mvn antora:antora +./mvnw package -Papache-release -pl :tika-docs -DskipTests ---- -The generated site will be at `docs/target/site/`. +The generated site will be at `docs/target/site/`. The current git commit and date are stamped automatically onto the home page (a generated copy of the playbook lives at `docs/antora-playbook-stamped.yml` — gitignored). -To stamp the build with the current commit hash (shown on the home page), -add `git-commit` to the attributes in `antora-playbook.yml`: +To skip the stamping or override the playbook: -[source,yaml] +[source,bash] ---- -asciidoc: - attributes: - git-commit: 'abc1234' +# build directly with the unstamped playbook +cd docs && mvn antora:antora -Dplaybook=antora-playbook.yml ---- -Or pass it on the command line when you have a playbook that supports CLI attributes. - === Previewing the Site **Option 1: Python HTTP server (recommended)** @@ -113,12 +109,13 @@ The playbook (`antora-playbook.yml`) is configured to build all `docs/*` branche === Publishing to the Site -Use `build-docs.sh` with the `--publish` flag to build and copy to the site SVN checkout: +Build the docs with Maven, then run `publish-docs.sh` to copy the output to a tika-site SVN checkout (with URL flattening so `/docs/tika/X.Y.Z/...` becomes `/docs/X.Y.Z/...`): [source,bash] ---- +./mvnw package -Papache-release -pl :tika-docs -DskipTests cd docs -./build-docs.sh --publish /path/to/tika-site/publish +./publish-docs.sh /path/to/tika-site/publish # Then in the SVN checkout: cd /path/to/tika-site @@ -126,8 +123,9 @@ svn add publish/docs publish/_ --force svn commit -m "Publish 4.0.0-SNAPSHOT docs" ---- -This builds the Antora site, stamps the git commit on the home page, and copies -the output to the site with the correct directory layout: +The Maven `package` step builds the Antora site (stamping the current git +commit and date on the home page); `publish-docs.sh` copies the output to +the site checkout with the correct directory layout: * `publish/docs/4.0.0-SNAPSHOT/` -- the documentation pages * `publish/_/` -- CSS, JS, fonts (shared across versions) @@ -151,8 +149,9 @@ git commit -am "Set docs version to 4.0.0" git push origin docs/4.0.0 # 4. Build and publish +./mvnw package -Papache-release -pl :tika-docs -DskipTests cd docs -./build-docs.sh --publish /path/to/tika-site/publish +./publish-docs.sh /path/to/tika-site/publish # 5. Commit to SVN cd /path/to/tika-site @@ -177,8 +176,9 @@ git commit -am "Fix PDF parser example" git push origin docs/4.0.0 # 4. Rebuild and republish +./mvnw package -Papache-release -pl :tika-docs -DskipTests cd docs -./build-docs.sh --publish /path/to/tika-site/publish +./publish-docs.sh /path/to/tika-site/publish cd /path/to/tika-site svn commit -m "Update 4.0.0 docs" ---- diff --git a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc index d7ea0fb5fa..d8e027d496 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/design-notes-4x.adoc @@ -121,7 +121,7 @@ mvn clean spotless:apply install * Implement flexible component loading without `@TikaComponent` requirements * Enable friendly name usage throughout the codebase -* Resolve gRPC issues * Fix external renderer byte-passing in open containers * Simplify and strengthen serialization code * Consider relocating `TikaConfig` and `ForkParser` to legacy module +* Complete CLI integration for `tika-app` config dump (`--dump-minimal-config`, `--dump-current-config`, `--dump-static-config`, `--dump-static-full-config` are stubbed but not yet wired) diff --git a/docs/modules/ROOT/pages/migration-to-4x/index.adoc b/docs/modules/ROOT/pages/migration-to-4x/index.adoc index eebf29f3db..39675318c5 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/index.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/index.adoc @@ -31,6 +31,8 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and support schedules. * xref:migration-to-4x/design-notes-4x.adoc[Design Notes] - Architectural decisions and design rationale * xref:migration-to-4x/serialization-4x.adoc[Serialization] - JSON serialization design and implementation details +* xref:migration-to-4x/chunk-strategies.adoc[Chunk Strategies] - Chunk emission to Elasticsearch/OpenSearch from the inference pipeline +* xref:migration-to-4x/inference-handler-requirements.adoc[Inference Handler Requirements] - Why inference filters require MARKDOWN content handler == TODOs / Missing Features in 4.x diff --git a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc index 2f1eb24adb..7782e1cd63 100644 --- a/docs/modules/ROOT/pages/using-tika/grpc/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/grpc/index.adoc @@ -24,6 +24,28 @@ This section covers using Apache Tika via gRPC. Tika gRPC provides a high-performance gRPC interface for parsing documents. This is useful for microservices architectures and polyglot environments. +The service definition lives in `tika-grpc/src/main/proto/tika.proto`. Clients +register a fetcher (`SaveFetcher`) and then submit `FetchAndParseRequest` +messages, each of which returns a `FetchAndParseReply` with extracted +metadata and content. + +== Per-Request `ParseContext` + +`FetchAndParseRequest.parse_context_json` lets the caller override the +server's default `ParseContext` on a per-request basis. Keys are +parse-context component names; values are their JSON configs. + +[source,json] +---- +{ + "basic-content-handler-factory": {"type": "HTML"}, + "timeout-limits": {"progressTimeoutMillis": 30000} +} +---- + +See `META-INF/tika/parse-context.idx` (generated at build time from +`@TikaComponent` annotations) for the available component names. + == Topics // Add links to specific topics as they are created diff --git a/docs/pom.xml b/docs/pom.xml index 3759e2b7c4..84d570e554 100644 --- a/docs/pom.xml +++ b/docs/pom.xml @@ -64,34 +64,64 @@ under the License. </filesets> </configuration> </plugin> - <!-- Maven Assembly plugin to create tar.gz --> + <!-- Stamp the current git commit and build date onto a + sibling copy of antora-playbook.yml so the docs home + page shows which commit produced the site. The copy + lives next to the original so Antora's relative-path + resolution (e.g. ./supplemental-ui) still works. The + generated file is gitignored. --> <plugin> - <artifactId>maven-assembly-plugin</artifactId> + <artifactId>maven-antrun-plugin</artifactId> <executions> <execution> - <id>make-docs-archive</id> - <phase>package</phase> + <id>stamp-antora-playbook</id> + <phase>generate-resources</phase> <goals> - <goal>single</goal> + <goal>run</goal> </goals> <configuration> - <descriptors> - <descriptor>src/assembly/docs.xml</descriptor> - </descriptors> - <finalName>${project.artifactId}-${project.version}</finalName> + <target> + <exec executable="git" outputproperty="git.commit.short" failonerror="true"> + <arg value="rev-parse"/> + <arg value="--short"/> + <arg value="HEAD"/> + </exec> + <tstamp> + <format property="build.date" pattern="yyyy-MM-dd"/> + </tstamp> + <copy file="${project.basedir}/antora-playbook.yml" + tofile="${project.basedir}/antora-playbook-stamped.yml" + overwrite="true"/> + <replaceregexp file="${project.basedir}/antora-playbook-stamped.yml" + match="(tika-stable-version:.*)" + replace="\1${line.separator} git-commit: '${git.commit.short} (${build.date})'" + byline="true"/> + </target> </configuration> </execution> </executions> </plugin> - <!-- Antora plugin for building the documentation site --> + <!-- Antora plugin for building the documentation site. + Bound to prepare-package so target/site/ exists before + the assembly runs. + + nodeInstallDirectory is moved out of target/ so 'mvn + clean' does not nuke the downloaded Node.js. First + build per machine downloads ~100MB once; subsequent + builds reuse it. The plugin's nodeExecutable param + cannot reuse a system Node on distros where npm and + node packages aren't co-located (Debian/Ubuntu), so + caching the plugin-managed install is the practical + alternative. --> <plugin> <groupId>org.antora</groupId> <artifactId>antora-maven-plugin</artifactId> <version>${antora.version}</version> <extensions>true</extensions> <configuration> - <playbook>antora-playbook.yml</playbook> + <nodeInstallDirectory>${user.home}/.cache/tika-antora</nodeInstallDirectory> + <playbook>antora-playbook-stamped.yml</playbook> <packages> <package>@antora/[email protected]</package> </packages> @@ -100,6 +130,36 @@ under the License. <option>--log-failure-level=fatal</option> </options> </configuration> + <executions> + <execution> + <id>build-antora-site</id> + <phase>prepare-package</phase> + <goals> + <goal>antora</goal> + </goals> + </execution> + </executions> + </plugin> + + <!-- Maven Assembly plugin to package the built site into a + tar.gz for release distribution. Runs after Antora. --> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>make-docs-archive</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>src/assembly/docs.xml</descriptor> + </descriptors> + <finalName>${project.artifactId}-${project.version}</finalName> + </configuration> + </execution> + </executions> </plugin> </plugins> </build>
