This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4746 in repository https://gitbox.apache.org/repos/asf/tika.git
commit cde93bb66d3fd2e4737211b43ab015d3efef06de Author: tallison <[email protected]> AuthorDate: Mon Jun 1 06:34:36 2026 -0400 TIKA-4746 -- sweep docs --- docs/modules/ROOT/examples/claude-vlm-basic.json | 11 +- docs/modules/ROOT/examples/claude-vlm-full.json | 19 +- docs/modules/ROOT/examples/gemini-vlm-basic.json | 11 +- docs/modules/ROOT/examples/gemini-vlm-full.json | 19 +- docs/modules/ROOT/examples/openai-vlm-basic.json | 12 +- docs/modules/ROOT/examples/openai-vlm-full.json | 19 +- docs/modules/ROOT/examples/tess4j-basic.json | 11 +- docs/modules/ROOT/examples/tess4j-full.json | 19 +- docs/modules/ROOT/examples/vlm-pdf-parsing.json | 17 +- docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/advanced/index.adoc | 11 + .../integration-testing/run-uat-script.adoc | 2 +- .../integration-testing/tika-eval-regression.adoc | 2 +- .../ROOT/pages/advanced/language-detection.adoc | 67 ++--- .../ROOT/pages/advanced/setting-limits.adoc | 6 +- docs/modules/ROOT/pages/configuration/index.adoc | 51 +++- .../pages/configuration/parsers/tess4j-parser.adoc | 4 + .../pages/configuration/parsers/vlm-parsers.adoc | 12 + .../ROOT/pages/developers/serialization.adoc | 20 +- docs/modules/ROOT/pages/maintainers/index.adoc | 1 + .../pages/migration-to-4x/migrating-to-4x.adoc | 15 + .../pages/migration-to-4x/serialization-4x.adoc | 42 +++ docs/modules/ROOT/pages/pipes/configuration.adoc | 58 +++- docs/modules/ROOT/pages/pipes/getting-started.adoc | 4 +- docs/modules/ROOT/pages/pipes/index.adoc | 98 ------- docs/modules/ROOT/pages/pipes/iterators.adoc | 2 +- docs/modules/ROOT/pages/pipes/parse-modes.adoc | 32 ++- .../ROOT/pages/pipes/plugins/filesystem.adoc | 17 +- docs/modules/ROOT/pages/pipes/plugins/index.adoc | 4 +- docs/modules/ROOT/pages/pipes/timeouts.adoc | 34 ++- docs/modules/ROOT/pages/pipes/troubleshooting.adoc | 34 +++ docs/modules/ROOT/pages/pipes/unpack-config.adoc | 85 ++++-- docs/modules/ROOT/pages/using-tika/cli/index.adoc | 317 +++++++++++++++++---- 
docs/modules/ROOT/pages/using-tika/index.adoc | 1 + .../ROOT/pages/using-tika/server/index.adoc | 201 +++++++++---- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 99 ++++++- .../tika-parser-tess4j-module/pom.xml | 6 + .../ocr/tess4j/Tess4JConfigExamplesTest.java | 72 +++++ .../resources/config-examples/tess4j-basic.json | 11 + .../resources/config-examples/tess4j-full.json | 20 ++ tika-parsers/tika-parsers-ml/tika-vlm/pom.xml | 6 + .../tika/parser/vlm/VLMConfigExamplesTest.java | 98 +++++++ .../config-examples}/claude-vlm-basic.json | 0 .../config-examples}/claude-vlm-full.json | 4 +- .../config-examples}/gemini-vlm-basic.json | 0 .../config-examples}/gemini-vlm-full.json | 4 +- .../config-examples}/openai-vlm-basic.json | 0 .../config-examples}/openai-vlm-full.json | 5 +- .../config-examples}/vlm-pdf-parsing.json | 0 .../resources/config-examples/tesseract-full.json | 1 + .../apache/tika/parser/ocr/TesseractOCRConfig.java | 2 +- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 6 +- .../azblob/AZBlobPipesIteratorFactory.java | 6 +- .../iterator/csv/CSVPipesIteratorFactory.java | 6 +- .../fs/FileSystemPipesIteratorFactory.java | 6 +- .../iterator/gcs/GCSPipesIteratorFactory.java | 6 +- .../iterator/jdbc/JDBCPipesIteratorFactory.java | 6 +- .../json/JsonPipesIteratorFactory.java | 6 +- .../iterator/kafka/KafkaPipesIteratorFactory.java | 6 +- .../pipes/iterator/s3/S3PipesIteratorFactory.java | 6 +- .../iterator/solr/SolrPipesIteratorFactory.java | 6 +- .../apache/tika/plugins/ThreadSafeUnzipper.java | 13 + 62 files changed, 1179 insertions(+), 481 deletions(-) diff --git a/docs/modules/ROOT/examples/claude-vlm-basic.json b/docs/modules/ROOT/examples/claude-vlm-basic.json deleted file mode 100644 index 5931df09ed..0000000000 --- a/docs/modules/ROOT/examples/claude-vlm-basic.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "parsers": [ - { - "claude-vlm-parser": { - "apiKey": "sk-ant-your-key-here", - "model": "claude-sonnet-4-20250514" - } - } - ] -} diff --git 
a/docs/modules/ROOT/examples/claude-vlm-basic.json b/docs/modules/ROOT/examples/claude-vlm-basic.json new file mode 120000 index 0000000000..d20891ef0d --- /dev/null +++ b/docs/modules/ROOT/examples/claude-vlm-basic.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json b/docs/modules/ROOT/examples/claude-vlm-full.json deleted file mode 100644 index 9dc7ff67d3..0000000000 --- a/docs/modules/ROOT/examples/claude-vlm-full.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "parsers": [ - { - "claude-vlm-parser": { - "baseUrl": "https://api.anthropic.com", - "model": "claude-sonnet-4-20250514", - "prompt": "Extract all visible text from this image. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the image. Only return the extracted text.", - "maxTokens": 4096, - "timeoutSeconds": 300, - "apiKey": "sk-ant-your-key-here", - "inlineContent": true, - "skipOcr": false, - "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 - } - } - ] -} diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json b/docs/modules/ROOT/examples/claude-vlm-full.json new file mode 120000 index 0000000000..5392d5c689 --- /dev/null +++ b/docs/modules/ROOT/examples/claude-vlm-full.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json b/docs/modules/ROOT/examples/gemini-vlm-basic.json deleted file mode 100644 index a39ee9ed82..0000000000 --- a/docs/modules/ROOT/examples/gemini-vlm-basic.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "parsers": [ - { - "gemini-vlm-parser": { - "apiKey": "your-gemini-api-key", - "model": "gemini-2.5-flash" - } - } - ] -} diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json 
b/docs/modules/ROOT/examples/gemini-vlm-basic.json new file mode 120000 index 0000000000..a0354acbab --- /dev/null +++ b/docs/modules/ROOT/examples/gemini-vlm-basic.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json b/docs/modules/ROOT/examples/gemini-vlm-full.json deleted file mode 100644 index ab09b993f0..0000000000 --- a/docs/modules/ROOT/examples/gemini-vlm-full.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "parsers": [ - { - "gemini-vlm-parser": { - "baseUrl": "https://generativelanguage.googleapis.com", - "model": "gemini-2.5-flash", - "prompt": "Extract all visible text from this image. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the image. Only return the extracted text.", - "maxTokens": 4096, - "timeoutSeconds": 300, - "apiKey": "your-gemini-api-key", - "inlineContent": true, - "skipOcr": false, - "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 - } - } - ] -} diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json b/docs/modules/ROOT/examples/gemini-vlm-full.json new file mode 120000 index 0000000000..94c81ee02e --- /dev/null +++ b/docs/modules/ROOT/examples/gemini-vlm-full.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json b/docs/modules/ROOT/examples/openai-vlm-basic.json deleted file mode 100644 index f54d9063ab..0000000000 --- a/docs/modules/ROOT/examples/openai-vlm-basic.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "parsers": [ - { - "openai-vlm-parser": { - "baseUrl": "http://127.0.0.1:8000", - "model": "jinaai/jina-vlm", - "timeoutSeconds": 300 - } - } - ] -} diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json 
b/docs/modules/ROOT/examples/openai-vlm-basic.json new file mode 120000 index 0000000000..2a73403e3e --- /dev/null +++ b/docs/modules/ROOT/examples/openai-vlm-basic.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json b/docs/modules/ROOT/examples/openai-vlm-full.json deleted file mode 100644 index 91baafc74e..0000000000 --- a/docs/modules/ROOT/examples/openai-vlm-full.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "parsers": [ - { - "openai-vlm-parser": { - "baseUrl": "http://127.0.0.1:8000", - "model": "jinaai/jina-vlm", - "prompt": "Extract all visible text from this image. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the image. Only return the extracted text.", - "maxTokens": 4096, - "timeoutSeconds": 300, - "apiKey": "", - "inlineContent": true, - "skipOcr": false, - "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 - } - } - ] -} diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json b/docs/modules/ROOT/examples/openai-vlm-full.json new file mode 120000 index 0000000000..fe81340fd6 --- /dev/null +++ b/docs/modules/ROOT/examples/openai-vlm-full.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/tess4j-basic.json b/docs/modules/ROOT/examples/tess4j-basic.json deleted file mode 100644 index 3fc74587be..0000000000 --- a/docs/modules/ROOT/examples/tess4j-basic.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "parsers": [ - { - "name": "tess4j-parser", - "dataPath": "/usr/share/tesseract-ocr/5/tessdata", - "nativeLibPath": "/usr/lib/x86_64-linux-gnu", - "poolSize": 4 - } - ] -} diff --git a/docs/modules/ROOT/examples/tess4j-basic.json b/docs/modules/ROOT/examples/tess4j-basic.json new file 
mode 120000 index 0000000000..8be9e0b76a --- /dev/null +++ b/docs/modules/ROOT/examples/tess4j-basic.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/tess4j-full.json b/docs/modules/ROOT/examples/tess4j-full.json deleted file mode 100644 index c2d5170ecf..0000000000 --- a/docs/modules/ROOT/examples/tess4j-full.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "parsers": [ - { - "name": "tess4j-parser", - "dataPath": "/usr/share/tesseract-ocr/5/tessdata", - "nativeLibPath": "/usr/lib/x86_64-linux-gnu", - "language": "eng", - "pageSegMode": 1, - "ocrEngineMode": 3, - "poolSize": 4, - "timeoutSeconds": 120, - "dpi": 300, - "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 2147483647, - "skipOcr": false - } - ] -} diff --git a/docs/modules/ROOT/examples/tess4j-full.json b/docs/modules/ROOT/examples/tess4j-full.json new file mode 120000 index 0000000000..9df8ad46f7 --- /dev/null +++ b/docs/modules/ROOT/examples/tess4j-full.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json b/docs/modules/ROOT/examples/vlm-pdf-parsing.json deleted file mode 100644 index b76c0bbf35..0000000000 --- a/docs/modules/ROOT/examples/vlm-pdf-parsing.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "parsers": [ - { - "default-parser": { - "exclude": ["pdf-parser"] - } - }, - { - "claude-vlm-parser": { - "apiKey": "sk-ant-your-key-here", - "model": "claude-sonnet-4-20250514", - "prompt": "Extract all text from this document. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the document. Only return the extracted text." 
- } - } - ] -} diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json b/docs/modules/ROOT/examples/vlm-pdf-parsing.json new file mode 120000 index 0000000000..dd246c4856 --- /dev/null +++ b/docs/modules/ROOT/examples/vlm-pdf-parsing.json @@ -0,0 +1 @@ +../../../../tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 2fa628fef4..070f535ff8 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -27,6 +27,7 @@ ** xref:pipes/iterators.adoc[Iterators] ** xref:pipes/reporters.adoc[Reporters] ** xref:pipes/configuration.adoc[Pipeline Configuration] +** xref:pipes/shared-server-mode.adoc[Shared Server Mode] ** xref:pipes/parse-modes.adoc[Parse Modes] ** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] ** xref:pipes/timeouts.adoc[Timeouts] diff --git a/docs/modules/ROOT/pages/advanced/index.adoc b/docs/modules/ROOT/pages/advanced/index.adoc index 72d1252269..abd555de9c 100644 --- a/docs/modules/ROOT/pages/advanced/index.adoc +++ b/docs/modules/ROOT/pages/advanced/index.adoc @@ -19,6 +19,17 @@ This section covers advanced usage and internals of Apache Tika. +NOTE: Most pages here are written from a Java-API perspective. Where a topic +has a JSON-config or CLI equivalent, look first under +xref:configuration/index.adoc[Configuration] (per-parser options), +xref:pipes/index.adoc[Tika Pipes] (pipeline + Pipes-mode tuning), +xref:using-tika/server/index.adoc[Tika Server] (REST + server config), or +xref:using-tika/cli/index.adoc[Tika CLI] (`tika-app` flags). The +xref:advanced/setting-limits.adoc[Setting Limits] page is the model — it +covers Java, JSON, and CLI side by side. Filing issues against specific +advanced pages where the JSON/CLI equivalent isn't documented yet helps us +prioritize the gap. 
+ == Topics * xref:advanced/language-detection.adoc[Language Detection] - Built-in bigram language detector, training pipeline, and comparison with OpenNLP diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc index d8b44453d0..594e9e91ff 100644 --- a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc +++ b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc @@ -46,7 +46,7 @@ Coverage includes: * `/version`, `/parsers`, `/detectors`, `/mime-types` (introspection) * `/detect/stream` (mime detection) * `/tika`, `/tika/text`, `/tika/xml`, `/tika/json` (parse) -* `/meta`, `/meta/{field}` (metadata) +* `/meta`, `/meta/\{field}` (metadata) * `/rmeta`, `/rmeta/text` (recursive metadata) * `/unpack/all` (embedded extraction; verifies the response is a valid zip) * `/language/stream` diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc index 24460db449..a81f6fabd4 100644 --- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc +++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-eval-regression.adoc @@ -200,7 +200,7 @@ Options: * `-i` / `--inputDir` — original binary input directory (optional, lets tika-eval pair extracts to source files even if A or B failed on some) * `-d` / `--db` — H2 database name/path. A short label is fine — - tika-eval will create `{label}.mv.db` and a `{label}-reports/` dir + tika-eval will create `\{label}.mv.db` and a `\{label}-reports/` dir alongside. Persist the db if you want to re-run Report later. * `-r` / `--report` — automatically run the Report step after Compare, and zip the reports directory. 
diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc b/docs/modules/ROOT/pages/advanced/language-detection.adoc index b95e06eafb..120c2e320f 100644 --- a/docs/modules/ROOT/pages/advanced/language-detection.adoc +++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc @@ -153,56 +153,29 @@ back to the general model transparently. === Overriding Model Selection -The selection strategy can be overridden at construction time or per-document -via `ParseContext`: - -[source,java] ----- -// Always use the short-text model (e.g. for a title-only pipeline) -CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap( - Map.of("strategy", "SHORT_TEXT")); -CharSoupLanguageDetector detector = new CharSoupLanguageDetector(cfg); - -// Always use the general model (e.g. for full-document body text) -CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap( - Map.of("strategy", "STANDARD")); - -// Per-document override via ParseContext -ParseContext context = new ParseContext(); -context.set(CharSoupDetectorConfig.class, CharSoupDetectorConfig.fromMap( - Map.of("strategy", "SHORT_TEXT"))); -detector.reset(context); ----- - -The three strategies are: - -[cols="1,3"] -|=== -| Strategy | Behaviour - -| `AUTOMATIC` (default) -| Use length and feature-density gates to choose between models per chunk. - -| `SHORT_TEXT` -| Always use the short-text model (no-op if the binary is absent). - -| `STANDARD` -| Always use the general model regardless of input length. -|=== - -The thresholds can also be tuned via `CharSoupDetectorConfig`: - -[source,java] +The automatic gates (200-char length, 200-feature density) and the strategy +choice (always-short / always-general / automatic) are currently fixed in +`CharSoupLanguageDetector`. Programmatic knobs are limited to +`setMaxLength(int)` for the rolling buffer and `setPriors(Map)` for language +priors. 
+ +`CharSoupLanguageDetector` is registered as `@TikaComponent(name = +"charsoup-language-detector")` and implements `SelfConfiguring`, so once a +config record is added, declarative tuning will be available via the standard +`parse-context` JSON section: + +[source,json] ---- -CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of( - "strategy", "AUTOMATIC", - "lengthThreshold", 300, // chars; default 200 - "featureThreshold", 300 // n-gram emissions; default 200 -)); +{ + "parse-context": { + "charsoup-language-detector": { + /* future tuning fields will go here */ + } + } +} ---- -Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring` -component loading. +See xref:migration-to-4x/serialization-4x.adoc#discovering-friendly-names[Discovering the friendly name for a component] for how `SelfConfiguring` components are resolved at runtime. == Training the Models diff --git a/docs/modules/ROOT/pages/advanced/setting-limits.adoc b/docs/modules/ROOT/pages/advanced/setting-limits.adoc index a3af216175..a815ae6841 100644 --- a/docs/modules/ROOT/pages/advanced/setting-limits.adoc +++ b/docs/modules/ROOT/pages/advanced/setting-limits.adoc @@ -316,8 +316,10 @@ When the byte limit is reached: [source,json] ---- { - "parseContext": { - "parseMode": "UNPACK", + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { "unpack-config": { "maxUnpackBytes": 104857600 } diff --git a/docs/modules/ROOT/pages/configuration/index.adoc b/docs/modules/ROOT/pages/configuration/index.adoc index 85864b4727..068fc71c9f 100644 --- a/docs/modules/ROOT/pages/configuration/index.adoc +++ b/docs/modules/ROOT/pages/configuration/index.adoc @@ -16,28 +16,69 @@ // = Configuration +:toc: This section covers configuring Apache Tika. == Overview Tika 4.x uses JSON configuration files. Configuration controls parsers, detectors, -content handlers, and other components. +content handlers, server behavior, and the Tika Pipes pipeline. 
NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the xref:migration-to-4x/index.adoc[Migration Guide] for details on converting to JSON. +== Top-level JSON structure + +A `tika-config.json` is a single JSON object whose keys are the top-level sections +listed below. Every section is optional — omit what you don't need. Defaults are +used wherever a section is missing. + +[source,json] +---- +{ + "parsers": [ /* parser declarations */ ], + "detectors": [ /* detector declarations */ ], + "encoding-detectors": [ /* encoding detector declarations */ ], + "content-handler-factory": { /* handler type for emitted content */ }, + "parse-context": { + "timeout-limits": { /* progress + total task timeouts */ }, + "unpack-config": { /* embedded-byte extraction */ } + /* other SelfConfiguring components by component name */ + }, + "server": { /* tika-server options: enableUnsecureFeatures, cors, ... */ }, + "pipes": { /* Pipes process management: numClients, parseMode, ... */ }, + "fetchers": { /* named fetcher instances */ }, + "emitters": { /* named emitter instances */ }, + "pipes-iterator": { /* iterator (one per pipeline) */ }, + "pipes-reporters": { /* per-document status reporters */ }, + "plugin-roots": "/path/to/plugins" +} +---- + +Per-section documentation: + +* `parsers`, `detectors`, `encoding-detectors`, `content-handler-factory`, + `parse-context` — covered below under <<_topics,Topics>>. +* `server` — see xref:using-tika/server/index.adoc[Tika Server]. +* `pipes`, `fetchers`, `emitters`, `pipes-iterator`, `pipes-reporters`, + `plugin-roots` — see xref:pipes/configuration.adoc[Pipes Configuration] + and xref:pipes/index.adoc[Tika Pipes]. 
+ == Topics === Parser Configuration -* xref:configuration/parsers/pdf-parser.adoc[PDFParser] - PDF parsing options -* xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - OCR options for image-based text extraction +* xref:configuration/parsers/pdf-parser.adoc[PDFParser] — PDF parsing options +* xref:configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] — OCR options for image-based text extraction +* xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR Parser] — in-process OCR via tess4j JNI bindings +* xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers] — Claude, Gemini, OpenAI, Ollama, vLLM +* xref:configuration/parsers/external-parser.adoc[External Parser] — wrap external tools (ffmpeg, exiftool, etc.) === Other Configuration -* xref:configuration/digesters.adoc[Digesters] - Computing cryptographic hashes of documents -* xref:configuration/encoding-detectors.adoc[Encoding Detectors] - Configuring charset/encoding detection +* xref:configuration/digesters.adoc[Digesters] — Computing cryptographic hashes of documents +* xref:configuration/encoding-detectors.adoc[Encoding Detectors] — Configuring charset/encoding detection // Add links to specific topics as they are created // * xref:json-config.adoc[JSON Configuration Reference] diff --git a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc index fb52b1d6e0..4dccac2c4d 100644 --- a/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc +++ b/docs/modules/ROOT/pages/configuration/parsers/tess4j-parser.adoc @@ -177,6 +177,10 @@ throwing an exception. |`2147483647` (~2 GB) |Maximum input file size in bytes. Larger files are skipped. +|`maxImagePixels` +|`100000000` (100 megapixels) +|Maximum decoded-image area. Larger images are skipped. Guards against decompression-bomb inputs that would blow up memory before OCR even starts. 
+ |`skipOcr` |`false` |Runtime kill-switch to disable the parser entirely. diff --git a/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc b/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc index b2b8b454b3..c0f9d53d4b 100644 --- a/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc +++ b/docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc @@ -209,6 +209,18 @@ text into the parent document's content stream. Mirrors |`maxFileSizeToOcr` |`52428800` (50 MB) |Maximum input file size in bytes. + +|`maxImagePixels` +|`100000000` (100 megapixels) +|Maximum decoded image area in pixels. Larger images are rejected before being sent to the model. Set to `-1` to disable the limit. Guards against decompression-bomb inputs and runaway VLM cost on a single huge image. + +|`allowRuntimePrompt` +|`false` +|When `false` (default), the `prompt` is fixed at initialization time and per-request overrides are rejected. When `true`, the prompt can be overridden per request via the `ParseContext` `VLMOCRConfig`. Security-relevant: a runtime-controllable prompt is effectively a prompt-injection surface for any caller that can set the `ParseContext`. Only enable when callers are trusted. + +|`completionsPath` +|`/v1/chat/completions` (OpenAI/vLLM only) +|HTTP path appended to `baseUrl` for the chat-completions endpoint. Used by the OpenAI-compatible parser only. Claude and Gemini hardcode their own API paths (`/v1/messages` and `/v1beta/models/\{model}:generateContent` respectively) and ignore this field. 
|=== == Markdown-to-XHTML conversion diff --git a/docs/modules/ROOT/pages/developers/serialization.adoc b/docs/modules/ROOT/pages/developers/serialization.adoc index 6ec426b061..b68435d939 100644 --- a/docs/modules/ROOT/pages/developers/serialization.adoc +++ b/docs/modules/ROOT/pages/developers/serialization.adoc @@ -212,7 +212,7 @@ Benefits: [source,json] ---- { - "parseContext": { + "parse-context": { "pdf-parser": { "ocrStrategy": "AUTO", "extractInlineImages": true @@ -231,7 +231,7 @@ For components that need immediate deserialization (not lazy loading): [source,json] ---- { - "parseContext": { + "parse-context": { "typed": { "handler-config": { "type": "XML", @@ -299,23 +299,27 @@ public class UpperCaseFilter implements MetadataFilter { } ---- -Configure in JSON: +Configure in JSON. Metadata filters are loaded via `parse-context` (they +implement the `MetadataFilter` interface, which is a `ParseContext`-keyed +component): [source,json] ---- { - "metadata-filters": [ - {"upper-case-filter": {"fieldName": "dc:title"}} - ] + "parse-context": { + "upper-case-filter": {"fieldName": "dc:title"} + } } ---- -Or with defaults: +Or with defaults (an empty object constructs the filter with no overrides): [source,json] ---- { - "metadata-filters": ["upper-case-filter"] + "parse-context": { + "upper-case-filter": {} + } } ---- diff --git a/docs/modules/ROOT/pages/maintainers/index.adoc b/docs/modules/ROOT/pages/maintainers/index.adoc index 18735fea90..0452cd8f6a 100644 --- a/docs/modules/ROOT/pages/maintainers/index.adoc +++ b/docs/modules/ROOT/pages/maintainers/index.adoc @@ -16,6 +16,7 @@ // = For Maintainers +:toc: This section contains documentation for Apache Tika project maintainers and committers. 
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc index d36fd51c31..2942e18d32 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc @@ -25,6 +25,21 @@ See the xref:roadmap.adoc[Roadmap] for version timelines and support schedules. * Java 17 or later (upgraded from Java 11 in 3.x) +== `tika-app` distribution: jar -> zip + +In 3.x, `tika-app-<version>.jar` was a self-contained fat jar — you could drop it anywhere and run `java -jar tika-app.jar`. In 4.x it is a thin launcher that depends on the parsers, the Tika Pipes processor, and other modules living in an adjacent `lib/` directory. Running the bare jar by itself will fail with `NoClassDefFoundError`. + +Download `tika-app-<version>.zip` and run from inside the unzipped directory so `lib/` (and `plugins/`) sit alongside the jar: + +[source,bash] +---- +unzip tika-app-<version>.zip +cd tika-app-<version> +java -jar tika-app-<version>.jar [option...] [file...] +---- + +If you have build scripts or container images that drop in just the jar, update them to unpack the zip and run from inside it. + == Configuration: XML to JSON Tika 4.x uses JSON configuration files instead of XML. The legacy `tika-config.xml` format diff --git a/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc index e11bdc4959..8afafbe55e 100644 --- a/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc +++ b/docs/modules/ROOT/pages/migration-to-4x/serialization-4x.adoc @@ -31,6 +31,48 @@ Jackson dependencies are kept out of core modules to maintain flexibility. Implementation uses friendly names like `pdf-parser` rather than full class names. These friendly names are applied to configured items rather than configuration class names. 
+[#discovering-friendly-names] +==== Discovering the friendly name for a component + +The 4.x JSON config refers to parsers, detectors, fetchers, emitters, and other components +by their friendly name (e.g., `pdf-parser`, `file-system-fetcher`). To map a Java class +to its friendly name (or vice versa), use any of: + +. **`tika-app --list-parser-names` / `--list-detector-names`** — emits each + registered class with its friendly name as tab-separated `class<TAB>friendly-name`: ++ +[source,bash] +---- +java -jar tika-app.jar --list-parser-names +# org.apache.tika.parser.pdf.PDFParser pdf-parser +# org.apache.tika.parser.html.JSoupParser jsoup-parser +# ... +---- ++ +The mapping comes from the `META-INF/tika/parsers.idx` / `detectors.idx` files +generated at compile time by the `@TikaComponent` annotation processor. The +underlying lookup is `o.a.t.config.loader.ComponentRegistry.getFriendlyName(Class)`. +. **Per-parser configuration pages** under xref:configuration/index.adoc[Configuration] + show the friendly name in their page title and JSON examples. +. **The naming convention** — when `@TikaComponent` has no explicit `name`, the + friendly name is derived from the class's simple name via the kebab-case rule + in `o.a.t.config.loader.KebabCaseConverter`. Examples: ++ +[cols="2,2"] +|=== +|Class |Friendly name + +|`PDFParser` |`pdf-parser` +|`TesseractOCRParser` |`tesseract-ocr-parser` +|`AutoDetectParser` |`auto-detect-parser` +|`FileSystemFetcher` |`file-system-fetcher` +|`SolrEmitter` |`solr-emitter` +|=== + +NOTE: The `--list-parsers`, `--list-detectors`, and `--list-parser-details` commands +print the hierarchical, human-oriented view (class names with composite parsers +indented). Use the `--list-*-names` variants when you want a machine-readable mapping. 
+ === Custom Class Support The design permits users to add custom classes through Jackson's polymorphic handling: diff --git a/docs/modules/ROOT/pages/pipes/configuration.adoc b/docs/modules/ROOT/pages/pipes/configuration.adoc index e4e3d0b1c2..7049456f24 100644 --- a/docs/modules/ROOT/pages/pipes/configuration.adoc +++ b/docs/modules/ROOT/pages/pipes/configuration.adoc @@ -84,6 +84,14 @@ See also xref:pipes/timeouts.adoc[Timeouts] for the full timeout model. |`maxWaitForClientMillis` |`60000` |Maximum time (ms) to wait for an available forked process when all are busy. + +|`staleFetcherTimeoutSeconds` +|`600` +|How long (seconds) a fetcher-emitter pairing can sit idle in the cache before it is eligible for eviction. Increase if your pipeline has long quiet periods between tuples that reuse the same fetcher/emitter. + +|`staleFetcherDelaySeconds` +|`60` +|How often (seconds) the stale-fetcher reaper runs. |=== == Parse Behavior @@ -131,7 +139,53 @@ These settings control how parsed results are batched before sending to emitters |`emitIntermediateResults` |`false` -|Emit partial results as they become available (rather than waiting for the full parse to complete). +|When `false`, only successfully-parsed tuples reach the emitter — files that crash, time out, or otherwise fail are dropped from the output. When `true`, every tuple is emitted, including failures (the metadata carries the exception). Turn this on if you need a complete record of what was attempted (audit, retry logic, chaos-monkey tests). +|=== + +== Emit Strategy + +`emitStrategy` controls whether parsed extracts are emitted directly from the forked PipesServer or passed back to the parent process first. The default is balanced for typical workloads — tune only if you have a memory or throughput problem. 
+ +[source,json] +---- +{ + "pipes": { + "emitStrategy": { + "type": "DYNAMIC", + "thresholdBytes": 100000 + } + } +} +---- + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`type` +|`DYNAMIC` +|One of `DYNAMIC`, `EMIT_ALL`, `PASSBACK_ALL`. `DYNAMIC` switches per-extract based on size (see `thresholdBytes`). `EMIT_ALL` always emits from the forked process. `PASSBACK_ALL` always passes extracts back to the parent for emission. + +|`thresholdBytes` +|`100000` +|Only used when `type` is `DYNAMIC`. Extracts larger than this are emitted directly from the forked PipesServer; smaller ones are passed back to the parent. Setting `thresholdBytes` with type `EMIT_ALL` or `PASSBACK_ALL` is a config error. +|=== + +== Distributed Config Store + +For multi-host pipelines (e.g., shared-server clusters) you can store fetcher/emitter configuration in a distributed backend instead of memory. Most users should leave the defaults. + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`configStoreType` +|`"memory"` +|Backend for storing fetcher/emitter configurations. `"memory"` (default) is in-process; `"ignite"` uses Apache Ignite for shared state across nodes. + +|`configStoreParams` +|`"{}"` +|JSON object (as a string) with backend-specific parameters. Structure depends on `configStoreType`. |=== == Shared Server Mode (Experimental) @@ -162,7 +216,7 @@ include::example$pipes-fs-pipeline.json[] icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View source on GitHub] -Tokens (`FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`) are substituted by the test harness — replace them with real paths in production configs. +Tokens (`FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`, `EMIT_INTERMEDIATE_RESULTS`) are substituted by the test harness — replace them with real values in production configs. 
The first three are paths; `EMIT_INTERMEDIATE_RESULTS` is the boolean `emitIntermediateResults` flag. [#emit-all] === Emit-all variant diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc b/docs/modules/ROOT/pages/pipes/getting-started.adoc index e52e02f1ac..db6955aeb7 100644 --- a/docs/modules/ROOT/pages/pipes/getting-started.adoc +++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc @@ -66,13 +66,13 @@ include::example$pipes-fs-pipeline.json[] ---- icon:github[] https://github.com/apache/tika/blob/main/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json[View source on GitHub] -NOTE: The values shown like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and `PLUGINS_PATHS` are placeholders the integration tests substitute at runtime. Replace them with real paths in your own config. +NOTE: Values such as `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`, and `EMIT_INTERMEDIATE_RESULTS` are placeholders that the integration tests substitute at runtime. In your own config, replace the path tokens with real paths and `EMIT_INTERMEDIATE_RESULTS` with the boolean `true` or `false`. Run it with: [source,bash] ---- -java -jar tika-app.jar --config tika-config.json -i /data/input -o /data/output +java -jar tika-app.jar --config=tika-config.json -i /data/input -o /data/output ---- NOTE: The `-i` and `-o` flags override the `basePath` values in the config when used diff --git a/docs/modules/ROOT/pages/pipes/index.adoc b/docs/modules/ROOT/pages/pipes/index.adoc index 7bd2078238..8037b8857f 100644 --- a/docs/modules/ROOT/pages/pipes/index.adoc +++ b/docs/modules/ROOT/pages/pipes/index.adoc @@ -52,104 +52,6 @@ against problematic files. 
* xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] -- extract raw bytes from embedded documents * xref:pipes/timeouts.adoc[Timeouts] -- two-tier timeout system for handling long-running and hung parsers -== Emitters - -=== ES Emitter (`es-emitter`) - -The ES emitter sends parsed documents to any ES-compatible REST API (ES 7+/8+) via -the `_bulk` endpoint. It uses plain HTTP (Apache HttpClient) — there is no dependency -on the ES Java client, which carries a non-ASL license. - -[source,json] ----- -"emitters": { - "my-es": { - "es-emitter": { - "esUrl": "https://localhost:9200/my-index", - "idField": "_id", - "attachmentStrategy": "SEPARATE_DOCUMENTS", - "updateStrategy": "UPSERT", - "embeddedFileFieldName": "embedded", - "apiKey": "<base64-encoded id:api_key>" - } - } -} ----- - -[cols="1,1,3"] -|=== -|Field |Default |Description - -|`esUrl` -|_required_ -|Full URL including the index name, e.g. `https://localhost:9200/my-index` - -|`idField` -|`_id` -|Metadata field used as the document `_id` - -|`attachmentStrategy` -|`SEPARATE_DOCUMENTS` -|How embedded documents are stored. `SEPARATE_DOCUMENTS` gives each embedded -file its own flat document. `PARENT_CHILD` uses an ES join field so embedded -files are linked to their container via `relation_type`. - -|`updateStrategy` -|`OVERWRITE` -|`OVERWRITE` uses a bulk `index` action (full replace). -`UPSERT` uses a bulk `update` / `doc_as_upsert` action (field-level merge). - -|`embeddedFileFieldName` -|`embedded` -|Name of the join-field relation used in `PARENT_CHILD` mode. - -|`apiKey` -|_none_ -|Base64-encoded `id:api_key` sent as `Authorization: ApiKey <value>`. -Takes precedence over `httpClientConfig` basic auth. - -|`httpClientConfig` -|_none_ -|Optional block for `userName`, `password`, `authScheme`, `connectionTimeout`, -`socketTimeout`, `proxyHost`, `proxyPort`, and `verifySsl` (boolean, default `false`). 
-|=== - -[WARNING] -==== -By default (`verifySsl: false`) TLS certificate verification is disabled — all -certificates are trusted and hostname verification is skipped. Set -`httpClientConfig.verifySsl: true` to enable proper certificate and hostname -validation using the JVM's default trust store. When `verifySsl` is `false`, -do not transmit credentials over plain HTTP in production; prefer HTTPS with -network-level controls (VPN, private endpoint) until verification is enabled. -==== - -=== ES Pipes Reporter (`es-pipes-reporter`) - -The ES reporter writes per-document parse status back into the same index, -so you can query the processing outcome alongside the extracted content. - -[source,json] ----- -"pipes-reporters": { - "es-pipes-reporter": { - "esUrl": "https://localhost:9200/my-index", - "keyPrefix": "tika_", - "includeRouting": false - } -} ----- - -The reporter adds `<keyPrefix>parse_status`, `<keyPrefix>parse_time_ms`, -and (when the forked JVM exits abnormally) `<keyPrefix>exit_value` fields -to each document via an upsert. - -=== OpenSearch Emitter - -The OpenSearch emitter is configured identically but uses `opensearch-emitter` as the -plugin key and `openSearchUrl` as the URL field. It also ships with an -`opensearch-pipes-reporter`. - == Advanced Topics * xref:pipes/shared-server-mode.adoc[Shared Server Mode] - Experimental mode for reduced memory usage diff --git a/docs/modules/ROOT/pages/pipes/iterators.adoc b/docs/modules/ROOT/pages/pipes/iterators.adoc index a3e3bc7292..f58f5a1fbb 100644 --- a/docs/modules/ROOT/pages/pipes/iterators.adoc +++ b/docs/modules/ROOT/pages/pipes/iterators.adoc @@ -34,7 +34,7 @@ The iterator runs on its own thread; the pipeline reads tuples as fast as the wo [#wiring] == Wiring an Iterator Into a Pipeline -The iterator lives under the singular top-level `pipes-iterator` key. The inner map key is the iterator's component name. 
`fetcherId` and `emitterId` are *flat fields* on the iterator config — they are not wrapped in a `baseConfig` block. +The iterator lives under the singular top-level `pipes-iterator` key. The inner map key is the iterator's component name. `fetcherId` and `emitterId` are flat fields on the iterator config, alongside the iterator-specific options: [source,json] ---- diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc index 69ba2204cd..ab81227ac9 100644 --- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc +++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc @@ -19,9 +19,10 @@ :toc: :toclevels: 3 -Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted. -The parse mode is configured in the `pipes` section of the JSON config, or overridden per-request -in the `parseContext` field of a `FetchEmitTuple`. +Tika Pipes uses parse modes to control how documents are parsed and how results are emitted. +The mode is set as `parseMode` in the `pipes` section of the JSON config, and can be overridden +per-request from Java code by attaching a `ParseMode` to the `ParseContext` on the +`FetchEmitTuple` you submit. == Available Parse Modes @@ -51,9 +52,24 @@ See <<no-parse-mode>>. == Content Handler Types -The content handler type determines the format of the extracted text. It is set on the -`ContentHandlerFactory` configured in `parseContext` (or via the CLI `--handler` flag), and applies -to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`). +The content handler type determines the format of the extracted text. It is set in the +top-level `content-handler-factory` section of the JSON config (or via the CLI `--handler` flag), +and applies to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`). 
+ +[source,json] +---- +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT" + } + } +} +---- + +Accepted `type` values: `TEXT`, `HTML`, `XML`, `MARKDOWN`, `BODY`, `IGNORE`. The CLI +`--handler` flag uses single-letter shortcuts (`t`, `h`, `x`, `m`, `b`, `i`) that map onto +these values. [cols="1,1,2"] |=== @@ -147,7 +163,7 @@ only `X-TIKA:content` and `X-TIKA:container_exception`. If you set your own === CLI usage -The `tika-app` batch processor supports `CONTENT_ONLY` via the `--content-only` +The `tika-app` Pipes processor supports `CONTENT_ONLY` via the `--content-only` flag: [source,bash] @@ -166,7 +182,7 @@ extracted markdown content. See <<_content_handler_types>> for the available han [source,json] ---- { - "parseContext": { + "pipes": { "parseMode": "NO_PARSE" } } diff --git a/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc index 85fba5889e..034a5d7b93 100644 --- a/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc +++ b/docs/modules/ROOT/pages/pipes/plugins/filesystem.adoc @@ -44,7 +44,7 @@ The File System plugin (`tika-pipes-file-system`) is the most common starting po == Complete Pipeline Example -The example below is the canonical filesystem-to-filesystem integration test config. Tokens like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, and `PLUGINS_PATHS` are placeholders the test harness substitutes; replace them with real paths in your own config. +The example below is the canonical filesystem-to-filesystem integration test config. Tokens like `FETCHER_BASE_PATH`, `EMITTER_BASE_PATH`, `PLUGINS_PATHS`, and `EMIT_INTERMEDIATE_RESULTS` are placeholders the test harness substitutes; replace the path tokens with real paths and `EMIT_INTERMEDIATE_RESULTS` with the boolean `true` or `false`. See xref:pipes/configuration.adoc[Pipes Configuration] for what each setting does. 
[source,json,subs=none] ---- @@ -208,7 +208,7 @@ Maintains a JSON status file that summarizes pipeline progress. The reporter wri |`statusFile` |_required_ -|Path of the JSON status file. The file is created on first write and overwritten in place. +|Path of the JSON status file. Absolute paths are written as given; relative paths resolve against the JVM's working directory at startup. Parent directories that don't exist are created automatically on first write. Always include a parent component (e.g., `./status.json` rather than bare `status.json`) — the auto-create step fails on a path with no parent. The file is created on first write and overwritten in place. |`reportUpdateMs` |_no default_ @@ -219,14 +219,17 @@ Maintains a JSON status file that summarizes pipeline progress. The reporter wri The reporter serializes an `AsyncStatus` object to JSON, containing: +* `started` — ISO-8601 timestamp of when the reporter was constructed. +* `lastUpdate` — ISO-8601 timestamp of the most recent write. * `asyncStatus` — current pipeline phase (`STARTED`, `COMPLETED`, `CRASHED`). -* `counts` — map of `RESULT_STATUS` to count (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`, `TIMEOUT`, `OOM`). -* `totalCountResult` — total documents processed and whether the enumeration is complete. -* `timestamp` — when the file was last written. -* `crashMessage` — populated only on fatal pipeline failure. +* `statusCounts` — map of `PipesResult.RESULT_STATUS` to count (e.g., `PARSE_SUCCESS`, `PARSE_EXCEPTION`, `TIMEOUT`, `OOM`, `EMIT_SUCCESS`, `EMIT_EXCEPTION`). +* `totalCountResult` — total documents discovered by the iterator and whether the enumeration is complete. +* `crashMessage` — empty string under normal operation; populated with a stack trace on fatal pipeline failure. The file is rewritten in full on each tick, not appended. +NOTE: The write is **not** atomic — the reporter opens the target path with `Files.newBufferedWriter`, truncates, and streams the JSON. 
A watcher reading concurrently with a write can observe a truncated or partial document. Have the watcher treat a parse error as "stale read, try again on the next poll" rather than as a real error. + [#watching] === Live status for watching applications @@ -237,7 +240,7 @@ The reporter is designed to support external "watchers" — UIs, dashboards, or "reportUpdateMs": 250 ---- -The watcher polls `statusFile` on its own interval and reads the most recent snapshot. Because the file is rewritten in full with the latest status, watchers do not need to handle partial reads. +The watcher polls `statusFile` on its own interval and reads the most recent snapshot. Each tick rewrites the file in full, so successive snapshots are always coherent — but because the write is not atomic, a watcher reading mid-write can see a truncated document. Tolerate JSON parse errors as transient and retry on the next poll (see the NOTE under <<_status_file_schema,Status file schema>>). This pattern is used by `tika-gui-v2` to drive its progress UI: the GUI starts a pipeline subprocess, points the reporter at a temp file, and polls that file every few hundred milliseconds. 
diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc b/docs/modules/ROOT/pages/pipes/plugins/index.adoc index d5173d2032..9846a4a1ea 100644 --- a/docs/modules/ROOT/pages/pipes/plugins/index.adoc +++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc @@ -68,7 +68,7 @@ Many plugins implement more than one (e.g., the S3 plugin provides fetcher, emit |— |✓ -|xref:pipes/plugins/solr.adoc[Solr] +|xref:pipes/plugins/solr.adoc[Apache Solr] |— |✓ |✓ @@ -80,7 +80,7 @@ Many plugins implement more than one (e.g., the S3 plugin provides fetcher, emit |✓ |✓ -|xref:pipes/plugins/kafka.adoc[Kafka] +|xref:pipes/plugins/kafka.adoc[Apache Kafka] |— |✓ |✓ diff --git a/docs/modules/ROOT/pages/pipes/timeouts.adoc b/docs/modules/ROOT/pages/pipes/timeouts.adoc index 77008aa9b9..bbb94abfd5 100644 --- a/docs/modules/ROOT/pages/pipes/timeouts.adoc +++ b/docs/modules/ROOT/pages/pipes/timeouts.adoc @@ -154,11 +154,41 @@ For processing many small documents where you want fast failure: == CLI Usage -When using `tika-app` with `--fork`, the `--fork-timeout` flag sets `progressTimeoutMillis`: +=== Standard mode (single file) + +For single-document parsing, `--fork` runs the parser in a forked JVM and `--fork-timeout` (milliseconds) caps how long it may run: + +[source,bash] +---- +java -jar tika-app.jar --fork --fork-timeout=120000 document.pdf +---- + +=== Pipes mode (`-i` / `-o`) + +In Pipes mode the parser ALREADY runs in forked JVMs — that's what `numClients` controls — so `--fork` does not apply. Setting it on the command line is silently ignored because tika-app routes `-i`/`-o` straight into the async dispatcher before its standard-mode flags are processed. 
+ +Set per-parse timeouts in your `tika-config.json` instead: + +[source,json] +---- +{ + "pipes": { + "numClients": 4 + }, + "parse-context": { + "timeout-limits": { + "progressTimeoutMillis": 120000, + "totalTaskTimeoutMillis": 3600000 + } + } +} +---- + +Then run: [source,bash] ---- -java -jar tika-app.jar --fork --fork-timeout=120000 -i /input -o /output +java -jar tika-app.jar --config=tika-config.json -i /input -o /output ---- == Living Code Reference diff --git a/docs/modules/ROOT/pages/pipes/troubleshooting.adoc b/docs/modules/ROOT/pages/pipes/troubleshooting.adoc index d7e53de937..87077e8758 100644 --- a/docs/modules/ROOT/pages/pipes/troubleshooting.adoc +++ b/docs/modules/ROOT/pages/pipes/troubleshooting.adoc @@ -192,6 +192,40 @@ response-body bytes for HTTP-style fetchers (configurable via log catches the thrown exception. Lower `maxErrMsgSize` -- or set it to zero -- if your responses can contain sensitive data. +== Logging + +Tika uses https://logging.apache.org/log4j/2.x/[Log4j 2] for both tika-app and tika-server. Default output goes to `SYSTEM_ERR` with the pattern `%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n`. Each forked PipesServer logs with its own line prefix so parent and child output stays distinguishable; see <<_telling_fork_lines_from_parent_lines,Telling fork lines from parent lines>>. + +=== Default log4j2 configuration + +Each distribution ships its own `log4j2.xml` bundled inside the jar: + +* tika-app: `org/apache/tika/cli/log4j2.xml` (in `tika-app-<version>.jar`). +* tika-server: `org/apache/tika/server/log4j2.xml` (in the relevant `tika-server-*.jar`). + +Root level defaults to `INFO`. The bundled configurations are the source of truth — pull them out of the jar if you want to see exactly which loggers are tuned. + +=== Changing the log level + +In order of increasing reach: + +. **`tika-app` `-v` / `--verbose`** — sets the root logger to `DEBUG` for the current invocation only. Cheapest knob if you just want a noisier one-off run. +. 
**`tika-server` `logLevel` config field** — set `"server": {"logLevel": "debug"}` (or `"info"`) in `tika-config.json`. Applied at server startup. +. **Custom `log4j2.xml`** — for fine-grained control (per-logger levels, custom appenders, JSON output, file rotation), supply your own configuration via the standard Log4j 2 system property: ++ +[source,bash] +---- +java -Dlog4j.configurationFile=/path/to/my-log4j2.xml -jar tika-app.jar ... +---- ++ +Your file overrides the bundled one entirely. Start from a copy of the bundled config and tighten or relax loggers from there. + +=== Forked-process logging + +Forked PipesServer JVMs inherit the parent's log4j2 configuration unless `tika.pipes.server.stdio=discard` is set (in which case all child stdout/stderr is suppressed at the OS level — see <<_configuration_knobs_reference,Configuration knobs reference>>). + +To debug a specific fork, leave stdio on `inherit` (the default) and grep parent log output for the `pipesClientId=<n>` marker that each fork includes. + == Configuration knobs reference [cols="2,3"] diff --git a/docs/modules/ROOT/pages/pipes/unpack-config.adoc b/docs/modules/ROOT/pages/pipes/unpack-config.adoc index f3bc1fe5f4..5ad9615301 100644 --- a/docs/modules/ROOT/pages/pipes/unpack-config.adoc +++ b/docs/modules/ROOT/pages/pipes/unpack-config.adoc @@ -15,32 +15,62 @@ // limitations under the License. // -= UnpackConfig: Extracting Embedded Document Bytes += unpack-config: Extracting Embedded Document Bytes When processing container files (ZIP, DOCX, PDF with attachments, etc.), you may want to -extract the raw bytes of embedded documents in addition to parsing them. `UnpackConfig` -controls how embedded bytes are extracted and emitted. +extract the raw bytes of embedded documents in addition to parsing them. The +`unpack-config` component (Java: `UnpackConfig`) controls how embedded bytes are +extracted and emitted. 
== Quick Start -Use `ParseMode.UNPACK` to automatically extract embedded document bytes: +To turn on byte extraction for every document the pipeline processes, set +`parseMode` to `UNPACK` in the `pipes` section of your `tika-config.json`. +That's the minimum configuration — extraction defaults are fine for most cases. [source,json] ---- { - "id": "doc1", - "fetchKey": {"fetcherId": "fsf", "fetchKey": "container.docx"}, - "emitKey": {"emitterId": "fse", "emitKey": "container.docx"}, - "parseContext": { + "pipes": { "parseMode": "UNPACK" } } ---- +To tune extraction (size limits, naming, ZIP output, etc.), add an `unpack-config` +block under the top-level `parse-context` section. All the options listed below +live inside that block: + +[source,json] +---- +{ + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { + "unpack-config": { + "maxUnpackBytes": 104857600, + "zipEmbeddedFiles": true + } + } +} +---- + This extracts both metadata (like `RMETA` mode) and embedded document bytes. +[NOTE] +==== +You can also set `UnpackConfig` programmatically per request from Java code by +calling `parseContext.set(UnpackConfig.class, ...)` on the `ParseContext` +attached to your `FetchEmitTuple`. The JSON `parse-context` section above is the +declarative equivalent. +==== + == Configuration Options +All options below are fields of the `unpack-config` block — nest them inside +`parse-context.unpack-config` as shown in the Quick Start. 
+ [cols="2,1,2,3"] |=== |Property |Type |Default |Description @@ -105,7 +135,7 @@ Extract embedded bytes with default naming: [source,json] ---- { - "parseContext": { + "pipes": { "parseMode": "UNPACK" } } @@ -118,8 +148,10 @@ Collect all embedded files into a ZIP with metadata: [source,json] ---- { - "parseContext": { - "parseMode": "UNPACK", + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { "unpack-config": { "zipEmbeddedFiles": true, "includeMetadataInZip": true, @@ -136,8 +168,10 @@ Control output file naming: [source,json] ---- { - "parseContext": { - "parseMode": "UNPACK", + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { "unpack-config": { "zeroPadName": 8, "suffixStrategy": "DETECTED", @@ -156,8 +190,10 @@ Prevent unbounded extraction from malicious files: [source,json] ---- { - "parseContext": { - "parseMode": "UNPACK", + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { "unpack-config": { "maxUnpackBytes": 104857600 } @@ -200,13 +236,15 @@ manifest with file checksums and MIME types, making it easy to verify and proces === Enabling Frictionless Output -Set `outputFormat` to `FRICTIONLESS` in your UnpackConfig: +Set `outputFormat` to `FRICTIONLESS` in your `unpack-config`: [source,json] ---- { - "parseContext": { - "parseMode": "UNPACK", + "pipes": { + "parseMode": "UNPACK" + }, + "parse-context": { "unpack-config": { "outputFormat": "FRICTIONLESS", "includeFullMetadata": true @@ -256,13 +294,20 @@ The `datapackage.json` file contains: === CLI Usage -Extract files in Frictionless format using the CLI: +Extract files in Frictionless format using the CLI. 
The `-Z` flag turns on recursive +unpack (the Pipes-mode counterpart of standard-mode `-z`), and `-i`/`-o` are the +Pipes input/output directories: [source,bash] ---- -java -jar tika-app.jar --unpack --unpack-format=FRICTIONLESS -i input.docx -o output/ +java -jar tika-app.jar -Z --unpack-format=FRICTIONLESS -i /path/to/input -o /path/to/output ---- +NOTE: `-i` expects a directory of containers to unpack, not a single file. The +standard-mode `-z`/`--extract` flag is not a single-file alternative: as of 4.x +it also routes through the Pipes machinery and expects an input directory. To +unpack one document, put it in its own directory and pass that directory to `-i`. + == Code Examples For working code examples, see: diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc index 594828fc78..a5284985b6 100644 --- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc @@ -20,12 +20,22 @@ WARNING: The tika-app command line interface is still in flux for 4.x. Options and behavior may change before the final release. -This section covers using Apache Tika from the command line via `tika-app`. +This section covers using Apache Tika from the command line via `tika-app`. The +authoritative option list is `java -jar tika-app.jar --help` — this page mirrors +that output and adds usage context. If the two disagree, `--help` wins; please +file a ticket. == Overview The Tika application (`tika-app`) is a command line utility for extracting -text content and metadata from all sorts of files. +text content and metadata from all sorts of files. It operates in three modes: + +* **Standard mode** — parse a single file, URL, or stdin and write the result + to stdout. +* **GUI mode** — `--gui` launches a desktop window for drag-and-drop parsing. +* **Tika Pipes mode** — process many documents from a directory (or S3, GCS, + Azure, JDBC, etc.) via the asynchronous Pipes pipeline. 
Activated by any of + the Pipes-only flags listed below. == Installation @@ -54,7 +64,13 @@ unzipped distribution. java -jar tika-app.jar [option...] [file|port...] ---- -== Command Line Options +If no file or URL is given (or `-` is given), `tika-app` parses standard input. +If no arguments are given at all and no stdin is piped in, the GUI launches. + +== Standard-mode Options + +These options apply to single-document parsing (the default mode). For Pipes-mode +options see <<_tika_pipes_processing,Tika Pipes Processing>> below. === Help and Information @@ -63,29 +79,36 @@ java -jar tika-app.jar [option...] [file|port...] |Option |Description |`-?` or `--help` -|Display usage instructions +|Print the usage message |`-v` or `--verbose` -|Enable debug-level output +|Print debug-level messages |`-V` or `--version` -|Show version details +|Print the Apache Tika version |=== -=== Operation Modes +=== GUI [cols="1,3"] |=== |Option |Description |`-g` or `--gui` -|Launch the graphical interface +|Launch the graphical interface (drag-and-drop parsing) +|=== -|`-s` or `--server` -|Start the web server +=== Configuration -|`-f` or `--fork` -|Enable fork mode for isolated extraction +[cols="1,3"] +|=== +|Option |Description + +|`--config=<tika-config.json>` +|TikaConfig file (JSON as of Tika 4.x). Must appear before `-g` or `-f`. + +|`--convert-config-xml-to-json=<input.xml>` +|Convert a legacy 3.x XML config to 4.x JSON format (parsers section only) and write to stdout. Redirect to save, e.g. `--convert-config-xml-to-json=tika-config.xml > tika-config.json`. |=== === Output Formatting @@ -95,22 +118,150 @@ java -jar tika-app.jar [option...] [file|port...] 
|Option |Description |`-x` or `--xml` -|Output XHTML (default) +|Output XHTML content (default) |`-h` or `--html` -|Output HTML +|Output HTML content |`-t` or `--text` -|Output plain text +|Output plain text content (body) |`--md` -|Output Markdown +|Output Markdown content (body) + +|`-T` or `--text-main` +|Output plain text — main content only, via the boilerpipe handler + +|`-A` or `--text-all` +|Output all text content |`-m` or `--metadata` |Output metadata only |`-j` or `--json` -|Output JSON metadata +|Output metadata in JSON + +|`-y` or `--xmp` +|Output metadata in XMP + +|`-J` or `--jsonRecursive` +|Output metadata and content from all embedded files. Combine with `-x`/`-h`/`-t`/`-m` to choose the content type (default: `-x`). + +|`-r` or `--pretty-print` +|For JSON, XML, and XHTML output, add newlines and whitespace for readability. + +|`-e<X>` or `--encoding=<X>` +|Use output encoding `<X>` (e.g. `UTF-8`). +|=== + +=== Detection and Language + +[cols="1,3"] +|=== +|Option |Description + +|`-d` or `--detect` +|Detect the document type and print the media type. + +|`-l` or `--language` +|Detect and print only the language. +|=== + +=== Content Options + +[cols="1,3"] +|=== +|Option |Description + +|`-p<X>` or `--password=<X>` +|Use document password `<X>` (for encrypted PDFs, OOXML, etc.). + +|`--digest=<X>` +|Include a digest of the parsed bytes. Supported: `md2`, `md5`, `sha1`, `sha256`, `sha384`, `sha512`, `sha3_256`, `sha3_384`, `sha3_512`. See xref:configuration/digesters.adoc[Digesters] for the underlying providers. +|=== + +=== Attachment Extraction (single-document) + +[cols="1,3"] +|=== +|Option |Description + +|`-z` or `--extract` +|Extract all attachments into the current directory. + +WARNING: As of 4.x `-z` routes through the async (Pipes) machinery, which expects an input directory, not a single file. 
Single-file attachment extraction is currently broken in this mode — see <<_tika_pipes_processing,Tika Pipes Processing>> below for the working `-Z` alternative. + +|`--extract-dir=<dir>` +|Target directory for `-z`. + +|`--on-exists=<mode>` +|Behavior when an output file already exists: `exception` (default), `replace`, or `skip`. + +|`--maxEmbeddedDepth=<X>` +|Maximum depth for embedded document extraction. + +|`--maxEmbeddedCount=<X>` +|Maximum number of embedded documents to extract. +|=== + +=== Async Mode + +[cols="1,3"] +|=== +|Option |Description + +|`-a` or `--async` +|Run Tika in async mode. Requires a `tikaConfig` file describing the pipeline. Activates Tika Pipes mode — see below. +|=== + +=== Listing and Inspection + +[cols="1,3"] +|=== +|Option |Description + +|`--list-parsers` +|List the available document parsers. + +|`--list-parser-details` +|List the available parsers and their supported mime types. + +|`--list-parser-details-apt` +|Same as `--list-parser-details` in apt format. + +|`--list-detectors` +|List the available document detectors. + +|`--list-met-models` +|List the available metadata models and their supported keys. + +|`--list-supported-types` +|List all known media types and related information. + +|`--compare-file-magic=<dir>` +|Compare Tika's known media types to the `file(1)` tool's magic directory. +|=== + +=== Fork Mode (process isolation) + +Fork mode parses the document in a separate JVM, protecting the main process +from parser crashes, OOM, and timeouts. + +[cols="1,3"] +|=== +|Option |Description + +|`-f` or `--fork` +|Run parsing in a forked JVM process. + +|`--fork-timeout=<ms>` +|Parse timeout in milliseconds (default: 60000). + +|`--fork-jvm-args=<args>` +|JVM args for the forked process, comma-separated. Example: `--fork-jvm-args=-Xmx512m,-Dsome.prop=value`. + +|`--fork-plugins-dir=<dir>` +|Directory containing plugin zips for the forked process. 
|=== == Examples @@ -129,31 +280,23 @@ java -jar tika-app.jar --text document.pdf java -jar tika-app.jar --json document.docx ---- -=== Pipeline processing - -Extract text from a remote document and search for keywords: +=== Extract Markdown from a file [source,bash] ---- -curl http://example.com/document.doc | java -jar tika-app.jar --text | grep -q keyword +java -jar tika-app.jar --md document.docx ---- -=== Tika Pipes processing +=== Reading from stdin -Process many documents by specifying input and output paths. Inputs can be a -local directory, S3, GCS, Azure, JDBC, and others via Tika Pipes fetchers: +Extract text from a remote document and search for keywords: [source,bash] ---- -java -jar tika-app.jar -i /path/to/input -o /path/to/output +curl http://example.com/document.doc | java -jar tika-app.jar --text | grep -q keyword ---- -=== Extract Markdown from a file - -[source,bash] ----- -java -jar tika-app.jar --md document.docx ----- +`tika-app` reads from standard input when no file argument is given (or when `-` is given). For batch processing of many documents, see <<_tika_pipes_processing,Tika Pipes Processing>> below. === Custom configuration @@ -178,13 +321,25 @@ it switches into Pipes mode so you can confirm which path is running. * Two positional arguments are given and the first is an existing directory (`tika-app.jar /in /out`). -* Any of these options are present: `-i`, `-o`, `--input`, `--output`, - `--fileList`, `-z`/`-Z`/`--extract`/`--extract-dir`, or `-a`/`--async`. * A single `.json` argument is given — it is treated as a Tika Pipes config file. +* Any of these options are present: `-i`, `--input`, `-o`, `--output`, + `--fileList`, `-z`, `--extract`, `--extract-dir`, `-Z`, or `-a`/`--async`. -Anything else (single file, URL, stdin, `--gui`, `--server`) stays in standard +Anything else (single file, URL, stdin, `--gui`) stays in standard single-document mode. 
+NOTE: The activation list mixes standard-mode and Pipes-only flags (`-z`, +`--extract`, `--extract-dir`). Passing one of those with a single file routes +into Pipes mode and then fails because the async dispatcher expects an input +directory. If you want unpack-while-pipes behaviour, use the Pipes-specific +`-Z` instead. + +CAUTION: Use the GNU-style double-dash form for long flags. `--input /path` +works; `-input /path` (single dash plus the long name) does not — `tika-app` +rejects single-dash long names with an `IllegalArgumentException` pointing +you at the right form. Single-letter short flags use one dash +(e.g., `-i`, `-eUTF-8`, `-X512m`). + === Basic Pipes Usage [source,bash] @@ -192,41 +347,92 @@ single-document mode. java -jar tika-app.jar -i /path/to/input -o /path/to/output ---- -This processes all files in the input directory and writes JSON metadata (RMETA format) -to the output directory. +This processes all files in the input directory and writes JSON metadata +(RMETA format) to the output directory. === Tika Pipes Options +==== Input and output + [cols="1,3"] |=== |Option |Description -|`-i` -|Input directory +|`-i` or `--input=<dir>` +|Input directory. + +|`-o` or `--output=<dir>` +|Output directory. + +|`--fileList=<path>` +|File list (one path per line, relative to `-i` or absolute). + +|`--on-exists=<mode>` +|Behavior when an output file already exists: `exception` (default), `replace`, or `skip`. +|=== + +==== Output formatting -|`-o` -|Output directory +[cols="1,3"] +|=== +|Option |Description -|`--handler` -|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore (default: `t`) +|`--handler=<X>` +|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore. Default: `t`. |`--concatenate` -|Concatenate content from all embedded documents into a single content field +|Concatenate content from all embedded documents into a single content field. 
|`--content-only` -|Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate` +|Output only the extracted content (no metadata, no JSON wrapper). Implies `--concatenate`. +|=== + +==== Execution + +[cols="1,3"] +|=== +|Option |Description + +|`-n` or `--numClients=<N>` +|Number of parallel forked processes. + +|`-X<size>` +|`-Xmx` size for the forked processes (e.g. `-X512m`). + +|`-T` or `--timeoutMs=<ms>` +|Timeout for each parse in milliseconds. +|=== + +==== Configuration + +[cols="1,3"] +|=== +|Option |Description + +|`-c` or `--config=<file>` +|Tika config file. `--config=<file>` (the standard-mode long form) also works in Pipes mode. + +|`-p` or `--pluginsDir=<dir>` +|Plugins directory. +|=== + +==== Unpack (recursive attachment extraction) + +[cols="1,3"] +|=== +|Option |Description -|`--on-exists` -|Behavior when an output file already exists: `exception` (default), `replace` or `skip` +|`-Z` +|Recursively unpack all attachments. This is the Pipes-mode counterpart to standard-mode `-z`. -|`-T` or `--timeoutMs` -|Timeout for each parse in milliseconds +|`--unpack-format=<format>` +|Output format: `REGULAR` (default) or `FRICTIONLESS`. -|`-n` or `--numClients` -|Number of parallel forked processes +|`--unpack-mode=<mode>` +|Output mode: `ZIPPED` (default) or `DIRECTORY`. -|`-p` or `--pluginsDir` -|Plugins directory +|`--unpack-include-metadata` +|Include `metadata.json` in Frictionless output. |=== === Tika Pipes Examples @@ -255,3 +461,10 @@ Use a Tika config file alongside the Pipes options. 
Both `--config=foo.json` ---- java -jar tika-app.jar -i /path/to/input -o /path/to/output --config=tika-config.json ---- + +Recursively unpack attachments into the output directory: + +[source,bash] +---- +java -jar tika-app.jar -i /path/to/input -o /path/to/output -Z +---- diff --git a/docs/modules/ROOT/pages/using-tika/index.adoc b/docs/modules/ROOT/pages/using-tika/index.adoc index eaf944757b..a81c5ee6c5 100644 --- a/docs/modules/ROOT/pages/using-tika/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/index.adoc @@ -16,6 +16,7 @@ // = Getting Started with Apache Tika +:toc: Apache Tika can be used in several ways depending on your needs. Choose the approach that best fits your use case. diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc b/docs/modules/ROOT/pages/using-tika/server/index.adoc index 315a1f91a4..ffe142677a 100644 --- a/docs/modules/ROOT/pages/using-tika/server/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc @@ -24,6 +24,13 @@ This section covers running Apache Tika as a REST server via `tika-server`. Tika Server provides a RESTful HTTP interface for parsing documents and extracting content. It can be deployed as a standalone service or in a containerized environment. +In Tika 4.x, all parsing happens in forked child processes via the Tika Pipes +infrastructure — the request-handling process never loads parser libraries directly. +This provides process isolation (a parser crash or OOM cannot take down the server) +at the cost of requiring a Pipes configuration. See +xref:migration-to-4x/migrating-tika-server-4x.adoc[Migrating Tika Server to 4.x] +for the full breaking-change list when upgrading from 3.x. + == Basic Usage [source,bash] @@ -31,84 +38,189 @@ content. It can be deployed as a standalone service or in a containerized enviro java -jar tika-server-standard-X.Y.Z.jar ---- -The server starts on port 9998 by default. +The server starts on `localhost:9998` by default. 
+ +== Command Line Options + +[cols="1,3"] +|=== +|Option |Description + +|`-h <host>` or `--host <host>` +|Hostname to bind to. Default `localhost`. Use `*` to bind to all interfaces. + +|`-p <port>` or `--port <port>` +|Listen port. Default `9998`. + +|`-c <file>` or `--config <file>` +|Path to `tika-config.json`. See <<_configuration,Configuration>> below. + +|`-a <file>` or `--pluginsConfig <file>` +|Path to the Tika Pipes plugins configuration file. + +|`-i <id>` or `--id <id>` +|Server ID, surfaced in the `/status` endpoint and in logs. + +|`-?` or `--help` +|Print the usage message. +|=== + +NOTE: Other behavior — `enableUnsecureFeatures`, CORS, TLS, timeouts — is configured +in the JSON config file (see <<_configuration,Configuration>>), not via CLI flags. == Endpoints +For the canonical endpoint inventory, including the PUT vs POST split and the +multipart-config pattern introduced in 4.x, see the +xref:migration-to-4x/migrating-tika-server-4x.adoc#_new_tika_endpoint_structure[New `/tika` Endpoint Structure] +section of the migration guide. The most-used endpoints are summarized below. + === Content Extraction (`/tika`) -The `/tika` endpoint extracts content from a document as plain text. 
+Simple PUT — the entire request body is the document, no metadata: [source,bash] ---- +# Default: raw XHTML curl -T document.pdf http://localhost:9998/tika ----- -==== Markdown Output (`/tika/md`) +# Explicit handler +curl -T document.pdf http://localhost:9998/tika/text +curl -T document.docx http://localhost:9998/tika/html +curl -T document.docx http://localhost:9998/tika/md +curl -T document.pdf http://localhost:9998/tika/json +---- -The `/tika/md` endpoint extracts content as Markdown, preserving structural semantics -like headings, lists, tables, and emphasis: +POST with multipart for custom per-request configuration: [source,bash] ---- -curl -T document.docx http://localhost:9998/tika/md +curl -X POST http://localhost:9998/tika/json \ + -F "[email protected]" \ + -F "config={\"pdf-parser\":{\"ocrStrategy\":\"no_ocr\"}};type=application/json" ---- -==== Custom Handler Type +Valid handler paths under `/tika/`: `text`, `html`, `xml`, `md`, `json`. For +the JSON variant, you can also nest a handler — `/tika/json/text`, +`/tika/json/html`, etc. — to choose the content-field format inside the JSON +envelope; that nested handler accepts the full set (`text`, `html`, `xml`, +`md`, `markdown`, `body`, `ignore`). + +==== `X-Tika-Handler` header -Use the `X-Tika-Handler` header to control the output format. Valid values: `text` (default), -`html`, `xml`, `markdown`, `ignore`. +For the root `/tika` PUT endpoint you can also pick the handler with a header: [source,bash] ---- curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika ---- +Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`. + === Recursive Metadata (`/rmeta`) -The `/rmeta` endpoint returns metadata for the container document and all embedded documents -as a JSON array of metadata objects. +Returns metadata for the container document and all embedded documents as a JSON +array of metadata objects. 
The handler controls the content field of each entry: [source,bash] ---- -curl -T document.pdf http://localhost:9998/rmeta +curl -T document.pdf http://localhost:9998/rmeta # default: text +curl -T document.pdf http://localhost:9998/rmeta/text +curl -T document.pdf http://localhost:9998/rmeta/html +curl -T document.pdf http://localhost:9998/rmeta/xml +curl -T document.docx http://localhost:9998/rmeta/markdown # or /md +curl -T document.pdf http://localhost:9998/rmeta/ignore # metadata only ---- -Content handler can be specified in the URL path: +=== Metadata only (`/meta`) -* `/rmeta/text` - plain text content (default) -* `/rmeta/html` - HTML content -* `/rmeta/xml` - XHTML content -* `/rmeta/markdown` - Markdown content -* `/rmeta/ignore` - metadata only, no content +Returns container-document metadata only (no recursive embedded list, no content): [source,bash] ---- -curl -T document.docx http://localhost:9998/rmeta/markdown +curl -T document.pdf http://localhost:9998/meta +curl -T document.pdf http://localhost:9998/meta/Content-Type # single field ---- +=== Other endpoints + +* `/version` — server version +* `/status` — health/status (includes server ID) +* `/parsers` and `/parsers/details` — registered parsers +* `/detectors` — registered detectors +* `/mime-types` — known MIME types +* `/detect/stream` — type detection only (no parsing) +* `/language/stream`, `/language/string` — language detection +* `/translate/all/\{translator}/\{src}/\{dest}` — translation +* `/pipes`, `/async` — Pipes-based bulk processing + +== Configuration + +Server behavior beyond host/port is controlled by a JSON config file passed via +`-c`/`--config`. The `server` section in that file maps to fields on +`TikaServerConfig`; commonly-set fields include: + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`enableUnsecureFeatures` +|`false` +|Enable the `/config` family of endpoints (see <<_security_configuration,Security Configuration>>). 
+ +|`cors` +|`""` (off) +|`*` to allow any origin, or an explicit origin string. Empty disables CORS. + +|`returnStackTrace` +|`false` +|Include parser stack traces in error responses. Useful in dev, dangerous in production (leaks internals). + +|`digest` +|`""` (off) +|Compute a digest of the parsed bytes. Comma-separated algorithm names: `md5`, `sha1`, `sha256`, `sha384`, `sha512`. + +|`digestMarkLimit` +|`20971520` (20 MiB) +|Max bytes buffered for digest computation. + +|`logLevel` +|_inherited_ +|`debug` or `info` to override the runtime log level. + +|`idBase` +|random UUID +|Override the auto-generated server ID (the `-i` CLI flag is the same setting). +|=== + +For the full Pipes-related sections (`pipes`, `fetchers`, `emitters`, `parse-context`) +that tika-server 4.x requires, see +xref:migration-to-4x/migrating-tika-server-4x.adoc#_configuration_changes[Configuration Changes]. + == Topics -* xref:using-tika/server/tls.adoc[TLS/SSL Configuration] - Secure your server with TLS and mutual authentication +* xref:using-tika/server/tls.adoc[TLS/SSL Configuration] — Secure your server with TLS and mutual authentication +* xref:migration-to-4x/migrating-tika-server-4x.adoc[Migrating Tika Server to 4.x] — Breaking changes from 3.x == Security Configuration === Config Endpoint Protection -By default, the `/config` endpoints that expose server configuration are disabled for security -reasons. These endpoints can reveal sensitive information about your server configuration, -including parser settings and system properties (see https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271[CVE-2015-3271]). +By default, the `/config` family of endpoints that expose server configuration are +disabled. These endpoints can reveal sensitive information about your server, +including parser settings and system properties (see +https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2015-3271[CVE-2015-3271]). 
-The protected endpoints include: +Protected endpoints include: -* `/config` - Returns the server's full configuration -* `/config/parsers` - Returns configured parsers -* `/config/detectors` - Returns configured detectors -* `/config/mimeTypes` - Returns MIME type mappings +* `/tika/config` and `/tika/config/{text,html,xml,md,json}` — POST with multipart config +* `/rmeta/config` — POST with multipart config +* `/meta/config` — POST with multipart config === Enabling Config Endpoints -To enable these endpoints: +The setting is JSON-only — there is no CLI flag. Set `enableUnsecureFeatures` in +your config file's `server` section: [source,json] ---- @@ -119,24 +231,17 @@ To enable these endpoints: } ---- -WARNING: Only enable `enableUnsecureFeatures` if you have secured access to Tika Server through -network controls (firewalls, private subnets), a reverse proxy (nginx, Apache httpd), or -xref:using-tika/server/tls.adoc[2-way TLS authentication]. Exposing config endpoints to -untrusted networks can help attackers identify vulnerabilities and craft targeted attacks. - -=== Command Line Usage - -You can also enable unsecure features via command line: - -[source,bash] ----- -java -jar tika-server-standard-X.Y.Z.jar --enableUnsecureFeatures ----- +WARNING: Only enable `enableUnsecureFeatures` if you have secured access to Tika +Server through network controls (firewalls, private subnets), a reverse proxy +(nginx, Apache httpd), or +xref:using-tika/server/tls.adoc[2-way TLS authentication]. Exposing config endpoints +to untrusted networks can help attackers identify vulnerabilities and craft +targeted attacks. === Security Best Practices -1. **Keep config endpoints disabled** in production (default behavior) -2. **Use network controls** to restrict access to the Tika Server (firewall rules, private subnets) -3. **Consider TLS** for encrypted communication - see xref:using-tika/server/tls.adoc[TLS Configuration] -4. 
**Run with minimal privileges** - don't run Tika Server as root -5. **Monitor logs** for unusual access patterns +1. **Keep config endpoints disabled** in production (default behavior). +2. **Use network controls** to restrict access (firewall rules, private subnets). +3. **Consider TLS** for encrypted communication — see xref:using-tika/server/tls.adoc[TLS Configuration]. +4. **Run with minimal privileges** — don't run Tika Server as root. +5. **Monitor logs** for unusual access patterns. diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 980f2833c5..3746260c3c 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -38,11 +38,13 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; @@ -68,6 +70,7 @@ import org.apache.tika.Tika; import org.apache.tika.async.cli.TikaAsyncCLI; import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.config.TimeoutLimits; +import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.Detector; @@ -470,9 +473,15 @@ public class TikaCLI { } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) { pipeMode = false; displayParsers(false, false); + } else if (arg.equals("--list-parser-names")) { + pipeMode = false; + displayParserNames(); } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) { pipeMode = false; displayDetectors(); + } else if (arg.equals("--list-detector-names")) 
{ + pipeMode = false; + displayDetectorNames(); } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) { pipeMode = false; displayParsers(true, false); @@ -560,13 +569,28 @@ public class TikaCLI { maxEmbeddedCount = Integer.parseInt(arg.substring("--maxEmbeddedCount=".length())); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { prettyPrint = true; - } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) { - throw new IllegalArgumentException("As of Tika 2.0, the server option is no longer supported in tika-app.\n" + "See https://wiki.apache.org/tika/TikaJAXRS for usage."); } else if (arg.startsWith("-c")) { networkURI = new URI(arg.substring("-c".length())); } else if (arg.startsWith("--client=")) { networkURI = new URI(arg.substring("--client=".length())); } else { + // Any arg that reaches here is either "-" (stdin), an existing + // file, a URL, or an unknown/typo'd flag. The default fallthrough + // lets typos like "-input /path" hit new URL("-input") and + // surface as a confusing MalformedURLException. Catch dash-prefixed + // args that aren't the stdin marker or an existing file and emit + // an actionable error before that happens. + if (arg.startsWith("-") && !arg.equals("-") && !new File(arg).exists()) { + String hint = " Run with --help for the full option list."; + // Heuristic: single-dash + multi-letter (e.g. "-input") is + // usually a long-form-with-one-dash typo. Single-dash + one + // letter (e.g. "-s") or "--<unknown>" is just an unknown flag. + if (arg.length() > 2 && !arg.startsWith("--")) { + hint = " Long-form flags require two dashes (try '-" + + arg + "' instead of '" + arg + "')." + hint; + } + throw new IllegalArgumentException("Unknown option '" + arg + "'." 
+ hint); + } pipeMode = false; configure(); @@ -815,7 +839,8 @@ public class TikaCLI { out.println(" -l or --language Output only language"); out.println(" -d or --detect Detect document type"); out.println(" --digest=X Include digest X (md2, md5, sha1,"); - out.println(" sha256, sha384, sha512"); + out.println(" sha256, sha384, sha512,"); + out.println(" sha3_256, sha3_384, sha3_512)"); out.println(" -eX or --encoding=X Use output encoding X"); out.println(" -pX or --password=X Use document password X"); out.println(" -z or --extract Extract all attachements into current directory"); @@ -837,12 +862,17 @@ public class TikaCLI { out.println(); out.println(" --list-parsers"); out.println(" List the available document parsers"); + out.println(" --list-parser-names"); + out.println(" List parsers as tab-separated class-name<TAB>friendly-name"); + out.println(" (friendly names are the kebab-case keys used in JSON config)"); out.println(" --list-parser-details"); out.println(" List the available document parsers and their supported mime types"); out.println(" --list-parser-details-apt"); out.println(" List the available document parsers and their supported mime types in apt format."); out.println(" --list-detectors"); out.println(" List the available document detectors"); + out.println(" --list-detector-names"); + out.println(" List detectors as tab-separated class-name<TAB>friendly-name"); out.println(" --list-met-models"); out.println(" List the available metadata models, and their supported keys"); out.println(" --list-supported-types"); @@ -878,8 +908,8 @@ public class TikaCLI { out.println(" java -jar tika-app.jar <inputDirectory> <outputDirectory>"); out.println(); out.println("Tika Pipes Options:"); - out.println(" -i Input directory"); - out.println(" -o Output directory"); + out.println(" -i, --input=<dir> Input directory"); + out.println(" -o, --output=<dir> Output directory"); out.println(" -n, --numClients Number of forked processes"); out.println(" -X -Xmx 
in the forked processes"); out.println(" -T, --timeoutMs Timeout for each parse in milliseconds"); @@ -1056,6 +1086,65 @@ public class TikaCLI { } } + private void displayParserNames() throws TikaException, IOException, SAXException { + configure(); + Set<Class<?>> seen = new LinkedHashSet<>(); + collectParserClasses(parser, seen); + printNames(seen, "parsers"); + } + + private void displayDetectorNames() throws TikaException, IOException, SAXException { + configure(); + Set<Class<?>> seen = new LinkedHashSet<>(); + collectDetectorClasses(detector, seen); + printNames(seen, "detectors"); + } + + private void collectParserClasses(Parser p, Set<Class<?>> out) { + if (p instanceof ParserDecorator) { + p = ((ParserDecorator) p).getWrappedParser(); + } + if (p instanceof CompositeParser) { + for (Parser sub : ((CompositeParser) p).getParsers().values()) { + collectParserClasses(sub, out); + } + } else { + out.add(p.getClass()); + } + } + + private void collectDetectorClasses(Detector d, Set<Class<?>> out) { + if (d instanceof CompositeDetector) { + for (Detector sub : ((CompositeDetector) d).getDetectors()) { + collectDetectorClasses(sub, out); + } + } else { + out.add(d.getClass()); + } + } + + private void printNames(Set<Class<?>> classes, String indexFileName) throws TikaException { + // Look up friendly names via ComponentRegistry, which reads them from + // the META-INF/tika/<indexFileName>.idx files generated at compile + // time by the @TikaComponent annotation processor. (The annotation + // itself has CLASS retention, so reflection on the class can't see it + // at runtime — the .idx file is the authoritative source.) + ComponentRegistry registry; + try { + registry = new ComponentRegistry(indexFileName, Thread.currentThread().getContextClassLoader()); + } catch (TikaException e) { + throw e; + } + // Sort by class name for stable output; tab-separated so downstream + // scripts can `cut -f2` to get the JSON-config names. 
+ List<Class<?>> sorted = new ArrayList<>(classes); + sorted.sort(Comparator.comparing(Class::getName)); + for (Class<?> cls : sorted) { + String fname = registry.getFriendlyName(cls); + System.out.println(cls.getName() + "\t" + (fname != null ? fname : "(not registered)")); + } + } + private String indent(int indent) { return " ".substring(0, indent); } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml index fed6ba3af7..ccaf17bc91 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/pom.xml @@ -51,6 +51,12 @@ <type>test-jar</type> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java new file mode 100644 index 0000000000..d7c5e1e4aa --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/java/org/apache/tika/parser/ocr/tess4j/Tess4JConfigExamplesTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ocr.tess4j; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.parser.Parser; + +/** + * Validates the Tess4J configuration examples used in the docs. + * + * <p>The JSON files under {@code src/test/resources/config-examples/} are + * symlinked from {@code docs/modules/ROOT/examples/}, so any change that + * keeps these tests passing also keeps the published docs correct. + * + * <p>If you change a tess4j example JSON in the docs tree, this test will fail + * unless the JSON still loads against {@link Tess4JConfig} / {@link Tess4JParser}. + * That's the point: documentation drift is caught at build time. 
+ */ +public class Tess4JConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + + @TempDir + Path tempDir; + + private Parser loadAndValidate(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + String json = new String(is.readAllBytes(), StandardCharsets.UTF_8); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + Parser parser = loader.loadParsers(); + assertNotNull(parser, "Parser should not be null for: " + resourceName); + return parser; + } + } + + @Test + public void testTess4JBasicConfig() throws Exception { + loadAndValidate("tess4j-basic.json"); + } + + @Test + public void testTess4JFullConfig() throws Exception { + loadAndValidate("tess4j-full.json"); + } +} diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json new file mode 100644 index 0000000000..f75ee7a74f --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-basic.json @@ -0,0 +1,11 @@ +{ + "parsers": [ + { + "tess4j-parser": { + "dataPath": "/usr/share/tesseract-ocr/5/tessdata", + "nativeLibPath": "/usr/lib/x86_64-linux-gnu", + "poolSize": 4 + } + } + ] +} diff --git a/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json new file mode 100644 index 0000000000..8ad9c9d661 --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-parser-tess4j-module/src/test/resources/config-examples/tess4j-full.json @@ -0,0 +1,20 @@ +{ + 
"parsers": [ + { + "tess4j-parser": { + "dataPath": "/usr/share/tesseract-ocr/5/tessdata", + "nativeLibPath": "/usr/lib/x86_64-linux-gnu", + "language": "eng", + "pageSegMode": 1, + "ocrEngineMode": 3, + "poolSize": 4, + "timeoutSeconds": 120, + "dpi": 300, + "minFileSizeToOcr": 0, + "maxFileSizeToOcr": 2147483647, + "maxImagePixels": 100000000, + "skipOcr": false + } + } + ] +} diff --git a/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml index c2cdeac4ee..81f31363c5 100644 --- a/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-vlm/pom.xml @@ -76,6 +76,12 @@ <artifactId>junit-jupiter</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> diff --git a/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java new file mode 100644 index 0000000000..7b3da8a5f0 --- /dev/null +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/java/org/apache/tika/parser/vlm/VLMConfigExamplesTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.vlm; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.parser.Parser; + +/** + * Validates the VLM (OpenAI / Claude / Gemini) configuration examples used + * in the docs. + * + * <p>The JSON files under {@code src/test/resources/config-examples/} are + * symlinked from {@code docs/modules/ROOT/examples/}, so any change that + * keeps these tests passing also keeps the published docs correct. + * + * <p>The tests only validate that the JSON deserializes and the parser + * constructs — no HTTP call is made to any model endpoint, so they're safe + * to run without network access or API keys. 
+ */ +public class VLMConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + + @TempDir + Path tempDir; + + private Parser loadAndValidate(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + String json = new String(is.readAllBytes(), StandardCharsets.UTF_8); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + Parser parser = loader.loadParsers(); + assertNotNull(parser, "Parser should not be null for: " + resourceName); + return parser; + } + } + + @Test + public void testOpenAIVLMBasic() throws Exception { + loadAndValidate("openai-vlm-basic.json"); + } + + @Test + public void testOpenAIVLMFull() throws Exception { + loadAndValidate("openai-vlm-full.json"); + } + + @Test + public void testClaudeVLMBasic() throws Exception { + loadAndValidate("claude-vlm-basic.json"); + } + + @Test + public void testClaudeVLMFull() throws Exception { + loadAndValidate("claude-vlm-full.json"); + } + + @Test + public void testGeminiVLMBasic() throws Exception { + loadAndValidate("gemini-vlm-basic.json"); + } + + @Test + public void testGeminiVLMFull() throws Exception { + loadAndValidate("gemini-vlm-full.json"); + } + + @Test + public void testVLMForPdfParsing() throws Exception { + loadAndValidate("vlm-pdf-parsing.json"); + } +} diff --git a/docs/modules/ROOT/examples/claude-vlm-basic.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json similarity index 100% copy from docs/modules/ROOT/examples/claude-vlm-basic.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-basic.json diff --git a/docs/modules/ROOT/examples/claude-vlm-full.json 
b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json similarity index 84% copy from docs/modules/ROOT/examples/claude-vlm-full.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json index 9dc7ff67d3..682540197b 100644 --- a/docs/modules/ROOT/examples/claude-vlm-full.json +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/claude-vlm-full.json @@ -11,7 +11,9 @@ "inlineContent": true, "skipOcr": false, "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 + "maxFileSizeToOcr": 52428800, + "maxImagePixels": 100000000, + "allowRuntimePrompt": false } } ] diff --git a/docs/modules/ROOT/examples/gemini-vlm-basic.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json similarity index 100% copy from docs/modules/ROOT/examples/gemini-vlm-basic.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-basic.json diff --git a/docs/modules/ROOT/examples/gemini-vlm-full.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json similarity index 84% copy from docs/modules/ROOT/examples/gemini-vlm-full.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json index ab09b993f0..8773b52eac 100644 --- a/docs/modules/ROOT/examples/gemini-vlm-full.json +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/gemini-vlm-full.json @@ -11,7 +11,9 @@ "inlineContent": true, "skipOcr": false, "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 + "maxFileSizeToOcr": 52428800, + "maxImagePixels": 100000000, + "allowRuntimePrompt": false } } ] diff --git a/docs/modules/ROOT/examples/openai-vlm-basic.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json similarity index 100% copy from 
docs/modules/ROOT/examples/openai-vlm-basic.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-basic.json diff --git a/docs/modules/ROOT/examples/openai-vlm-full.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json similarity index 77% copy from docs/modules/ROOT/examples/openai-vlm-full.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json index 91baafc74e..9c80fb77b4 100644 --- a/docs/modules/ROOT/examples/openai-vlm-full.json +++ b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/openai-vlm-full.json @@ -3,6 +3,7 @@ { "openai-vlm-parser": { "baseUrl": "http://127.0.0.1:8000", + "completionsPath": "/v1/chat/completions", "model": "jinaai/jina-vlm", "prompt": "Extract all visible text from this image. Return the text in markdown format, preserving the original structure (headings, lists, tables, paragraphs). Do not describe the image. 
Only return the extracted text.", "maxTokens": 4096, @@ -11,7 +12,9 @@ "inlineContent": true, "skipOcr": false, "minFileSizeToOcr": 0, - "maxFileSizeToOcr": 52428800 + "maxFileSizeToOcr": 52428800, + "maxImagePixels": 100000000, + "allowRuntimePrompt": false } } ] diff --git a/docs/modules/ROOT/examples/vlm-pdf-parsing.json b/tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json similarity index 100% copy from docs/modules/ROOT/examples/vlm-pdf-parsing.json copy to tika-parsers/tika-parsers-ml/tika-vlm/src/test/resources/config-examples/vlm-pdf-parsing.json diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json index 4e3e75aeae..96282dbe14 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/tesseract-full.json @@ -23,6 +23,7 @@ "outputType": "TXT", "pageSeparator": "", "pageSegMode": "1", + "preloadLangs": false, "preserveInterwordSpacing": false, "resize": 200, "skipOcr": false, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index f006d43222..72f9f0f42f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -440,7 +440,7 @@ public class TesseractOCRConfig implements Serializable { /** * @param resize the resize to set. Valid range of values is 100-900. - * Default value is 900. + * Default value is 200 (see the {@code resize} field initializer). */ public void setResize(int resize) { for (int i = 1; i < 10; i++) { diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 845b5b1940..fb19447111 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -57,7 +57,11 @@ import org.apache.tika.utils.StringUtils; public class TikaAsyncCLI { private static final long TIMEOUT_MS = 600_000; - private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class); + // Use the user-facing "tika.pipes" name rather than the FQ class name so + // the internal TikaAsyncCLI detail doesn't leak into user-visible logs. + // tika-app users invoke the Pipes processor via -i/-o flags and shouldn't + // need to know about the underlying async CLI class. 
+ private static final Logger LOG = LoggerFactory.getLogger("tika.pipes"); private static Options getOptions() { Options options = new Options(); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java index 16f2f2b3e0..67afe6e5c6 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/main/java/org/apache/tika/pipes/iterator/azblob/AZBlobPipesIteratorFactory.java @@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "endpoint": "https://account.blob.core.windows.net", * "container": "my-container", * "prefix": "documents/", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java index c052ce03a6..bfd9dda160 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-csv/src/main/java/org/apache/tika/pipes/iterator/csv/CSVPipesIteratorFactory.java @@ -34,10 +34,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "csvPath": "/path/to/files.csv", * "fetchKeyColumn": "path", * "emitKeyColumn": "id", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java index 91df67eb42..6a6ab42033 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/iterator/fs/FileSystemPipesIteratorFactory.java @@ -33,10 +33,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "file-system-pipes-iterator": { * "basePath": "/path/to/files", * "countTotal": true, - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java index cba8c18336..748d0f467c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/main/java/org/apache/tika/pipes/iterator/gcs/GCSPipesIteratorFactory.java @@ -34,10 +34,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "projectId": "my-project", * "bucket": "my-bucket", * "prefix": "documents/", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java 
b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java index d805cf468c..a4571ba37b 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/iterator/jdbc/JDBCPipesIteratorFactory.java @@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "select": "select id, path from documents", * "fetchKeyColumn": "path", * "idColumn": "id", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java index b6f6c683c0..ee950154b9 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIteratorFactory.java @@ -32,10 +32,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "pipes-iterator": { * "json-pipes-iterator": { * "jsonPath": "/path/to/files.json", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java index 4698e20337..ff7336292e 100644 --- 
a/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-kafka/src/main/java/org/apache/tika/pipes/iterator/kafka/KafkaPipesIteratorFactory.java @@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "bootstrapServers": "localhost:9092", * "groupId": "my-group", * "autoOffsetReset": "earliest", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java index 05aa62132d..33b52cae6c 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/main/java/org/apache/tika/pipes/iterator/s3/S3PipesIteratorFactory.java @@ -36,10 +36,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "prefix": "documents/", * "credentialsProvider": "profile", * "profile": "default", - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java index f822e2c68f..c863afe1ac 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java +++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/main/java/org/apache/tika/pipes/iterator/solr/SolrPipesIteratorFactory.java @@ -35,10 +35,8 @@ import org.apache.tika.plugins.ExtensionConfig; * "solrUrls": ["http://localhost:8983/solr"], * "idField": "id", * "rows": 5000, - * "baseConfig": { - * "fetcherId": "my-fetcher", - * "emitterId": "my-emitter" - * } + * "fetcherId": "my-fetcher", + * "emitterId": "my-emitter" * } * } * </pre> diff --git a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java index 7287a4cbd1..40bd04b137 100644 --- a/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java +++ b/tika-plugins-core/src/main/java/org/apache/tika/plugins/ThreadSafeUnzipper.java @@ -71,6 +71,19 @@ public class ThreadSafeUnzipper { return; } + // Destination exists but has no completion marker. Possible causes: + // a previous extraction was killed mid-stream, the marker was deleted + // out from under us, or something other than our extractor put files + // there. Without this cleanup the subsequent Files.move() below will + // fail with DirectoryNotEmptyException on every run until a human + // manually removes the directory. Treat the half-extracted state as + // garbage and rebuild. + if (Files.exists(destination)) { + LOG.warn("destination {} exists without a completion marker; " + + "treating as stale partial extraction and removing", destination); + deleteRecursively(destination); + } + // Extract to a unique temp directory Path tempDir = destination.resolveSibling( destination.getFileName() + ".tmp." + UUID.randomUUID());
