This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 231ac690b6 TIKA-4625: Add AsciiDoc documentation module (#2536)
231ac690b6 is described below
commit 231ac690b6665598a330af4b98ff6710b6e787d3
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 23 10:57:09 2026 -0500
TIKA-4625: Add AsciiDoc documentation module (#2536)
* TIKA-4625: Create initial POC for AsciiDoc documentation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <[email protected]>
---
.github/workflows/main-jdk17-build.yml | 4 +
.../main-jdk17-windows-build-multi-locale.yml | 4 +
.github/workflows/main-jdk17-windows-build.yml | 4 +
.github/workflows/main-jdk21-build.yml | 2 +
.github/workflows/main-jdk25-build.yml | 2 +
docs/pom.xml | 96 ++++++++
docs/src/assembly/docs.xml | 37 +++
docs/src/main/asciidoc/advanced/index.adoc | 31 +++
docs/src/main/asciidoc/advanced/robustness.adoc | 137 +++++++++++
.../{ => src/main/asciidoc/advanced}/spooling.adoc | 0
docs/src/main/asciidoc/configuration/index.adoc | 40 +++
.../asciidoc/configuration/parsers/pdf-parser.adoc | 43 ++++
.../parsers/tesseract-ocr-parser.adoc | 67 +++++
docs/src/main/asciidoc/faq.adoc | 28 +++
docs/src/main/asciidoc/index.adoc | 72 ++++++
docs/src/main/asciidoc/maintainers/index.adoc | 29 +++
.../maintainers/release-guides/docker.adoc | 133 ++++++++++
.../asciidoc/maintainers/release-guides/grpc.adoc | 32 +++
.../asciidoc/maintainers/release-guides/helm.adoc | 138 +++++++++++
.../asciidoc/maintainers/release-guides/index.adoc | 32 +++
.../asciidoc/maintainers/release-guides/tika.adoc | 271 +++++++++++++++++++++
.../asciidoc/migration-to-4x/design-notes-4x.adoc | 127 ++++++++++
docs/src/main/asciidoc/migration-to-4x/index.adoc | 32 +++
.../migration-to-4x/metadata-changes-4x.adoc | 121 +++++++++
.../asciidoc/migration-to-4x/migrating-to-4x.adoc | 157 ++++++++++++
.../asciidoc/migration-to-4x/serialization-4x.adoc | 101 ++++++++
docs/src/main/asciidoc/pipes/index.adoc | 37 +++
docs/src/main/asciidoc/roadmap.adoc | 96 ++++++++
docs/src/main/asciidoc/security.adoc | 34 +++
docs/src/main/asciidoc/using-tika/cli/index.adoc | 39 +++
docs/src/main/asciidoc/using-tika/grpc/index.adoc | 32 +++
docs/src/main/asciidoc/using-tika/index.adoc | 65 +++++
.../using-tika/java-api/getting-started.adoc | 130 ++++++++++
.../main/asciidoc/using-tika/java-api/index.adoc | 38 +++
.../src/main/asciidoc/using-tika/server/index.adoc | 42 ++++
pom.xml | 3 +
.../org/apache/tika/config/ConfigExamplesTest.java | 97 ++++++++
.../config-examples/migration-full-example.json | 26 ++
.../config-examples/pdf-parser-basic.json | 10 +
.../resources/config-examples/pdf-parser-full.json | 53 ++++
.../resources/config-examples/tesseract-basic.json | 10 +
.../resources/config-examples/tesseract-full.json | 35 +++
.../apache/tika/pipes/fs/ConfigExamplesTest.java | 69 ++++++
.../config-examples/file-system-emitter.json | 13 +
.../config-examples/file-system-fetcher.json | 11 +
.../config-examples/file-system-pipeline.json | 27 ++
.../config/loader/TikaObjectMapperFactory.java | 4 +
.../tika/server/core/ConfigExamplesTest.java | 64 +++++
.../resources/config-examples/server-basic.json | 13 +
.../config-examples/server-with-parsers.json | 24 ++
50 files changed, 2712 insertions(+)
diff --git a/.github/workflows/main-jdk17-build.yml
b/.github/workflows/main-jdk17-build.yml
index 5dd7d568bc..73005b208d 100644
--- a/.github/workflows/main-jdk17-build.yml
+++ b/.github/workflows/main-jdk17-build.yml
@@ -20,8 +20,12 @@ name: main jdk17 build
on:
pull_request:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
push:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
jobs:
build:
diff --git a/.github/workflows/main-jdk17-windows-build-multi-locale.yml
b/.github/workflows/main-jdk17-windows-build-multi-locale.yml
index c545dfeb57..cd07d76ec3 100644
--- a/.github/workflows/main-jdk17-windows-build-multi-locale.yml
+++ b/.github/workflows/main-jdk17-windows-build-multi-locale.yml
@@ -20,8 +20,12 @@ name: main jdk17 windows build (multi-locale)
on:
pull_request:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
push:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
jobs:
build:
diff --git a/.github/workflows/main-jdk17-windows-build.yml
b/.github/workflows/main-jdk17-windows-build.yml
index 49f14377b6..26a288043f 100644
--- a/.github/workflows/main-jdk17-windows-build.yml
+++ b/.github/workflows/main-jdk17-windows-build.yml
@@ -20,8 +20,12 @@ name: main jdk17 windows build
on:
pull_request:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
push:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
jobs:
build:
diff --git a/.github/workflows/main-jdk21-build.yml
b/.github/workflows/main-jdk21-build.yml
index 3d21916680..02b970716e 100644
--- a/.github/workflows/main-jdk21-build.yml
+++ b/.github/workflows/main-jdk21-build.yml
@@ -20,6 +20,8 @@ name: main jdk21 build
on:
push:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
jobs:
build:
diff --git a/.github/workflows/main-jdk25-build.yml
b/.github/workflows/main-jdk25-build.yml
index 2109aea8f6..20dbcde48c 100644
--- a/.github/workflows/main-jdk25-build.yml
+++ b/.github/workflows/main-jdk25-build.yml
@@ -20,6 +20,8 @@ name: main jdk25 build
on:
push:
branches: [ main ]
+ paths-ignore:
+ - 'docs/**'
jobs:
build:
diff --git a/docs/pom.xml b/docs/pom.xml
new file mode 100644
index 0000000000..040f0cc66c
--- /dev/null
+++ b/docs/pom.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika</artifactId>
+ <version>4.0.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-docs</artifactId>
+ <packaging>pom</packaging>
+ <name>Apache Tika Documentation</name>
+
+ <properties>
+ <!-- Update this when a new stable version is released -->
+ <tika.stable.version>3.2.3</tika.stable.version>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.asciidoctor</groupId>
+ <artifactId>asciidoctor-maven-plugin</artifactId>
+ <version>3.2.0</version>
+ <executions>
+ <execution>
+ <id>output-html</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>process-asciidoc</goal>
+ </goals>
+ <configuration>
+ <doctype>article</doctype>
+ <attributes>
+
<source-highlighter>coderay</source-highlighter>
+ <toc />
+ <linkcss>false</linkcss>
+ <icons>font</icons>
+
<tika-stable-version>${tika.stable.version}</tika-stable-version>
+ <!-- Paths to config examples for include
directives -->
+
<parser-examples>${project.basedir}/../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples</parser-examples>
+
<server-examples>${project.basedir}/../tika-server/tika-server-core/src/test/resources/config-examples</server-examples>
+
<pipes-fs-examples>${project.basedir}/../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples</pipes-fs-examples>
+ </attributes>
+ </configuration>
+ </execution>
+ </executions>
+ <configuration>
+ <sourceDirectory>src/main/asciidoc</sourceDirectory>
+ <preserveDirectories>true</preserveDirectories>
+ </configuration>
+ </plugin>
+
+ <!-- Maven Assembly plugin to create tar.gz -->
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>make-docs-archive</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>src/assembly/docs.xml</descriptor>
+ </descriptors>
+
<finalName>${project.artifactId}-${project.version}</finalName>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
diff --git a/docs/src/assembly/docs.xml b/docs/src/assembly/docs.xml
new file mode 100644
index 0000000000..5a4b5c5746
--- /dev/null
+++ b/docs/src/assembly/docs.xml
@@ -0,0 +1,37 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3
+ https://maven.apache.org/xsd/assembly-1.1.3.xsd">
+ <id>docs</id>
+ <formats>
+ <format>tar.gz</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>${project.build.directory}/generated-docs</directory>
+ <outputDirectory>/</outputDirectory>
+ <includes>
+ <include>**/*</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/docs/src/main/asciidoc/advanced/index.adoc
b/docs/src/main/asciidoc/advanced/index.adoc
new file mode 100644
index 0000000000..f8350c86b8
--- /dev/null
+++ b/docs/src/main/asciidoc/advanced/index.adoc
@@ -0,0 +1,31 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Advanced Topics
+
+This section covers advanced usage and internals of Apache Tika.
+
+== Topics
+
+* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance
when parsing untrusted content
+* xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how
TikaInputStream handles buffering, caching, and spooling to disk
+
+// Add links to specific topics as they are created
+// * link:custom-parsers.html[Writing Custom Parsers]
+// * link:custom-detectors.html[Writing Custom Detectors]
+// * link:configuration.html[Advanced Configuration]
+// * link:performance.html[Performance Tuning]
diff --git a/docs/src/main/asciidoc/advanced/robustness.adoc
b/docs/src/main/asciidoc/advanced/robustness.adoc
new file mode 100644
index 0000000000..7547cf8eb2
--- /dev/null
+++ b/docs/src/main/asciidoc/advanced/robustness.adoc
@@ -0,0 +1,137 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= The Robustness of Apache Tika
+
+Running parsers on untrusted data carries inherent risks. In rare cases, Tika
can
+encounter infinite loops or allocate excessive amounts of memory
(resulting in OutOfMemoryErrors).
+When processing documents at scale, you must implement protective measures.
+
+IMPORTANT: Avoid running Tika in the same process as critical infrastructure
like
+indexers or search systems.
+
+== Process Isolation
+
+The primary defense against parser failures is process isolation. By running
parsers
+in separate processes, you protect your main application from:
+
+* OutOfMemoryErrors
+* Infinite loops
+* Native code crashes
+* Resource exhaustion
+
+=== Tika 4.x
+
+**In Tika 4.x, xref:../pipes/index.adoc[Tika Pipes] is the recommended
approach for
+robust document processing.** It provides:
+
+* Automatic process isolation
+* Fault tolerance and recovery
+* Scalable parallel processing
+* Unified architecture for all deployment scenarios
+
+Pipes can be used in multiple ways:
+
+* **Programmatically** - Via `PipesForkParser` in the `tika-pipes-fork-parser`
module
+ (see xref:../using-tika/java-api/getting-started.adoc[Java API Getting
Started])
+* **Via tika-server** - REST endpoints for pipes-based processing
+* **Via tika-grpc** - gRPC interface with pipes backend
+
+In Tika 4.x, the approach to robustness has been simplified. Previous versions
offered
+four different forking mechanisms:
+
+[cols="1,2,1"]
+|===
+|Mechanism |Description |Status in 4.x
+
+|ForkParser
+|Spawned child processes for individual parse operations
+|Deprecated
+
+|tika-batch
+|Desktop/VM-scale batch processing
+|Deprecated
+
+|tika-server (forked mode)
+|REST server with forked parsing processes
+|Available, but Pipes recommended
+
+|tika-pipes
+|Scalable, fault-tolerant pipeline processing
+|*Recommended approach*
+|===
+
+=== Tika 3.x and Earlier
+
+If you are using Tika 3.x or earlier, you have several options for process
isolation:
+
+ForkParser::
+Spawns child processes to protect against out-of-memory errors and infinite
loops.
+Suitable for programmatic use in Java applications.
+
+tika-batch::
+For desktop/VM-scale processing (not cloud-scale):
++
+[source,bash]
+----
+java -jar tika-app.jar -i <input_dir> -o <output_dir>
+----
+
+tika-server::
+In version 2.x and later, parsing defaults to forked processes. Clients must
handle
+tika-server restarts gracefully.
+
+tika-pipes::
+Available through programmatic use, tika-app `-a` option, or tika-server's
`/async`
+and `/pipes` endpoints.
+
+== Security Testing and Prevention
+
+The Apache Tika team implements several measures to identify and prevent
vulnerabilities:
+
+* **Regression testing** against ~2 million files from Common Crawl before
releases
+* **Code reviews** of dependencies to identify vulnerability patterns
+* **Fuzzing modules** for automated vulnerability discovery
+* **Collaboration** with security researchers
+* **Maintained forks** of parsers with critical fixes (released independently
when needed)
+* **Public documentation** of vulnerabilities on the
xref:../security.adoc[security page]
+
+== MockParser for Testing
+
+Tika provides a `MockParser` tool for testing your system's robustness. You can
+configure it to simulate various failure modes:
+
+* Infinite loops
+* OutOfMemoryErrors
+* Excessive runtime
+* Large output generation
+
+This allows you to verify that your integration handles parser failures
gracefully.
+
+== Recommendations
+
+1. **Use Tika Pipes** (4.x) for production workloads with untrusted content
+2. **Isolate Tika** from critical systems - never run in the same JVM as your
indexer
+3. **Set timeouts** for all parsing operations
+4. **Monitor memory usage** and set appropriate limits
+5. **Plan for failures** - your system should handle parser crashes gracefully
+6. **Stay updated** - apply security updates promptly
+
+== Further Reading
+
+* xref:../pipes/index.adoc[Tika Pipes] - Recommended approach for robust
processing
+* xref:../security.adoc[Security] - Known vulnerabilities and security model
diff --git a/docs/spooling.adoc b/docs/src/main/asciidoc/advanced/spooling.adoc
similarity index 100%
rename from docs/spooling.adoc
rename to docs/src/main/asciidoc/advanced/spooling.adoc
diff --git a/docs/src/main/asciidoc/configuration/index.adoc
b/docs/src/main/asciidoc/configuration/index.adoc
new file mode 100644
index 0000000000..215e1f4c71
--- /dev/null
+++ b/docs/src/main/asciidoc/configuration/index.adoc
@@ -0,0 +1,40 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Configuration
+
+This section covers configuring Apache Tika.
+
+== Overview
+
+Tika 4.x uses JSON configuration files. Configuration controls parsers,
detectors,
+content handlers, and other components.
+
+NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the
+xref:../migration-to-4x/index.adoc[Migration Guide] for details on converting
to JSON.
+
+== Topics
+
+=== Parser Configuration
+
+* xref:parsers/pdf-parser.adoc[PDFParser] - PDF parsing options
+* xref:parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - OCR options for
image-based text extraction
+
+// Add links to specific topics as they are created
+// * xref:json-config.adoc[JSON Configuration Reference]
+// * xref:detectors.adoc[Configuring Detectors]
+// * xref:mime-types.adoc[MIME Type Configuration]
diff --git a/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc
b/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc
new file mode 100644
index 0000000000..cee58a3b70
--- /dev/null
+++ b/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc
@@ -0,0 +1,43 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= PDFParser Configuration
+
+This page documents the configuration options for `PDFParser` in Tika 4.x.
+
+== Basic Configuration
+
+[source,json]
+----
+include::{parser-examples}/pdf-parser-basic.json[]
+----
+icon:github[]
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json[View
source on GitHub]
+
+== Full Configuration
+
+The following example shows all available configuration options with their
default values.
+Comments indicate the available options for enum fields.
+
+[source,json]
+----
+include::{parser-examples}/pdf-parser-full.json[]
+----
+icon:github[]
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json[View
source on GitHub]
+
+== Changes from 3.x
+
+See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for
general migration guidance.
diff --git
a/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc
b/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc
new file mode 100644
index 0000000000..5b1b2b67e6
--- /dev/null
+++ b/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc
@@ -0,0 +1,67 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= TesseractOCRParser Configuration
+
+This page documents the configuration options for `TesseractOCRParser` in Tika
4.x.
+
+== Basic Configuration
+
+[source,json]
+----
+include::{parser-examples}/tesseract-basic.json[]
+----
+icon:github[]
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json[View
source on GitHub]
+
+== Full Configuration
+
+The following example shows all available configuration options with their
default values.
+Comments indicate the available options for enum fields.
+
+[source,json]
+----
+include::{parser-examples}/tesseract-full.json[]
+----
+icon:github[]
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json[View
source on GitHub]
+
+== Changes from 3.x
+
+In Tika 3.x, the `otherTesseractSettings` parameter was a list of space-delimited
key-value strings:
+
+[source,xml]
+----
+<!-- 3.x XML format -->
+<param name="otherTesseractSettings" type="list">
+ <string>textord_initialx_ile 0.75</string>
+ <string>textord_noise_hfract 0.15625</string>
+</param>
+----
+
+In Tika 4.x, this is replaced with `otherTesseractConfig` as a proper map:
+
+[source,json]
+----
+// 4.x JSON format
+"otherTesseractConfig": {
+ "textord_initialx_ile": "0.75",
+ "textord_noise_hfract": "0.15625"
+}
+----
+
+The automatic converter handles this transformation.
+
+See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for
general migration guidance.
diff --git a/docs/src/main/asciidoc/faq.adoc b/docs/src/main/asciidoc/faq.adoc
new file mode 100644
index 0000000000..168c9a9547
--- /dev/null
+++ b/docs/src/main/asciidoc/faq.adoc
@@ -0,0 +1,28 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= FAQ and Troubleshooting
+
+This page covers frequently asked questions and common issues when using
Apache Tika.
+
+== Frequently Asked Questions
+
+// TODO: Add FAQs
+
+== Troubleshooting
+
+// TODO: Add common issues and solutions
diff --git a/docs/src/main/asciidoc/index.adoc
b/docs/src/main/asciidoc/index.adoc
new file mode 100644
index 0000000000..5edc9e54ee
--- /dev/null
+++ b/docs/src/main/asciidoc/index.adoc
@@ -0,0 +1,72 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Apache Tika Documentation
+
+WARNING: This reference guide was generated with the assistance of AI and
requires
+human review before it can be fully trusted. This documentation serves as an
example
+and a starting point, but more work remains. Contributions and corrections are
welcome.
+
+== Overview
+
+Apache Tika is a content detection and extraction framework written in Java.
+
+== Using Tika
+
+* xref:using-tika/index.adoc[Getting Started] - Choose your integration method
+* xref:pipes/index.adoc[Pipes] - Scalable, fault-tolerant document processing
+
+== Configuration
+
+* xref:configuration/index.adoc[Configuration] - JSON configuration options
+
+== Migration
+
+* xref:migration-to-4x/index.adoc[Migrating to 4.x] - Guides and background
for upgrading to Tika 4.x
+
+== Advanced
+
+* xref:advanced/index.adoc[Advanced Topics] - Robustness, spooling,
and other internals
+
+== FAQ
+
+* xref:faq.adoc[FAQ and Troubleshooting] - Common questions and issues
+
+== Security
+
+* xref:security.adoc[Security] - Security considerations and reporting
vulnerabilities
+
+== Roadmap
+
+* xref:roadmap.adoc[Roadmap] - Planned features and improvements for upcoming
releases
+
+== For Maintainers
+
+* xref:maintainers/index.adoc[Maintainer Documentation] - Release guides and
project maintenance
+
+== Links
+
+* https://tika.apache.org/[Apache Tika Website] - Official project website
+* https://tika.apache.org/{tika-stable-version}/formats.html[Supported
Formats] - File formats Tika can parse
+* https://tika.apache.org/{tika-stable-version}/api/[API Documentation] -
Javadoc
+* https://issues.apache.org/jira/projects/TIKA[JIRA] - Issue tracker
+*
https://repository.apache.org/content/repositories/snapshots/org/apache/tika/[Maven
Snapshots] - SNAPSHOT builds in Apache's Maven repository
+* https://ci-builds.apache.org/job/Tika/[CI Builds] - Continuous integration
builds
+* https://cwiki.apache.org/confluence/display/TIKA/[Confluence Wiki] - Legacy
wiki documentation
++
+NOTE: As of Tika 4.x, we are migrating content from Confluence to these
AsciiDoc pages.
+The Confluence wiki will eventually be retired.
diff --git a/docs/src/main/asciidoc/maintainers/index.adoc
b/docs/src/main/asciidoc/maintainers/index.adoc
new file mode 100644
index 0000000000..bab767b707
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/index.adoc
@@ -0,0 +1,29 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= For Maintainers
+
+This section contains documentation for Apache Tika project maintainers and
committers.
+
+== Topics
+
+* xref:release-guides/index.adoc[Release Guides] - How to release Apache Tika
+
+// Add links to specific topics as they are created
+// * link:voting.html[Voting Procedures]
+// * link:ci.html[Continuous Integration]
+// * link:website.html[Website Maintenance]
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc
new file mode 100644
index 0000000000..a8f2f8cbc7
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc
@@ -0,0 +1,133 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Releasing Tika Docker Images
+
+This guide covers the process for releasing Apache Tika Docker images.
+
+== Prerequisites
+
+=== DockerHub Access
+
+You need permissions on the `apache/tika` repository on DockerHub. To obtain
access,
+create an INFRA JIRA ticket with the "Docker" label.
+
+=== Repository Access
+
+Clone the tika-docker repository:
+
+[source,bash]
+----
+git clone https://github.com/apache/tika-docker
+cd tika-docker
+----
+
+== Image Types
+
+The tika-docker repository produces two types of images:
+
+Minimal::
+Apache Tika with base dependencies (Java only)
+
+Full::
+Apache Tika plus Tesseract OCR and GDAL
+
+== Helper Tools
+
+docker-tool.sh::
+Automates building, testing, and publishing Docker images
+
+republish-images.sh::
+Legacy script for batch republishing images
+
+NOTE: The repository also contains Docker Compose files for advanced scenarios
+(Vision, Grobid, OCR, NER), but these are not used for official releases.
+
+== Release Process
+
+=== Step 1: Update README
+
+Update the "Available Tags" section in `README.md` to include the new version.
+
+=== Step 2: Update Version
+
+Increment the TAG version in the `.env` file.
+
+=== Step 3: Update Changelog
+
+Update `CHANGES.md` with release information and date.
+
+=== Step 4: Test Locally
+
+Test the release locally before publishing:
+
+[source,bash]
+----
+./docker-tool.sh build <docker-version> <tika-version>
+./docker-tool.sh test <docker-version>
+----
+
+=== Step 5: Commit Changes
+
+Commit all changes:
+
+[source,bash]
+----
+git add README.md .env CHANGES.md
+git commit -m "Prepare for Docker release <docker-version>"
+git push
+----
+
+=== Step 6: Build and Publish
+
+Build and publish the images using the docker-tool script.
+
+Example for version 3.1.0.0 based on Tika 3.1.0:
+
+[source,bash]
+----
+# Build the images
+./docker-tool.sh build 3.1.0.0 3.1.0
+
+# Test the images
+./docker-tool.sh test 3.1.0.0
+
+# Publish to DockerHub
+./docker-tool.sh publish 3.1.0.0 3.1.0
+----
+
+NOTE: Multi-architecture building takes time. The publish step automatically
+updates the `-latest` tag on DockerHub.
+
+=== Step 7: Tag the Release
+
+Create and push a git tag for the release:
+
+[source,bash]
+----
+git tag -a 3.1.0.0 -m "New release for 3.1.0.0"
+git push --tags
+----
+
+== Post-Release
+
+After publishing the Docker images:
+
+* Verify the images are available on DockerHub at
https://hub.docker.com/r/apache/tika
+* Test pulling and running the new images
+* Update the main Tika website if needed
+* Proceed to release the link:helm.html[Helm charts] if applicable
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
new file mode 100644
index 0000000000..0576d23bb8
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc
@@ -0,0 +1,32 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Releasing Tika gRPC
+
+This guide covers the process for releasing Apache Tika gRPC components.
+
+== Prerequisites
+
+// TODO: Add prerequisites
+
+== Release Process
+
+// TODO: Add release steps
+
+== Post-Release
+
+// TODO: Add post-release steps
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
new file mode 100644
index 0000000000..aa80120c6f
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc
@@ -0,0 +1,138 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Releasing Tika Helm Charts
+
+This guide covers the process for releasing Apache Tika Helm charts.
+
+== Prerequisites
+
+=== Apache JFrog Artifactory Access
+
+You need permissions to release the Apache Tika Helm chart to the Apache Infra
+Artifactory instance. Access is controlled by ASF Infra and can be requested
+via a JIRA ticket with the "Artifactory" label.
+
+=== Repository Access
+
+Clone the tika-helm repository:
+
+[source,bash]
+----
+git clone https://github.com/apache/tika-helm
+cd tika-helm
+----
+
+Apache Tika committers should have existing access to this repository.
+
+=== Install Helm and Plugins
+
+Install Helm and the Artifactory plugin:
+
+[source,bash]
+----
+# Install Helm (macOS)
+brew install helm
+
+# Install the Artifactory push plugin
+helm plugin install https://github.com/belitre/helm-push-artifactory-plugin
--version 1.0.2
+----
+
+== Docker Image Types
+
+The Helm chart deploys one of two upstream Docker image types:
+
+Minimal::
+Contains Apache Tika and base dependencies (Java only)
+
+Full::
+Includes Tika, dependencies, Tesseract OCR, GDAL, etc.
+
+The Helm chart uses the *Full* image by default, though either can be specified
+during Kubernetes deployment.
+
+== Versioning
+
+The tika-helm charts follow the https://semver.org/spec/v2.0.0.html[Semantic
Versioning 2.0.0]
+specification, regardless of upstream container image versioning.
+
+== Release Process
+
+=== Step 1: Update Chart Configuration
+
+For each new upstream tika-docker FULL release, update the following files:
+
+Chart.yaml::
+* Line 22: Update `version` (chart version)
+* Line 23: Update `appVersion` (must match upstream tika-docker FULL release
tag)
+
+values.yaml::
+* Line 26: Update the default image tag
+
+=== Step 2: Commit and Tag
+
+Commit the changes and create a release tag:
+
+[source,bash]
+----
+export RELEASE_VERSION=v3.2.2
+
+git add -A
+git commit -m "Release tika-helm $RELEASE_VERSION"
+git push origin main
+
+git tag -a $RELEASE_VERSION -m "Release tika-helm $RELEASE_VERSION"
+git push --tags
+----
+
+=== Step 3: Create GitHub Release
+
+. Navigate to the pushed tag on GitHub
+. Click the three-dot menu
+. Select "Create release"
+. Add release notes and publish
+
+=== Step 4: Publish to Apache JFrog Artifactory
+
+Add the Tika Helm repository and push the chart:
+
+[source,bash]
+----
+# Add the Tika Helm repository
+helm repo add tika https://apache.jfrog.io/artifactory/tika
+
+# Set your credentials
+export HELM_REPO_USERNAME="your-apache-id"
+export HELM_REPO_PASSWORD="your-password"
+
+# Push the chart to Artifactory
+helm push-artifactory . https://apache.jfrog.io/artifactory/tika
+----
+
+== Post-Release
+
+After publishing the Helm chart:
+
+* Verify the chart is available at https://apache.jfrog.io/artifactory/tika
+* Test installing the chart in a Kubernetes cluster
+* Update any documentation referencing the chart version
+
+== Questions
+
+For questions about the Helm release process, contact:
+
+* [email protected] mailing list
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
new file mode 100644
index 0000000000..1f618e9892
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/release-guides/index.adoc
@@ -0,0 +1,32 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Release Guides
+
+This section contains guides for releasing Apache Tika components.
+
+== Overview
+
+Apache Tika follows the standard Apache release process. This section provides
+step-by-step guides for releasing the various Tika components.
+
+== Topics
+
+* xref:tika.adoc[Releasing Apache Tika] - Main Tika project release process
+* xref:docker.adoc[Releasing Tika Docker Images] - Docker image release process
+* xref:helm.adoc[Releasing Tika Helm Charts] - Helm chart release process
+* xref:grpc.adoc[Releasing Tika gRPC] - gRPC component release process
diff --git a/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
b/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
new file mode 100644
index 0000000000..a967c80421
--- /dev/null
+++ b/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc
@@ -0,0 +1,271 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Releasing Apache Tika
+
+This guide covers the process for releasing the main Apache Tika project.
+
+== Prerequisites
+
+Before starting the release process, ensure you have:
+
+* Commit access to the Apache Tika repository
+* A valid GPG key published to a public keyserver
+* Maven credentials configured in `~/.m2/settings.xml`
+* Access to Apache's Nexus repository manager
+
+== Pre-Release Checks
+
+Before starting the release, run vulnerability and dependency audits and the full regression tests:
+
+[source,bash]
+----
+# Identify vulnerable dependencies
+mvn ossindex:audit -Dossindex.fail=true
+
+# Check for outdated plugins
+mvn versions:display-plugin-updates
+
+# Check for outdated dependencies
+mvn versions:display-dependency-updates
+
+# Run full regression tests
+mvn -Prelease-profile clean verify
+----
+
+== Release Process
+
+=== Step 1: Clone the Repository
+
+Clone the repository if you haven't already:
+
+[source,bash]
+----
+git clone https://github.com/apache/tika.git
+cd tika
+----
+
+=== Step 2: Update Documentation
+
+Update `CHANGES.txt` with the release date:
+
+[source]
+----
+Release X.Y.Z - MM/dd/yyyy
+----
+
+Add any changelog entries as needed.
+
+=== Step 3: JIRA Management
+
+. Create versions X.Y.Z, X.(Y+1), and X.(Y+2) in JIRA if they don't exist
+. Reassign any unresolved X.Y.Z issues to X.(Y+1) via bulk change
+
+=== Step 4: Verify License Headers
+
+Run the Apache RAT plugin to verify all files have proper license headers:
+
+[source,bash]
+----
+mvn apache-rat:check
+----
+
+=== Step 5: Commit Changes
+
+Commit the CHANGES.txt updates:
+
+[source,bash]
+----
+git add CHANGES.txt
+git commit -m "Prepare for X.Y.Z release"
+git push
+----
+
+=== Step 6: Set Maven Memory
+
+Configure Maven memory settings:
+
+[source,bash]
+----
+export MAVEN_OPTS="-Xms128m -Xmx256m"
+----
+
+=== Step 7: Prepare the Release
+
+Execute the Maven release prepare goal:
+
+[source,bash]
+----
+mvn release:prepare
+----
+
+This will prompt you to confirm:
+
+* The release version (X.Y.Z)
+* The SCM tag name
+* The next development version
+
+=== Step 8: Perform the Release
+
+Execute the Maven release perform goal:
+
+[source,bash]
+----
+mvn release:perform
+----
+
+Ensure you have valid Maven credentials in `~/.m2/settings.xml`:
+
+[source,xml]
+----
+<servers>
+ <server>
+ <id>apache.releases.https</id>
+ <username>your-apache-id</username>
+ <password>your-password</password>
+ </server>
+</servers>
+----
+
+=== Step 9: Verify Staging Repository
+
+. Access Apache's Nexus at https://repository.apache.org
+. Log in with your Apache credentials
+. Navigate to "Staging Repositories"
+. Find the org.apache.tika staging repository
+. Verify it contains all expected artifacts
+. Click "Close" with an appropriate message
+
+=== Step 10: Upload Distribution Artifacts
+
+Upload artifacts to `dist.apache.org`:
+
+[source,bash]
+----
+svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev
+cd tika-dist-dev
+----
+
+Upload the following files, with their signatures (.asc) and checksums
(.sha512), to an `X.Y.Z` subdirectory:
+
+* `tika-X.Y.Z-src.zip`
+* `tika-app-X.Y.Z.jar`
+* `tika-server-standard-X.Y.Z.jar`
+
+Also:
+
+* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt`
+* Ensure the `KEYS` file contains the public key of every release signer, including yours
+
+=== Step 11: Call the Vote
+
+Send a vote request to the [email protected] mailing list:
+
+[source]
+----
+Subject: [VOTE] Release Apache Tika X.Y.Z
+
+Hi all,
+
+I have created a candidate build for Apache Tika X.Y.Z.
+
+The release candidate artifacts can be found at:
+https://dist.apache.org/repos/dist/dev/tika/
+
+The staging repository is:
+https://repository.apache.org/content/repositories/orgapachetika-XXXX
+
+The Git tag is:
+https://github.com/apache/tika/tree/X.Y.Z
+
+Please vote:
+[ ] +1 Release this package
+[ ] +0 No opinion
+[ ] -1 Do not release (please provide reason)
+
+This vote will remain open for at least 72 hours.
+----
+
+=== Step 12: Release the Artifacts
+
+Upon a successful vote (at least three binding +1 votes from PMC members, with more +1 than -1 votes):
+
+. Release the Nexus staging repository (click "Release" button)
+. Move artifacts from dev to release distribution:
+
+[source,bash]
+----
+svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \
+ https://dist.apache.org/repos/dist/release/tika/X.Y.Z \
+ -m "Release Apache Tika X.Y.Z"
+----
+
+== Post-Release
+
+=== Update Unreleased Modules
+
+Update any modules that weren't part of the release to the next SNAPSHOT
version.
+
+=== Update Website
+
+Refresh the website documentation to reflect the new release:
+
+* Update download links
+* Update version numbers in documentation
+* Add release notes
+
+=== Release Docker Images and Helm Charts
+
+Follow the separate guides for releasing:
+
+* link:docker.html[Docker images]
+* link:helm.html[Helm charts]
+
+=== Send Announcements
+
+Send release announcements to:
+
+* [email protected]
+* [email protected]
+* [email protected]
+
+[source]
+----
+Subject: [ANNOUNCE] Apache Tika X.Y.Z Released
+
+The Apache Tika team is pleased to announce the release of Apache Tika X.Y.Z.
+
+Apache Tika is a toolkit for detecting and extracting metadata and text
+from various types of files.
+
+This release includes:
+[List major changes/features]
+
+For a complete list of changes, see:
+https://tika.apache.org/X.Y.Z/changes.html
+
+Download:
+https://tika.apache.org/download.html
+
+Thanks to everyone who contributed to this release!
+
+The Apache Tika Team
+----
+
+=== Register the Release
+
+Register the release at https://reporter.apache.org
diff --git a/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
new file mode 100644
index 0000000000..006c4775f9
--- /dev/null
+++ b/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc
@@ -0,0 +1,127 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Design Notes for Tika 4.x
+
+This document captures the design decisions and architectural changes in
Apache Tika 4.x.
+
+== Metadata Keys
+
+The design addresses security concerns by implementing namespaced metadata
keys. This prevents
+user-controlled data from potentially overwriting existing metadata values in
the Metadata object.
+
+See link:migrating-to-4x.html[Migrating to Tika 4.x] for details on specific
+metadata key changes.
+
+== Fat Jars and Maven Shade Strategy
+
+Tika 4.x moves away from fat jar/shaded artifacts. Both `tika-app` and
`tika-server` now use
+separate `lib` and `plugins` directories alongside the jar file, enabling
standard `java -jar`
+execution.
+
+== Plugins and PF4J Framework
+
+=== Plugin Packaging
+
+PF4J plugins are packaged exclusively as zips (not jars) to align with the
move away from fat
+jars. Custom code addresses race conditions during the unzipping process
across threads and
+processes.
+
+=== Classloader Management
+
+The team disabled PF4J's default classpath loading to avoid complexity in unit
tests. A
+configured plugins directory is now required.
+
+This strict boundary prevents issues when components are loaded separately.
For example, JSON
+strings replace `JsonNode` objects to avoid problems with independent Jackson
loading in plugins.
+
+IMPORTANT: We tried to have as few Tika dependencies in the plugins as
possible.
+
+== Serialization Architecture
+
+=== Design Principles
+
+* Maximize Jackson usage while minimizing custom serialization code
+* Exclude Jackson from `tika-core` and `tika-parsers-standard-modules`
dependencies
+* Enable runtime configuration updates via Jackson's `readerForUpdating`
+
+=== Security Model
+
+Configuration files at initialization are treated as trusted sources. Runtime
+serialization/deserialization uses an allowlist of permitted packages via
+`PolymorphicObjectMapperFactory`.
+
+Custom components can add patterns to
`META-INF/tika-serialization-allowlist.txt`.
+
+=== Implementation Challenges
+
+* Converted code to true Java beans with matching getters/setters
+* Used `ObjectMapper.DefaultTyping.OBJECT_AND_NON_CONCRETE` for polymorphic
typing
+* Replaced generic collections (`List`, `Set`) with concrete types
(`ArrayList`, `HashSet`)
+* Converted `Path` fields to `String` due to Jackson constraints
+* Avoided Java records to enable `readerForUpdating` functionality
+
+== Annotations System
+
+The `@TikaComponent` annotation handles:
+
+* Automatic service file generation at build time
+* Creation of `META-INF/tika/*.idx` mapping files
+* Kebab-case conversion of class names to friendly identifiers (e.g.,
`PDFParser` → `pdf-parser`)
+* Manual name overrides via `name` attribute
+* Optional `spi=false` setting for non-service-file registration
+
+== Migration Strategy
+
+The plan is to stabilize 4.x structures before backporting capabilities to 3.x
and deprecating
+`TikaConfig` and `tika-config.xml`.
+
+A converter tool for transforming `tika-config.xml` to `tika-config.json` is
planned, with
+support focused on components in `tika-parsers-standard-modules`.
+
+== Development Tips
+
+=== Common Issues
+
+* Plugin directories and `@TikaComponent` annotations becoming out of sync
across modules
+* IntelliJ conflicts with command-line builds
+* Checkstyle running before Spotless, causing preventable failures
+
+=== Recommended Build Commands
+
+For faster builds during development:
+
+[source,bash]
+----
+mvn clean install -am -pl :tika-app -Pfast
+----
+
+To apply formatting and build:
+
+[source,bash]
+----
+mvn clean spotless:apply install
+----
+
+== Outstanding Tasks
+
+* Implement flexible component loading without `@TikaComponent` requirements
+* Enable friendly name usage throughout the codebase
+* Resolve gRPC issues
+* Fix mutool renderer byte-passing in open containers
+* Simplify and strengthen serialization code
+* Consider relocating `TikaConfig` and `ForkParser` to legacy module
diff --git a/docs/src/main/asciidoc/migration-to-4x/index.adoc
b/docs/src/main/asciidoc/migration-to-4x/index.adoc
new file mode 100644
index 0000000000..c8d5be9f5d
--- /dev/null
+++ b/docs/src/main/asciidoc/migration-to-4x/index.adoc
@@ -0,0 +1,32 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Migrating to Tika 4.x
+
+This section provides guides and background documentation for migrating to
Apache Tika 4.x.
+
+See the xref:../roadmap.adoc[Roadmap] for version timelines and support
schedules.
+
+== Migration Guides
+
+* xref:migrating-to-4x.adoc[Migration Guide] - Step-by-step guide for
upgrading from Tika 3.x to 4.x
+* xref:metadata-changes-4x.adoc[Metadata Changes] - Detailed metadata key
changes and migration examples
+
+== Background Documentation
+
+* xref:design-notes-4x.adoc[Design Notes] - Architectural decisions and design
rationale
+* xref:serialization-4x.adoc[Serialization] - JSON serialization design and
implementation details
diff --git a/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
new file mode 100644
index 0000000000..e129d33008
--- /dev/null
+++ b/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc
@@ -0,0 +1,121 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Metadata Changes in Tika 4.x
+
+This document details the metadata key changes in Apache Tika 4.x.
+
+== Overview
+
+Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and
improve
+namespace clarity. This is a security-focused change that prevents
user-controlled data
+from potentially overwriting existing metadata values in the Metadata object.
+
+== Metadata Key Changes
+
+[cols="2,2,3"]
+|===
+|Category |Change |Details
+
+|HTML custom metadata
+|Prefixed with `html:`
+|Custom metadata from HTML documents now uses the `html:` prefix
+
+|MAPI metadata
+|Prefix changed to `mapi:`
+|Microsoft MAPI properties now use the `mapi:` prefix
+
+|Resource name
+|Renamed
+|`resourceName` changed to `X-TIKA:resourceName`
+
+|Unrecognized image metadata
+|Prefixed with `img:`
+|Unrecognized image metadata keys now use the `img:` prefix
+
+|Office metadata
+|Prefix changed
+|Changed from `meta` prefix to `office` prefix
+|===
+
+== Migration Steps
+
+When upgrading to Tika 4.x, you will need to update any code that references
metadata keys
+directly:
+
+=== HTML Metadata
+
+[source,java]
+----
+// Before (3.x)
+String value = metadata.get("custom-key");
+
+// After (4.x)
+String value = metadata.get("html:custom-key");
+----
+
+=== MAPI Metadata
+
+[source,java]
+----
+// Before (3.x)
+String value = metadata.get("mapi:some-property");
+
+// After (4.x) - prefix remains mapi: but verify specific keys
+String value = metadata.get("mapi:some-property");
+----
+
+=== Resource Name
+
+[source,java]
+----
+// Before (3.x)
+String name = metadata.get("resourceName");
+
+// After (4.x)
+String name = metadata.get("X-TIKA:resourceName");
+----
+
+=== Image Metadata
+
+[source,java]
+----
+// Before (3.x)
+String value = metadata.get("unknown-image-key");
+
+// After (4.x)
+String value = metadata.get("img:unknown-image-key");
+----
+
+=== Office Metadata
+
+[source,java]
+----
+// Before (3.x)
+String value = metadata.get("meta:some-property");
+
+// After (4.x)
+String value = metadata.get("office:some-property");
+----
+
+== Rationale
+
+The namespacing of metadata keys provides several benefits:
+
+* *Security*: Prevents user-controlled content from overwriting internal
metadata
+* *Clarity*: Makes it clear which parser or source generated a metadata key
+* *Consistency*: Provides a uniform approach to metadata naming across all
parsers
diff --git a/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
new file mode 100644
index 0000000000..ba26d25acc
--- /dev/null
+++ b/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc
@@ -0,0 +1,157 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Migrating to Tika 4.x
+
+This guide covers the changes required when upgrading from Apache Tika 3.x to
4.x.
+
+See the xref:../roadmap.adoc[Roadmap] for version timelines and support
schedules.
+
+== Requirements
+
+* Java 17 or later (upgraded from Java 11 in 3.x)
+
+== Configuration: XML to JSON
+
+Tika 4.x uses JSON configuration files instead of XML. The legacy
`tika-config.xml` format
+is no longer supported.
+
+=== Automatic Conversion
+
+Tika provides a conversion tool in `tika-app` to help migrate your XML
configuration:
+
+[source,bash]
+----
+java -jar tika-app.jar
--convert-config-xml-to-json=tika-config.xml,tika-config.json
+----
+
+The converter currently supports:
+
+* **Parsers section** - parser declarations with parameters and exclusions
+* **Parameter types** - bool, int, long, double, float, string, list, and map
+* **Special handling** - TesseractOCR's `otherTesseractSettings` list is
automatically
+ converted to the `otherTesseractConfig` map format
+
+=== Example Conversion
+
+**XML Format (3.x):**
+[source,xml]
+----
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ <param name="maxMainMemoryBytes" type="long">1000000</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ </parsers>
+</properties>
+----
+
+**JSON Format (4.x):**
+[source,json]
+----
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "sortByPosition": true,
+ "maxMainMemoryBytes": 1000000
+ }
+ },
+ {
+ "default-parser": {
+ "_exclude": ["pdf-parser"]
+ }
+ }
+ ]
+}
+----
+
+=== Key Differences
+
+[cols="1,1,2"]
+|===
+|Aspect |XML (3.x) |JSON (4.x)
+
+|Class references
+|Full class name (`org.apache.tika.parser.pdf.PDFParser`)
+|Kebab-case component name (`pdf-parser`)
+
+|Parameters
+|`<param name="..." type="...">value</param>`
+|Direct key-value pairs
+
+|Exclusions
+|`<parser-exclude class="..."/>`
+|`"_exclude": ["component-name"]`
+|===
+
+NOTE: When you configure a parser with specific settings in JSON, the loader
automatically
+excludes it from SPI loading. Explicit exclusions are only needed when you
want to disable
+a parser entirely without providing custom configuration.
+
+=== Limitations
+
+The automatic converter has some limitations:
+
+* Only the `parsers` section is currently converted
+* Detectors and other sections require manual migration
+* Custom or third-party parsers not in the registry will use kebab-case name
conversion
+
+=== Parser Configuration Changes
+
+WARNING: The configuration options for `PDFParser` and `TesseractOCRParser`
have changed
+significantly in 4.x. The automatic converter will migrate your parameter
names, but you
+should review the updated documentation to ensure your configuration is
optimal.
+
+See:
+
+* xref:../configuration/parsers/pdf-parser.adoc[PDFParser Configuration] -
Updated options for PDF parsing
+* xref:../configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser
Configuration] - Updated OCR options
+
+=== Full Configuration Example
+
+Below is a complete example of a Tika 4.x JSON configuration file with
commonly configured parsers:
+
+[source,json]
+----
+include::{parser-examples}/migration-full-example.json[]
+----
+
+NOTE: This example shows common options. See the individual parser
configuration pages for
+complete documentation of all available options.
+
+== Metadata Key Changes
+
+Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and
improve
+namespace clarity.
+
+See xref:metadata-changes-4x.adoc[Metadata Changes in 4.x] for complete
details, including
+a full table of changes and code migration examples.
+
+== API Changes
+
+// TODO: Document API changes
+
+== Deprecations and Removals
+
+// TODO: Document deprecated and removed features
diff --git a/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
b/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
new file mode 100644
index 0000000000..e11bdc4959
--- /dev/null
+++ b/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc
@@ -0,0 +1,101 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Serialization in Tika 4.x
+
+This document describes the JSON serialization design and implementation
details for Apache Tika 4.x.
+
+== High-Level Goals
+
+=== Jackson Framework Integration
+
+Use Jackson as much as possible with as few custom serializers and as few
annotations as possible.
+Jackson dependencies are kept out of core modules to maintain flexibility.
+
+=== Friendly Naming Conventions
+
+The implementation uses friendly names like `pdf-parser` rather than full class
names. These friendly
+names are applied to configured items rather than configuration class names.
+
+=== Custom Class Support
+
+The design permits users to add custom classes through Jackson's polymorphic
handling:
+
+* `org.apache.tika` patterns are allowed by default
+* Users can define additional inclusion patterns for security
+
+=== Configuration Consistency
+
+The approach seeks to make initialization and runtime configuration look
exactly the same and use
+the same underlying code where possible. However, security constraints may
require differences in
+which fields are modifiable at runtime.
+
+=== Configuration Objects Over Annotations
+
+Config objects are preferred over field annotations to support
multithreading. Parsers
+retrieve settings from `ParseContext` at runtime.
+
+=== Cross-System Configuration Flow
+
+Configuration must pass seamlessly from:
+
+. User clients
+. Through tika-server REST APIs
+. Into tika-pipes infrastructure
+
+== Initialization Structure
+
+=== Tier 1 Objects
+
+ID Objects::
+Fetchers, emitters - components with unique identifiers
+
+Composite Objects::
+Parsers, detectors - components that aggregate other components
+
+Single Objects::
+Pipes, gRPC, server configurations
+
+=== Tier 2 Objects
+
+Components that can be read via friendly names using `@TikaComponent`
annotations in an
+`other-config` section.
+
+== Runtime Patterns
+
+=== Backwards Compatibility
+
+The design maintains backwards compatibility by allowing `ParseContext`
additions where the
+interface serves as the key.
+
+=== Partial Configuration Updates
+
+Users can supply only the changes to the initialization configuration as
partial JSON objects,
+rather than providing complete configuration documents.
+
+=== Self-Configuring Components in Pipes
+
+In the pipes infrastructure, objects should configure themselves to avoid
classloading
+dependencies on components like `PDFParser`.
+
+== Security Considerations
+
+* Configuration files at initialization are treated as trusted sources
+* Runtime serialization/deserialization uses an allowlist of permitted packages
+* Custom components can register patterns in
`META-INF/tika-serialization-allowlist.txt`
+
+See xref:design-notes-4x.adoc[Design Notes for 4.x] for additional
architectural context.
diff --git a/docs/src/main/asciidoc/pipes/index.adoc
b/docs/src/main/asciidoc/pipes/index.adoc
new file mode 100644
index 0000000000..e7b49ebc3c
--- /dev/null
+++ b/docs/src/main/asciidoc/pipes/index.adoc
@@ -0,0 +1,37 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika Pipes
+
+This section covers Tika Pipes for scalable, fault-tolerant document
processing.
+
+== Overview
+
+Tika Pipes provides a framework for processing large volumes of documents with:
+
+* **Fetchers** - Retrieve documents from various sources (filesystem, S3,
HTTP, etc.)
+* **Emitters** - Send parsed results to various destinations (filesystem,
OpenSearch, Solr, etc.)
+* **Pipelines** - Configure processing workflows
+
+== Topics
+
+// Add links to specific topics as they are created
+// * link:getting-started.html[Getting Started]
+// * link:fetchers.html[Fetchers]
+// * link:emitters.html[Emitters]
+// * link:configuration.html[Configuration]
+// * link:async.html[Async Processing]
diff --git a/docs/src/main/asciidoc/roadmap.adoc
b/docs/src/main/asciidoc/roadmap.adoc
new file mode 100644
index 0000000000..3e28829a43
--- /dev/null
+++ b/docs/src/main/asciidoc/roadmap.adoc
@@ -0,0 +1,96 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Apache Tika Roadmap
+
+This page outlines the planned features and improvements for Apache Tika
releases.
+
+NOTE: All dates are in Open Source Standard Time which does not always neatly
align
+with traditional calendars.
+
+== Release Timeline
+
+[cols="1,3"]
+|===
+|Date |Milestone
+
+|October 2024
+|Release 3.0.0
+
+|October 2024
+|Move main branch to 4.x (Java 17) after 3.0.0 release
+
+|April 2025
+|End support for 2.x (and Java 8)
+
+|January 2026
+|Release 4.0.0
+
+|June 2026
+|End support for 3.x (and Java 11)
+|===
+
+== Version Support Matrix
+
+[cols="1,1,1,2,2"]
+|===
+|Version |Java |Jakarta/javax |Availability |Planned EOL
+
+|2.x
+|8
+|javax
+|Now
+|April 2025
+
+|3.x
+|11
+|jakarta
+|October 2024
+|June 2026 or 6 months after 4.0.0 release
+
+|4.x
+|17
+|jakarta
+|January 2026
+|TBD
+
+|5.x
+|21
+|jakarta
+|TBD
+|TBD
+
+|6.x
+|25
+|jakarta
+|TBD
+|TBD
+|===
+
+== Metadata Changes in 4.x
+
+Tika 4.x implements namespaced metadata keys to prevent overwrites and improve
namespace clarity.
+
+See xref:migration-to-4x/metadata-changes-4x.adoc[Metadata Changes in 4.x] for
complete details and
+migration examples.
+
+== Long-term Goals
+
+// Add long-term goals as they are defined
+// * Improved streaming support
+// * Enhanced language detection
+// * Better support for modern document formats
diff --git a/docs/src/main/asciidoc/security.adoc
b/docs/src/main/asciidoc/security.adoc
new file mode 100644
index 0000000000..ddc09b7215
--- /dev/null
+++ b/docs/src/main/asciidoc/security.adoc
@@ -0,0 +1,34 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Security
+
+This page covers security considerations when using Apache Tika.
+
+== Security Model
+
+Apache Tika's security model describes the trust boundaries and assumptions
that govern
+how Tika processes content. Understanding this model is essential for
deploying Tika securely.
+
+* https://tika.apache.org/security-model.html[Apache Tika Security Model]
+
+== Known Vulnerabilities
+
+For information about known security vulnerabilities (CVEs) in Apache Tika and
their
+remediation, please see:
+
+* https://tika.apache.org/security.html[Apache Tika Security Vulnerabilities]
diff --git a/docs/src/main/asciidoc/using-tika/cli/index.adoc
b/docs/src/main/asciidoc/using-tika/cli/index.adoc
new file mode 100644
index 0000000000..56105528d7
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/cli/index.adoc
@@ -0,0 +1,39 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika Command Line Interface
+
+This section covers using Apache Tika from the command line via `tika-app`.
+
+== Overview
+
+The Tika application (`tika-app.jar`) provides a command-line interface for
+parsing documents, detecting content types, and extracting metadata.
+
+== Basic Usage
+
+[source,bash]
+----
+java -jar tika-app.jar [options] <file>
+----
+
+== Topics
+
+// Add links to specific topics as they are created
+// * link:installation.html[Installation]
+// * link:options.html[Command Line Options]
+// * link:batch.html[Batch Processing]
diff --git a/docs/src/main/asciidoc/using-tika/grpc/index.adoc
b/docs/src/main/asciidoc/using-tika/grpc/index.adoc
new file mode 100644
index 0000000000..2f1eb24adb
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/grpc/index.adoc
@@ -0,0 +1,32 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika gRPC
+
+This section covers using Apache Tika via gRPC.
+
+== Overview
+
+Tika gRPC provides a high-performance gRPC interface for parsing documents.
+This is useful for microservices architectures and polyglot environments.
+
+== Topics
+
+// Add links to specific topics as they are created
+// * link:getting-started.html[Getting Started]
+// * link:api.html[gRPC API]
+// * link:clients.html[Client Libraries]
diff --git a/docs/src/main/asciidoc/using-tika/index.adoc
b/docs/src/main/asciidoc/using-tika/index.adoc
new file mode 100644
index 0000000000..ada34abc4c
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/index.adoc
@@ -0,0 +1,65 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Getting Started with Apache Tika
+
+Apache Tika can be used in several ways depending on your needs. Choose the
approach
+that best fits your use case.
+
+== Choose Your Integration Method
+
+xref:java-api/index.adoc[Java API]::
+Use Tika directly in your Java application. Best for tight integration and
full control
+over parsing behavior.
+
+xref:cli/index.adoc[Command Line (tika-app)]::
+Run Tika from the command line. Best for quick extraction, scripting, and
one-off tasks.
+
+xref:server/index.adoc[Server (REST API)]::
+Run Tika as a standalone server with a REST API. Best for language-agnostic
integration
+and microservice architectures.
+
+xref:grpc/index.adoc[gRPC]::
+Use Tika via gRPC protocol. Best for high-performance, cross-language
communication.
+
+== Which Should I Use?
+
+[cols="1,3"]
+|===
+|Use Case |Recommended Approach
+
+|Java application needing content extraction
+|Java API
+
+|Shell scripts or batch processing
+|Command Line
+
+|Non-Java application (Python, Node.js, etc.)
+|Server (REST) or gRPC
+
+|High-throughput processing pipeline
+|Server or gRPC with xref:../pipes/index.adoc[Pipes]
+
+|Quick one-time extraction
+|Command Line
+|===
+
+== Scalable Processing
+
+For processing large volumes of documents, see xref:../pipes/index.adoc[Tika
Pipes],
+which provides fault-tolerant, scalable document processing and works with all
of the
+above integration methods.
diff --git a/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
b/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
new file mode 100644
index 0000000000..ff8df846d4
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc
@@ -0,0 +1,130 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Getting Started with the Java API
+
+== Before You Start
+
+Before embedding Tika directly in your Java application, consider whether a
+client-server architecture would better suit your needs.
+
+=== Recommended: Use tika-server or tika-grpc
+
+For most use cases, we recommend running Tika as a separate service rather than
+embedding it directly:
+
+* **xref:../server/index.adoc[tika-server]** - REST API, language-agnostic
+* **xref:../grpc/index.adoc[tika-grpc]** - High-performance gRPC protocol
+
+**Why?**
+
+* **Process isolation** - Parser crashes don't affect your application
+* **Easier deployment** - Use official Docker images
+* **Language flexibility** - Call from any language, not just Java
+* **Simpler upgrades** - Update Tika independently of your application
+
+Docker images are available at https://hub.docker.com/r/apache/tika[Docker
Hub].
+
+=== When to Use the Java API
+
+The Java API is appropriate when you:
+
+* Need tight integration with Tika internals
+* Cannot use a network service
+* Have specific customization requirements
+
+== Using PipesForkParser (Recommended)
+
+If you must use Tika as a library, use `PipesForkParser` from the
+`tika-pipes-fork-parser` module. It provides process isolation to protect your
+application from parser crashes, memory leaks, and infinite loops.
+
+=== Maven Dependency
+
+[source,xml]
+----
+<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-pipes-fork-parser</artifactId>
+ <version>${tika.version}</version>
+</dependency>
+----
+
+=== Basic Example
+
+[source,java]
+----
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.pipes.fork.PipesForkParser;
+import org.apache.tika.pipes.fork.PipesForkResult;
+
+try (PipesForkParser parser = new PipesForkParser();
+ TikaInputStream tis = TikaInputStream.get(filePath)) {
+
+ PipesForkResult result = parser.parse(tis);
+
+ if (result.isSuccess()) {
+ String content = result.getContent();
+ // process content...
+ } else {
+ // handle failure
+ }
+}
+----
+
+=== Key Features
+
+* **Process isolation** - Parsing runs in a separate JVM
+* **Automatic restart** - If the forked process crashes, it restarts
automatically
+* **Configurable timeouts** - Prevent infinite loops
+* **Thread-safe** - Reuse across multiple threads
+
+=== Complete Examples
+
+See
+https://github.com/apache/tika/blob/main/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java[PipesForkParserExample.java]
+in the `tika-example` module for comprehensive examples including:
+
+* Basic parsing
+* Handling embedded documents
+* Custom configuration
+* Error handling
+* Batch processing
+
+== Without Pipes: Understanding the Risks
+
+If you choose not to use `PipesForkParser` and instead use Tika's parsers
directly
+(e.g., `AutoDetectParser`), you are responsible for handling the risks of
parsing
+untrusted content.
+
+WARNING: Running parsers directly on untrusted data can cause
OutOfMemoryErrors,
+infinite loops, and crashes that will affect your entire application.
+
+Before proceeding without process isolation, read:
+
+* xref:../../advanced/robustness.adoc[The Robustness of Apache Tika] -
Understanding parser risks and mitigations
+* https://tika.apache.org/security-model.html[Apache Tika Security Model] -
Trust boundaries and assumptions
+
+If you still need to use parsers directly, your application is responsible for
+implementing its own process isolation so that you can:
+
+* Set parse timeouts (Tika cannot enforce timeouts without process isolation)
+* Configure memory limits (requires separate JVM)
+* Kill runaway processes
+* Recover from crashes
+
+Never run Tika in the same JVM as critical infrastructure.
diff --git a/docs/src/main/asciidoc/using-tika/java-api/index.adoc
b/docs/src/main/asciidoc/using-tika/java-api/index.adoc
new file mode 100644
index 0000000000..703a2cf2c2
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/java-api/index.adoc
@@ -0,0 +1,38 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Using Tika as a Library (Java API)
+
+This section covers using Apache Tika programmatically in your Java
applications.
+
+== Overview
+
+Tika can be embedded directly into your Java applications as a library. This
gives you
+full control over parsing, detection, and configuration.
+
+However, for most use cases we recommend using
xref:../server/index.adoc[tika-server]
+or xref:../grpc/index.adoc[tika-grpc] instead. See
+xref:getting-started.adoc[Getting Started] for guidance on choosing the right
approach.
+
+== Topics
+
+* xref:getting-started.adoc[Getting Started] - Recommendations and
PipesForkParser usage
+
+// Add links to specific topics as they are created
+// * link:parsing.html[Parsing Documents]
+// * link:detection.html[Content Detection]
+// * link:configuration.html[Configuration]
diff --git a/docs/src/main/asciidoc/using-tika/server/index.adoc
b/docs/src/main/asciidoc/using-tika/server/index.adoc
new file mode 100644
index 0000000000..accfc02700
--- /dev/null
+++ b/docs/src/main/asciidoc/using-tika/server/index.adoc
@@ -0,0 +1,42 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Tika Server
+
+This section covers running Apache Tika as a REST server via `tika-server`.
+
+== Overview
+
+Tika Server provides a RESTful HTTP interface for parsing documents and
extracting
+content. It can be deployed as a standalone service or in a containerized
environment.
+
+== Basic Usage
+
+[source,bash]
+----
+java -jar tika-server-standard.jar
+----
+
+The server starts on port 9998 by default.
+
+== Topics
+
+// Add links to specific topics as they are created
+// * link:installation.html[Installation]
+// * link:endpoints.html[REST Endpoints]
+// * link:configuration.html[Configuration]
+// * link:docker.html[Docker Deployment]
diff --git a/pom.xml b/pom.xml
index 9e451d1006..417aab5f35 100644
--- a/pom.xml
+++ b/pom.xml
@@ -62,6 +62,9 @@
<profiles>
<profile>
<id>apache-release</id>
+ <modules>
+ <module>docs</module>
+ </modules>
<properties>
<username>${user.name}</username>
</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java
new file mode 100644
index 0000000000..1429984cbf
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Validates configuration examples used in documentation.
+ * <p>
+ * The JSON configuration examples are stored in {@code
src/test/resources/config-examples/}
+ * and are included directly in the AsciiDoc documentation via the {@code
include::} directive.
+ * This test class validates that each example is valid and can be loaded by
TikaLoader.
+ * <p>
+ * <strong>Important:</strong> When modifying examples in the config-examples
directory,
+ * ensure the JSON remains valid and these tests pass. The documentation will
automatically
+ * reflect your changes.
+ * <p>
+ * TODO: Consider auto-generating the full config JSON files from the actual
config classes
+ * (e.g., PDFParserConfig, TesseractOCRConfig) during the build process. This
would:
+ * <ul>
+ * <li>Guarantee JSON always matches actual defaults</li>
+ * <li>Automatically catch when fields are added/removed</li>
+ * <li>Use Jackson's ORDER_MAP_ENTRIES_BY_KEYS for consistent ordering</li>
+ * </ul>
+ * Challenge: Jackson doesn't write comments in JSON output, so enum options
would need
+ * to be documented via annotations and a post-processor, or in the AsciiDoc
directly.
+ */
+public class ConfigExamplesTest {
+
+ private static final String EXAMPLES_DIR = "/config-examples/";
+
+ @TempDir
+ Path tempDir;
+
+ private Parser loadAndValidate(String resourceName) throws Exception {
+ try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR +
resourceName)) {
+ assertNotNull(is, "Resource not found: " + resourceName);
+ String json = new String(is.readAllBytes(),
StandardCharsets.UTF_8);
+ Path configFile = tempDir.resolve("tika-config.json");
+ Files.writeString(configFile, json, StandardCharsets.UTF_8);
+ TikaLoader loader = TikaLoader.load(configFile);
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser, "Parser should not be null for: " +
resourceName);
+ return parser;
+ }
+ }
+
+ @Test
+ public void testPdfParserBasicConfig() throws Exception {
+ loadAndValidate("pdf-parser-basic.json");
+ }
+
+ @Test
+ public void testPdfParserFullConfig() throws Exception {
+ loadAndValidate("pdf-parser-full.json");
+ }
+
+ @Test
+ public void testTesseractBasicConfig() throws Exception {
+ loadAndValidate("tesseract-basic.json");
+ }
+
+ @Test
+ public void testTesseractFullConfig() throws Exception {
+ loadAndValidate("tesseract-full.json");
+ }
+
+ @Test
+ public void testFullMigrationExample() throws Exception {
+ loadAndValidate("migration-full-example.json");
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json
new file mode 100644
index 0000000000..014a7b69d7
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json
@@ -0,0 +1,26 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "extractInlineImages": true,
+ "extractUniqueInlineImagesOnly": true,
+ "sortByPosition": true,
+ "maxMainMemoryBytes": 1000000000
+ }
+ },
+ {
+ "tesseract-ocr-parser": {
+ "language": "eng+fra",
+ "pageSegMode": "1",
+ "timeoutSeconds": 300,
+ "otherTesseractConfig": {
+ "textord_initialx_ile": "0.75",
+ "textord_noise_hfract": "0.15625"
+ }
+ }
+ },
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json
new file mode 100644
index 0000000000..591e214ee6
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json
@@ -0,0 +1,10 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ "extractInlineImages": true,
+ "sortByPosition": true
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json
new file mode 100644
index 0000000000..9f455918de
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json
@@ -0,0 +1,53 @@
+{
+ "parsers": [
+ {
+ "pdf-parser": {
+ // Options: DONT_CHECK, ALLOW_EXTRACTION_FOR_ACCESSIBILITY,
IGNORE_ACCESSIBILITY_ALLOWANCE
+ "accessCheckMode": "DONT_CHECK",
+ "averageCharTolerance": 0.3,
+ "catchIntermediateIOExceptions": true,
+ "detectAngles": false,
+ "dropThreshold": 2.5,
+ "enableAutoSpace": true,
+ "extractAcroFormContent": true,
+ "extractActions": false,
+ "extractAnnotationText": true,
+ "extractBookmarksText": true,
+ "extractFontNames": false,
+ "extractIncrementalUpdateInfo": true,
+ "extractInlineImageMetadataOnly": false,
+ "extractInlineImages": false,
+ "extractMarkedContent": false,
+ "extractUniqueInlineImagesOnly": true,
+ "ifXFAExtractOnlyXFA": false,
+ "ignoreContentStreamSpaceGlyphs": false,
+ // Options: NONE, RAW_IMAGES, RENDER_PAGES_BEFORE_PARSE,
RENDER_PAGES_AT_PAGE_END
+ "imageStrategy": "NONE",
+ "maxIncrementalUpdates": 10,
+ "maxMainMemoryBytes": 536870912,
+ "ocr": {
+ "dpi": 300,
+ // Options: PNG, TIFF, JPEG
+ "imageFormat": "PNG",
+ "imageQuality": 1.0,
+ // Options: RGB, GRAY
+ "imageType": "GRAY",
+ // Options: NO_TEXT, TEXT_ONLY, VECTOR_GRAPHICS_ONLY, ALL
+ "renderingStrategy": "ALL",
+ // Options: AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION
+ "strategy": "AUTO",
+ "strategyAuto": {
+ "totalCharsPerPage": 10,
+ "unmappedUnicodeCharsPerPage": 10
+ }
+ },
+ "parseIncrementalUpdates": false,
+ "setKCMS": false,
+ "sortByPosition": false,
+ "spacingTolerance": 0.5,
+ "suppressDuplicateOverlappingText": false,
+ "throwOnEncryptedPayload": false
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json
new file mode 100644
index 0000000000..f41a367acc
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json
@@ -0,0 +1,10 @@
+{
+ "parsers": [
+ {
+ "tesseract-ocr-parser": {
+ "language": "eng",
+ "timeoutSeconds": 120
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json
new file mode 100644
index 0000000000..4e3e75aeae
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json
@@ -0,0 +1,35 @@
+{
+ "parsers": [
+ {
+ "tesseract-ocr-parser": {
+ "applyRotation": false,
+ "colorspace": "gray",
+ "density": 300,
+ "depth": 4,
+ "enableImagePreprocessing": false,
+ "filter": "triangle",
+ "imageMagickPath": "",
+ "inlineContent": false,
+ "language": "eng",
+ "maxFileSizeToOcr": 2147483647,
+ "minFileSizeToOcr": 0,
+ // Additional Tesseract configuration parameters as key-value pairs
+ "otherTesseractConfig": {
+ "preserve_interword_spaces": "1",
+ "textord_initialx_ile": "0.75",
+ "textord_noise_hfract": "0.15625"
+ },
+ // Options: TXT, HOCR
+ "outputType": "TXT",
+ "pageSeparator": "",
+ "pageSegMode": "1",
+ "preserveInterwordSpacing": false,
+ "resize": 200,
+ "skipOcr": false,
+ "tessdataPath": "",
+ "tesseractPath": "",
+ "timeoutSeconds": 120
+ }
+ }
+ ]
+}
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java
new file mode 100644
index 0000000000..70fe7947bb
--- /dev/null
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fs;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+
+/**
+ * Loads each file system fetcher/emitter configuration example used in the documentation.
+ * <p>
+ * The JSON examples live in {@code src/test/resources/config-examples/}; the AsciiDoc
+ * documentation pulls them in directly through the {@code include::} directive.
+ */
+public class ConfigExamplesTest {
+    private static final String EXAMPLES_DIR = "/config-examples/";
+
+    @TempDir
+    Path tempDir;
+
+    // Stage the named example as tempDir/tika-config.json, then require a non-null TikaLoader.
+    private void loadAndValidate(String resourceName) throws Exception {
+        Path target = tempDir.resolve("tika-config.json");
+        try (InputStream in = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(in, "Resource not found: " + resourceName);
+            String content = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+            Files.writeString(target, content, StandardCharsets.UTF_8);
+        }
+        TikaLoader loaded = TikaLoader.load(target);
+        assertNotNull(loaded, "TikaLoader should not be null for: " + resourceName);
+    }
+
+    @Test
+    public void testFileSystemFetcherConfig() throws Exception {
+        loadAndValidate("file-system-fetcher.json");
+    }
+
+    @Test
+    public void testFileSystemEmitterConfig() throws Exception {
+        loadAndValidate("file-system-emitter.json");
+    }
+
+    @Test
+    public void testFileSystemPipelineConfig() throws Exception {
+        loadAndValidate("file-system-pipeline.json");
+    }
+}
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json
new file mode 100644
index 0000000000..4f01761e45
--- /dev/null
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json
@@ -0,0 +1,13 @@
+{
+ "emitters": [
+ {
+ "file-system-emitter": {
+ "id": "my-emitter",
+ "basePath": "/data/output",
+ "fileExtension": "json",
+ "onExists": "REPLACE",
+ "prettyPrint": true
+ }
+ }
+ ]
+}
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json
new file mode 100644
index 0000000000..201d4fa099
--- /dev/null
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json
@@ -0,0 +1,11 @@
+{
+ "fetchers": [
+ {
+ "file-system-fetcher": {
+ "id": "my-fetcher",
+ "basePath": "/data/documents",
+ "extractFileSystemMetadata": true
+ }
+ }
+ ]
+}
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json
new file mode 100644
index 0000000000..3d95755eff
--- /dev/null
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json
@@ -0,0 +1,27 @@
+{
+ "fetchers": [
+ {
+ "file-system-fetcher": {
+ "id": "input-fetcher",
+ "basePath": "/data/input",
+ "extractFileSystemMetadata": true
+ }
+ }
+ ],
+ "emitters": [
+ {
+ "file-system-emitter": {
+ "id": "output-emitter",
+ "basePath": "/data/output",
+ "fileExtension": "json",
+ "onExists": "SKIP",
+ "prettyPrint": false
+ }
+ }
+ ],
+ "parsers": [
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
index 3984952485..b8b7e4389d 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.config.loader;
+import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
@@ -68,6 +69,9 @@ public class TikaObjectMapperFactory {
public static ObjectMapper createMapper() {
ObjectMapper mapper = new ObjectMapper();
+ // Allow comments in JSON config files (// and /* */ style)
+ mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
+
// Fail on unknown properties to catch configuration errors early
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,
true);
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java
new file mode 100644
index 0000000000..43dd1391e3
--- /dev/null
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+
+/**
+ * Loads each server configuration example used in the documentation.
+ * <p>
+ * The JSON examples live in {@code src/test/resources/config-examples/}; the AsciiDoc
+ * documentation pulls them in directly through the {@code include::} directive.
+ */
+public class ConfigExamplesTest {
+    private static final String EXAMPLES_DIR = "/config-examples/";
+
+    @TempDir
+    Path tempDir;
+
+    // Stage the named example as tempDir/tika-config.json, then require a non-null TikaLoader.
+    private void loadAndValidate(String resourceName) throws Exception {
+        Path target = tempDir.resolve("tika-config.json");
+        try (InputStream in = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(in, "Resource not found: " + resourceName);
+            String content = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+            Files.writeString(target, content, StandardCharsets.UTF_8);
+        }
+        TikaLoader loaded = TikaLoader.load(target);
+        assertNotNull(loaded, "TikaLoader should not be null for: " + resourceName);
+    }
+
+    @Test
+    public void testServerBasicConfig() throws Exception {
+        loadAndValidate("server-basic.json");
+    }
+
+    @Test
+    public void testServerWithParsersConfig() throws Exception {
+        loadAndValidate("server-with-parsers.json");
+    }
+}
diff --git
a/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json
b/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json
new file mode 100644
index 0000000000..d133c0deeb
--- /dev/null
+++
b/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json
@@ -0,0 +1,13 @@
+{
+ "server": {
+ "port": 9998,
+ "host": "localhost",
+ "taskTimeoutMillis": 300000,
+ "enableUnsecureFeatures": false
+ },
+ "parsers": [
+ {
+ "default-parser": {}
+ }
+ ]
+}
diff --git
a/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
new file mode 100644
index 0000000000..fadb08a55f
--- /dev/null
+++
b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
@@ -0,0 +1,24 @@
+{
+ "server": {
+ "port": 9998,
+ "host": "0.0.0.0",
+ "taskTimeoutMillis": 600000,
+ "returnStackTrace": true
+ },
+ "parsers": [
+ {
+ "pdf-parser": {
+ "extractInlineImages": true,
+ "ocrStrategy": "AUTO"
+ }
+ },
+ {
+ "default-parser": {}
+ }
+ ],
+ "detectors": [
+ {
+ "default-detector": {}
+ }
+ ]
+}