This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4733 in repository https://gitbox.apache.org/repos/asf/tika.git
commit b86b4733a5cbfe5111eefe5a6dafc15e7986dc4e Author: tallison <[email protected]> AuthorDate: Wed May 20 09:24:10 2026 -0400 TIKA-4733 -- improve release artifact robustness and documentation --- docs/modules/ROOT/pages/pipes/parse-modes.adoc | 4 +- docs/modules/ROOT/pages/using-tika/cli/index.adoc | 35 ++++++++++++--- pom.xml | 50 ++++++++++++++++++++++ tika-app/pom.xml | 4 ++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 32 ++++++++++++-- tika-eval/tika-eval-app/pom.xml | 4 ++ tika-server/tika-server-standard/pom.xml | 37 ++++++++++++++++ .../src/main/assembly/assembly.xml | 3 +- 8 files changed, 156 insertions(+), 13 deletions(-) diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc index 6e5f47fa4e..9c1bf96860 100644 --- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc +++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc @@ -147,12 +147,12 @@ only `X-TIKA:content` and `X-TIKA:container_exception`. If you set your own === CLI usage -The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the `--content-only` +The `tika-app` batch processor supports `CONTENT_ONLY` via the `--content-only` flag: [source,bash] ---- -java -jar tika-async-cli.jar -i /input -o /output -h m --content-only +java -jar tika-app.jar -i /input -o /output -h m --content-only ---- This produces `.md` files (when using the `m` handler type) containing only the diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc index 17a631e1f8..e3abc00a3c 100644 --- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc +++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc @@ -24,9 +24,29 @@ This section covers using Apache Tika from the command line via `tika-app`. == Overview -The Tika application (`tika-app.jar`) is a standalone command line utility for extracting +The Tika application (`tika-app`) is a command line utility for extracting text content and metadata from all sorts of files. +== Installation + +NOTE: As of 4.x, `tika-app` is distributed as a zip archive rather than a single +self-contained jar. The bare `tika-app-<version>.jar` is only a thin launcher and +will fail with `NoClassDefFoundError` if run on its own — the parsers and supporting +modules (including the batch processor) live in the adjacent `lib/` directory. + +Download `tika-app-<version>.zip`, unzip it, and run `tika-app-<version>.jar` from +inside the unzipped directory so that `lib/` and `plugins/` sit alongside the jar: + +[source,bash] +---- +unzip tika-app-<version>.zip +cd tika-app-<version> +java -jar tika-app-<version>.jar [option...] [file|port...] +---- + +The examples below use `tika-app.jar` as shorthand for the versioned jar in the +unzipped distribution. + == Basic Usage [source,bash] @@ -143,16 +163,17 @@ Use a custom configuration file: java -jar tika-app.jar --config=tika-config.json document.pdf ---- -== Batch Processing (tika-async-cli) +== Batch Processing -For processing large numbers of files, use `tika-async-cli`. It uses the Tika Pipes -architecture with forked JVM processes for fault tolerance. +For processing large numbers of files, run `tika-app` with input/output directories. +Under the hood this uses Tika Pipes batch processing, with forked JVM processes for +fault tolerance. === Basic Batch Usage [source,bash] ---- -java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output +java -jar tika-app.jar -i /path/to/input -o /path/to/output ---- This processes all files in the input directory and writes JSON metadata (RMETA format) @@ -195,7 +216,7 @@ Extract markdown content only (no metadata) from all files: [source,bash] ---- -java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output -h m --content-only +java -jar tika-app.jar -i /path/to/input -o /path/to/output -h m --content-only ---- This produces `.md` files in the output directory containing just the extracted markdown @@ -205,5 +226,5 @@ Extract text with all metadata in concatenated mode: [source,bash] ---- -java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output --concatenate +java -jar tika-app.jar -i /path/to/input -o /path/to/output --concatenate ---- diff --git a/pom.xml b/pom.xml index ce97d63c90..10b9bf8759 100644 --- a/pom.xml +++ b/pom.xml @@ -127,6 +127,56 @@ <include name="tika-pipes/tika-pipes-plugins/*/target/tika-pipes-*-${project.version}.zip*" /> </fileset> </copy> + <!-- + TIKA-4733: fail the release build if any expected dist + artifact did not land in target/${project.version}/. + Ant <copy> silently succeeds when an <include> matches + zero files, so a renamed, re-versioned, or forgotten + artifact (or a module whose assembly/shade stopped + producing output) would otherwise yield an incomplete + release candidate with no error. Each <fail> below names + the missing artifact so the gap is obvious. Keep this list + in sync with the <copy> include list above. + --> + <fail message="Release staging missing: CHANGES.txt"> + <condition><not><available file="${basedir}/target/${project.version}/CHANGES.txt" /></not></condition> + </fail> + <fail message="Release staging missing: tika-${project.version}-src.zip"> + <condition><not><available file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" /></not></condition> + </fail> + <fail message="Release staging missing: tika-parser-scientific-package-${project.version}-shaded.jar"> + <condition><not><available file="${basedir}/target/${project.version}/tika-parser-scientific-package-${project.version}-shaded.jar" /></not></condition> + </fail> + <fail message="Release staging missing: tika-parser-sqlite3-package-${project.version}-shaded.jar"> + <condition><not><available file="${basedir}/target/${project.version}/tika-parser-sqlite3-package-${project.version}-shaded.jar" /></not></condition> + </fail> + <fail message="Release staging missing: tika-parser-nlp-package-${project.version}-shaded.jar"> + <condition><not><available file="${basedir}/target/${project.version}/tika-parser-nlp-package-${project.version}-shaded.jar" /></not></condition> + </fail> + <fail message="Release staging missing: tika-app-${project.version}.zip"> + <condition><not><available file="${basedir}/target/${project.version}/tika-app-${project.version}.zip" /></not></condition> + </fail> + <fail message="Release staging missing: tika-server-standard-${project.version}-bin.zip"> + <condition><not><available file="${basedir}/target/${project.version}/tika-server-standard-${project.version}-bin.zip" /></not></condition> + </fail> + <fail message="Release staging missing: tika-eval-app-${project.version}.zip"> + <condition><not><available file="${basedir}/target/${project.version}/tika-eval-app-${project.version}.zip" /></not></condition> + </fail> + <!-- + pipes plugin zips are staged via a glob, so assert the + staged count equals the number of plugin modules that have + an assembly descriptor. This catches a single plugin whose + zip silently dropped out without hard-coding the plugin set. + --> + <resourcecount property="staged.plugin.zip.count"> + <fileset dir="${basedir}/target/${project.version}" includes="tika-pipes-*-${project.version}.zip" /> + </resourcecount> + <resourcecount property="source.plugin.count"> + <fileset dir="${basedir}/tika-pipes/tika-pipes-plugins" includes="*/src/main/assembly/assembly.xml" /> + </resourcecount> + <fail message="Release staging has ${staged.plugin.zip.count} tika-pipes plugin zip(s) but there are ${source.plugin.count} plugin module(s) with an assembly descriptor; a plugin zip was dropped from dist staging."> + <condition><not><equals arg1="${staged.plugin.zip.count}" arg2="${source.plugin.count}" /></not></condition> + </fail> <checksum algorithm="SHA-512" fileext=".sha512"> <fileset dir="${basedir}/target/${project.version}"> <include name="*.tgz" /> diff --git a/tika-app/pom.xml b/tika-app/pom.xml index 93f1394c92..737c53ad27 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -151,6 +151,10 @@ <descriptor>src/main/assembly/assembly.xml</descriptor> </descriptors> <appendAssemblyId>false</appendAssemblyId> + <!-- TIKA-4733: this fat zip (slim jar + lib/) is an Apache dist artifact, + not a Maven Central one. attach=false keeps it off Central; the + apache-release profile stages it to dist straight from target/. --> + <attach>false</attach> </configuration> <executions> <execution> diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 82be748314..3134d71a61 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -298,17 +298,43 @@ public class TikaCLI { } if (runpack || ! StringUtils.isBlank(tikaConfigPath)) { - TikaAsyncCLI.main(args); + invokeAsyncCLI(args); return; } if (args.length == 1 && args[0].endsWith(".json")) { - TikaAsyncCLI.main(args); + invokeAsyncCLI(args); return; } // For batch mode (two directories), pass directly to TikaAsyncCLI. // It will create its own config with PluginsWriter that includes // plugin-roots, fetcher, emitter, and pipes-iterator configuration. - TikaAsyncCLI.main(args); + invokeAsyncCLI(args); + } + + /** + * Invokes the batch/async processor ({@code tika-async-cli}). The async + * processor and the parsers it forks live in the {@code lib/} directory of + * the tika-app distribution rather than inside the bare {@code tika-app.jar}. + * If tika-app is run as a standalone jar (without the surrounding unzipped + * distribution), the supporting classes are missing from the classpath and + * the JVM throws {@link NoClassDefFoundError}. Translate that into an + * actionable message rather than letting the raw error escape. + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-4733">TIKA-4733</a> + */ + private static void invokeAsyncCLI(String[] args) throws Exception { + try { + TikaAsyncCLI.main(args); + } catch (NoClassDefFoundError e) { + System.err.println("Error: could not load the Tika batch/async processor (" + + e.getMessage() + ")."); + System.err.println("Batch mode requires the full tika-app distribution, not the " + + "standalone jar."); + System.err.println("Download tika-app-<version>.zip, unzip it, and run " + + "tika-app-<version>.jar from inside the unzipped directory so that the " + + "adjacent 'lib/' and 'plugins/' directories are on the classpath."); + System.exit(1); + } } /** diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 4b97f249c1..52cac005d1 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -86,6 +86,10 @@ <descriptor>src/main/assembly/assembly.xml</descriptor> </descriptors> <appendAssemblyId>false</appendAssemblyId> + <!-- TIKA-4733: this fat zip (slim jar + lib/) is an Apache dist artifact, + not a Maven Central one. attach=false keeps it off Central; the + apache-release profile stages it to dist straight from target/. --> + <attach>false</attach> </configuration> <executions> <execution> diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml index 13f3c18434..e343f90bf1 100644 --- a/tika-server/tika-server-standard/pom.xml +++ b/tika-server/tika-server-standard/pom.xml @@ -180,6 +180,13 @@ <descriptors> <descriptor>src/main/assembly/assembly.xml</descriptor> </descriptors> + <!-- TIKA-4733: the -bin.zip full distribution is an Apache dist + artifact, not a Maven Central one. attach=false keeps it off + Central; the apache-release profile stages it to dist straight + from target/. Sibling modules (tika-e2e-tests/tika-server) that + consume tika-server-standard:bin:zip as a Maven dep are satisfied + by the install-file execution below. --> + <attach>false</attach> </configuration> <executions> <execution> @@ -191,6 +198,36 @@ </execution> </executions> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-install-plugin</artifactId> + <executions> + <!-- + With <attach>false</attach> on the assembly above (TIKA-4733) the + -bin.zip is not part of the project artifact set and so is neither + deployed to Central nor installed locally. Sibling reactor modules + declare tika-server-standard:bin:zip as a Maven dep, so install it + into the local repo at its canonical coordinates to satisfy reactor + resolution without publishing it to Central. + --> + <execution> + <id>install-server-bin-zip-locally</id> + <phase>install</phase> + <goals> + <goal>install-file</goal> + </goals> + <configuration> + <file>${project.build.directory}/tika-server-standard-${project.version}-bin.zip</file> + <groupId>${project.groupId}</groupId> + <artifactId>${project.artifactId}</artifactId> + <version>${project.version}</version> + <classifier>bin</classifier> + <packaging>zip</packaging> + <generatePom>false</generatePom> + </configuration> + </execution> + </executions> + </plugin> </plugins> </build> diff --git a/tika-server/tika-server-standard/src/main/assembly/assembly.xml b/tika-server/tika-server-standard/src/main/assembly/assembly.xml index ad210ebbd2..707675b01d 100644 --- a/tika-server/tika-server-standard/src/main/assembly/assembly.xml +++ b/tika-server/tika-server-standard/src/main/assembly/assembly.xml @@ -20,8 +20,9 @@ <id>bin</id> <baseDirectory>${project.build.finalName}-bin</baseDirectory> <includeBaseDirectory>false</includeBaseDirectory> + <!-- TIKA-4733: 4.x ships the full distribution as .zip only (universally + readable); the redundant .tgz of identical contents was dropped. --> <formats> - <format>tgz</format> <format>zip</format> </formats> <dependencySets>
