This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4733
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b86b4733a5cbfe5111eefe5a6dafc15e7986dc4e
Author: tallison <[email protected]>
AuthorDate: Wed May 20 09:24:10 2026 -0400

    TIKA-4733 -- improve release artifact robustness and documentation
---
 docs/modules/ROOT/pages/pipes/parse-modes.adoc     |  4 +-
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  | 35 ++++++++++++---
 pom.xml                                            | 50 ++++++++++++++++++++++
 tika-app/pom.xml                                   |  4 ++
 .../src/main/java/org/apache/tika/cli/TikaCLI.java | 32 ++++++++++++--
 tika-eval/tika-eval-app/pom.xml                    |  4 ++
 tika-server/tika-server-standard/pom.xml           | 37 ++++++++++++++++
 .../src/main/assembly/assembly.xml                 |  3 +-
 8 files changed, 156 insertions(+), 13 deletions(-)

diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc 
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 6e5f47fa4e..9c1bf96860 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -147,12 +147,12 @@ only `X-TIKA:content` and `X-TIKA:container_exception`. 
If you set your own
 
 === CLI usage
 
-The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the 
`--content-only`
+The `tika-app` batch processor supports `CONTENT_ONLY` via the `--content-only`
 flag:
 
 [source,bash]
 ----
-java -jar tika-async-cli.jar -i /input -o /output -h m --content-only
+java -jar tika-app.jar -i /input -o /output -h m --content-only
 ----
 
 This produces `.md` files (when using the `m` handler type) containing only the
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc 
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index 17a631e1f8..e3abc00a3c 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -24,9 +24,29 @@ This section covers using Apache Tika from the command line 
via `tika-app`.
 
 == Overview
 
-The Tika application (`tika-app.jar`) is a standalone command line utility for 
extracting
+The Tika application (`tika-app`) is a command line utility for extracting
 text content and metadata from all sorts of files.
 
+== Installation
+
+NOTE: As of 4.x, `tika-app` is distributed as a zip archive rather than a 
single
+self-contained jar. The bare `tika-app-<version>.jar` is only a thin launcher 
and
+will fail with `NoClassDefFoundError` if run on its own — the parsers and 
supporting
+modules (including the batch processor) live in the adjacent `lib/` directory.
+
+Download `tika-app-<version>.zip`, unzip it, and run `tika-app-<version>.jar` 
from
+inside the unzipped directory so that `lib/` and `plugins/` sit alongside the 
jar:
+
+[source,bash]
+----
+unzip tika-app-<version>.zip
+cd tika-app-<version>
+java -jar tika-app-<version>.jar [option...] [file|port...]
+----
+
+The examples below use `tika-app.jar` as shorthand for the versioned jar in the
+unzipped distribution.
+
 == Basic Usage
 
 [source,bash]
@@ -143,16 +163,17 @@ Use a custom configuration file:
 java -jar tika-app.jar --config=tika-config.json document.pdf
 ----
 
-== Batch Processing (tika-async-cli)
+== Batch Processing
 
-For processing large numbers of files, use `tika-async-cli`. It uses the Tika 
Pipes
-architecture with forked JVM processes for fault tolerance.
+For processing large numbers of files, run `tika-app` with input/output 
directories.
+Under the hood this uses Tika Pipes batch processing, with forked JVM 
processes for
+fault tolerance.
 
 === Basic Batch Usage
 
 [source,bash]
 ----
-java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output
+java -jar tika-app.jar -i /path/to/input -o /path/to/output
 ----
 
 This processes all files in the input directory and writes JSON metadata 
(RMETA format)
@@ -195,7 +216,7 @@ Extract markdown content only (no metadata) from all files:
 
 [source,bash]
 ----
-java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output -h m 
--content-only
+java -jar tika-app.jar -i /path/to/input -o /path/to/output -h m --content-only
 ----
 
 This produces `.md` files in the output directory containing just the 
extracted markdown
@@ -205,5 +226,5 @@ Extract text with all metadata in concatenated mode:
 
 [source,bash]
 ----
-java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output --concatenate
+java -jar tika-app.jar -i /path/to/input -o /path/to/output --concatenate
 ----
diff --git a/pom.xml b/pom.xml
index ce97d63c90..10b9bf8759 100644
--- a/pom.xml
+++ b/pom.xml
@@ -127,6 +127,56 @@
                         <include 
name="tika-pipes/tika-pipes-plugins/*/target/tika-pipes-*-${project.version}.zip*"
 />
                       </fileset>
                     </copy>
+                    <!--
+                      TIKA-4733: fail the release build if any expected dist
+                      artifact did not land in target/${project.version}/.
+                      Ant <copy> silently succeeds when an <include> matches
+                      zero files, so a renamed, re-versioned, or forgotten
+                      artifact (or a module whose assembly/shade stopped
+                      producing output) would otherwise yield an incomplete
+                      release candidate with no error. Each <fail> below names
+                      the missing artifact so the gap is obvious. Keep this 
list
+                      in sync with the <copy> include list above.
+                    -->
+                    <fail message="Release staging missing: CHANGES.txt">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/CHANGES.txt" /></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-${project.version}-src.zip">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" 
/></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-parser-scientific-package-${project.version}-shaded.jar">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-parser-scientific-package-${project.version}-shaded.jar"
 /></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-parser-sqlite3-package-${project.version}-shaded.jar">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-parser-sqlite3-package-${project.version}-shaded.jar"
 /></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-parser-nlp-package-${project.version}-shaded.jar">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-parser-nlp-package-${project.version}-shaded.jar"
 /></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-app-${project.version}.zip">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-app-${project.version}.zip" 
/></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-server-standard-${project.version}-bin.zip">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-server-standard-${project.version}-bin.zip"
 /></not></condition>
+                    </fail>
+                    <fail message="Release staging missing: 
tika-eval-app-${project.version}.zip">
+                      <condition><not><available 
file="${basedir}/target/${project.version}/tika-eval-app-${project.version}.zip"
 /></not></condition>
+                    </fail>
+                    <!--
+                      pipes plugin zips are staged via a glob, so assert the
+                      staged count equals the number of plugin modules that 
have
+                      an assembly descriptor. This catches a single plugin 
whose
+                      zip silently dropped out without hard-coding the plugin 
set.
+                    -->
+                    <resourcecount property="staged.plugin.zip.count">
+                      <fileset dir="${basedir}/target/${project.version}" 
includes="tika-pipes-*-${project.version}.zip" />
+                    </resourcecount>
+                    <resourcecount property="source.plugin.count">
+                      <fileset dir="${basedir}/tika-pipes/tika-pipes-plugins" 
includes="*/src/main/assembly/assembly.xml" />
+                    </resourcecount>
+                    <fail message="Release staging has 
${staged.plugin.zip.count} tika-pipes plugin zip(s) but there are 
${source.plugin.count} plugin module(s) with an assembly descriptor; a plugin 
zip was dropped from dist staging.">
+                      <condition><not><equals 
arg1="${staged.plugin.zip.count}" arg2="${source.plugin.count}" 
/></not></condition>
+                    </fail>
                     <checksum algorithm="SHA-512" fileext=".sha512">
                       <fileset dir="${basedir}/target/${project.version}">
                         <include name="*.tgz" />
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 93f1394c92..737c53ad27 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -151,6 +151,10 @@
             <descriptor>src/main/assembly/assembly.xml</descriptor>
           </descriptors>
           <appendAssemblyId>false</appendAssemblyId>
+          <!-- TIKA-4733: this fat zip (slim jar + lib/) is an Apache dist 
artifact,
+               not a Maven Central one. attach=false keeps it off Central; the
+               apache-release profile stages it to dist straight from target/. 
-->
+          <attach>false</attach>
         </configuration>
         <executions>
           <execution>
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 82be748314..3134d71a61 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -298,17 +298,43 @@ public class TikaCLI {
         }
 
         if (runpack || ! StringUtils.isBlank(tikaConfigPath)) {
-            TikaAsyncCLI.main(args);
+            invokeAsyncCLI(args);
             return;
         }
         if (args.length == 1 &&  args[0].endsWith(".json")) {
-            TikaAsyncCLI.main(args);
+            invokeAsyncCLI(args);
             return;
         }
         // For batch mode (two directories), pass directly to TikaAsyncCLI.
         // It will create its own config with PluginsWriter that includes
         // plugin-roots, fetcher, emitter, and pipes-iterator configuration.
-        TikaAsyncCLI.main(args);
+        invokeAsyncCLI(args);
+    }
+
+    /**
+     * Invokes the batch/async processor ({@code tika-async-cli}). The async
+     * processor and the parsers it forks live in the {@code lib/} directory of
+     * the tika-app distribution rather than inside the bare {@code 
tika-app.jar}.
+     * If tika-app is run as a standalone jar (without the surrounding unzipped
+     * distribution), the supporting classes are missing from the classpath and
+     * the JVM throws {@link NoClassDefFoundError}. Translate that into an
+     * actionable message rather than letting the raw error escape.
+     *
+     * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-4733">TIKA-4733</a>
+     */
+    private static void invokeAsyncCLI(String[] args) throws Exception {
+        try {
+            TikaAsyncCLI.main(args);
+        } catch (NoClassDefFoundError e) {
+            System.err.println("Error: could not load the Tika batch/async 
processor (" +
+                    e.getMessage() + ").");
+            System.err.println("Batch mode requires the full tika-app 
distribution, not the "
+                    + "standalone jar.");
+            System.err.println("Download tika-app-<version>.zip, unzip it, and 
run "
+                    + "tika-app-<version>.jar from inside the unzipped 
directory so that the "
+                    + "adjacent 'lib/' and 'plugins/' directories are on the 
classpath.");
+            System.exit(1);
+        }
     }
 
     /**
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 4b97f249c1..52cac005d1 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -86,6 +86,10 @@
             <descriptor>src/main/assembly/assembly.xml</descriptor>
           </descriptors>
           <appendAssemblyId>false</appendAssemblyId>
+          <!-- TIKA-4733: this fat zip (slim jar + lib/) is an Apache dist 
artifact,
+               not a Maven Central one. attach=false keeps it off Central; the
+               apache-release profile stages it to dist straight from target/. 
-->
+          <attach>false</attach>
         </configuration>
         <executions>
           <execution>
diff --git a/tika-server/tika-server-standard/pom.xml 
b/tika-server/tika-server-standard/pom.xml
index 13f3c18434..e343f90bf1 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -180,6 +180,13 @@
           <descriptors>
             <descriptor>src/main/assembly/assembly.xml</descriptor>
           </descriptors>
+          <!-- TIKA-4733: the -bin.zip full distribution is an Apache dist
+               artifact, not a Maven Central one. attach=false keeps it off
+               Central; the apache-release profile stages it to dist straight
+               from target/. Sibling modules (tika-e2e-tests/tika-server) that
+               consume tika-server-standard:bin:zip as a Maven dep are 
satisfied
+               by the install-file execution below. -->
+          <attach>false</attach>
         </configuration>
         <executions>
           <execution>
@@ -191,6 +198,36 @@
           </execution>
         </executions>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-install-plugin</artifactId>
+        <executions>
+          <!--
+            With <attach>false</attach> on the assembly above (TIKA-4733) the
+            -bin.zip is not part of the project artifact set and so is neither
+            deployed to Central nor installed locally. Sibling reactor modules
+            declare tika-server-standard:bin:zip as a Maven dep, so install it
+            into the local repo at its canonical coordinates; once the install
+            phase has run they resolve it there, and it is never sent to Central.
+          -->
+          <execution>
+            <id>install-server-bin-zip-locally</id>
+            <phase>install</phase>
+            <goals>
+              <goal>install-file</goal>
+            </goals>
+            <configuration>
+              
<file>${project.build.directory}/tika-server-standard-${project.version}-bin.zip</file>
+              <groupId>${project.groupId}</groupId>
+              <artifactId>${project.artifactId}</artifactId>
+              <version>${project.version}</version>
+              <classifier>bin</classifier>
+              <packaging>zip</packaging>
+              <generatePom>false</generatePom>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
   </build>
 
diff --git a/tika-server/tika-server-standard/src/main/assembly/assembly.xml 
b/tika-server/tika-server-standard/src/main/assembly/assembly.xml
index ad210ebbd2..707675b01d 100644
--- a/tika-server/tika-server-standard/src/main/assembly/assembly.xml
+++ b/tika-server/tika-server-standard/src/main/assembly/assembly.xml
@@ -20,8 +20,9 @@
   <id>bin</id>
   <baseDirectory>${project.build.finalName}-bin</baseDirectory>
   <includeBaseDirectory>false</includeBaseDirectory>
+  <!-- TIKA-4733: 4.x ships the full distribution as .zip only (universally
+       readable); the redundant .tgz of identical contents was dropped. -->
   <formats>
-    <format>tgz</format>
     <format>zip</format>
   </formats>
   <dependencySets>

Reply via email to