This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8870bf99b9 TIKA-4755 - extra jars (#2880)
8870bf99b9 is described below
commit 8870bf99b9d1e7466acebefecb887660b7e155f4
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 9 22:24:21 2026 +0200
TIKA-4755 - extra jars (#2880)
---
CHANGES.txt | 9 ++
docs/modules/ROOT/pages/configuration/index.adoc | 28 ++++
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +
.../java/org/apache/tika/config/TikaExtras.java | 167 +++++++++++++++++++++
.../org/apache/tika/config/TikaExtrasTest.java | 138 +++++++++++++++++
.../tika/pipes/core/PerClientServerManager.java | 4 +-
.../tika/pipes/core/SharedServerManager.java | 4 +-
tika-server/docker-build/README.md | 6 +
.../apache/tika/server/core/TikaServerProcess.java | 6 +
9 files changed, 362 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 31c3876a0e..e827e1fab4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,15 @@ Release 4.0.0-beta-1 - unreleased
-x/--xml, the server /tika/xml and /rmeta/xml paths (or the
X-Tika-Handler header), and the async CLI --handler x (TIKA-4663).
+ NEW FEATURES
+
+ * tika-app and tika-server can load extra jars (additional EncodingDetectors,
+ Parsers, etc.) from the directory named by the -Dtika.extras.dir system
+ property, without repackaging the application. Off by default; the directory
+ is a trusted code location whose contents run with full process privileges.
+ The extra jars are also forwarded onto forked pipes/server worker processes,
+ so they are available where parsing actually happens (TIKA-4755).
+
Release 4.0.0-alpha-1 - 5/4/2026
diff --git a/docs/modules/ROOT/pages/configuration/index.adoc
b/docs/modules/ROOT/pages/configuration/index.adoc
index 37a176e0a3..0605793967 100644
--- a/docs/modules/ROOT/pages/configuration/index.adoc
+++ b/docs/modules/ROOT/pages/configuration/index.adoc
@@ -28,6 +28,34 @@ content handlers, server behavior, and the Tika Pipes
pipeline.
NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the
xref:migration-to-4x/index.adoc[Migration Guide] for details on converting to
JSON.
+== Adding extra jars (`tika.extras.dir`)
+
+To add extra components — additional `EncodingDetector` or `Parser`
+implementations, or their dependencies — without repackaging the application,
+drop their jars in a directory and point the `tika.extras.dir` system property
+at it:
+
+[source,bash]
+----
+java -Dtika.extras.dir=/path/to/extras -jar tika-app.jar ...
+java -Dtika.extras.dir=/path/to/extras -jar tika-server-standard.jar ...
+----
+
+Every `*.jar` in that directory is added to the classpath that Tika's
+service-loading scans, so SPI-registered components in those jars are picked up
+automatically. The jars are also forwarded onto forked Pipes/server worker
+processes, so they are available where parsing happens.
+
+This is *off by default* — nothing is loaded unless `tika.extras.dir` is set, and
+there is no implicit default directory (Tika does not scan the working directory automatically). Treat the directory as a trusted code
+location: anything in it runs with the full privileges of the Tika process, so it
+must not be writable by less-trusted principals (and, for a server, must not be
+reachable by request handling).
+
+NOTE: In the Docker images you can instead mount a directory to `/tika-extras`.
+PF4J-managed plugins are a separate mechanism — see
+xref:pipes/plugins/index.adoc[Pipes Plugins].
+
== Top-level JSON structure
A `tika-config.json` is a single JSON object whose keys are the top-level
sections
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 5bea9f0da6..abbe4c280f 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -69,6 +69,7 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.Tika;
import org.apache.tika.async.cli.TikaAsyncCLI;
import org.apache.tika.config.EmbeddedLimits;
+import org.apache.tika.config.TikaExtras;
import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.config.loader.ComponentRegistry;
import org.apache.tika.config.loader.TikaLoader;
@@ -256,6 +257,7 @@ public class TikaCLI {
}
public static void main(String[] args) throws Exception {
+ TikaExtras.install();
TikaCLI cli = new TikaCLI();
if (cli.testForHelp(args)) {
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java
b/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java
new file mode 100644
index 0000000000..1a702c9ef7
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import java.io.File;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Opt-in mechanism for adding user-supplied "extras" jars (extra
+ * {@code EncodingDetector}s, {@code Parser}s, etc.) to Tika's SPI discovery
+ * without repackaging the application.
+ *
+ * <p><b>Off by default.</b> Nothing is loaded unless the
+ * {@value #EXTRAS_DIR_PROPERTY} system property points at a directory; then every
+ * {@code *.jar} in it is made visible to service-loading. There is no implicit or
+ * default directory — the feature is off unless the property is set. (A relative
+ * property value is resolved against the process working directory, like any path.)
+ *
+ * <p><b>Security:</b> this is a trusted code directory — anything in it runs with
+ * the full privileges of the Tika process. Treat write access to it exactly like
+ * write access to {@code lib/}; it must not be writable by less-trusted principals
+ * (for servers, not reachable by request handling). Being opt-in keeps "we are
+ * now loading extra code" an explicit, auditable choice.
+ */
+public final class TikaExtras {
+
+ /** System property naming the extras directory. Unset = feature off. */
+ public static final String EXTRAS_DIR_PROPERTY = "tika.extras.dir";
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TikaExtras.class);
+
+ private TikaExtras() {
+ }
+
+ /**
+ * If {@value #EXTRAS_DIR_PROPERTY} is set, installs a classloader over the
+ * {@code *.jar} files in that directory as the thread + Tika
+ * {@link ServiceLoader} context classloader, so they join SPI discovery.
+ * No-op (returns {@code null}) when the property is unset or the directory is
+ * missing/empty. Call exactly once at startup, before any Tika component is
+ * loaded: each call builds a new classloader, so repeated calls stack them and
+ * leave the earlier ones' open jar handles dangling.
+ *
+ * @return the installed classloader, or {@code null} if extras are off/empty
+ */
+ public static ClassLoader install() {
+ List<Path> jars = extraJars();
+ if (jars.isEmpty()) {
+ return null;
+ }
+ List<URL> urls = new ArrayList<>(jars.size());
+ List<Path> loaded = new ArrayList<>(jars.size());
+ for (Path jar : jars) {
+ try {
+ urls.add(jar.toUri().toURL());
+ loaded.add(jar);
+ } catch (Exception e) {
+ LOG.warn("Skipping extra jar {}: {}", jar, e.toString());
+ }
+ }
+ if (urls.isEmpty()) {
+ return null;
+ }
+ ClassLoader parent = Thread.currentThread().getContextClassLoader();
+ if (parent == null) {
+ parent = TikaExtras.class.getClassLoader();
+ }
+ URLClassLoader cl = new URLClassLoader(urls.toArray(new URL[0]),
parent);
+ Thread.currentThread().setContextClassLoader(cl);
+ ServiceLoader.setContextClassLoader(cl);
+ LOG.info("{}: loaded {} extra jar(s): {}", EXTRAS_DIR_PROPERTY,
loaded.size(), loaded);
+ return cl;
+ }
+
+ /**
+ * The {@code *.jar} files in the {@value #EXTRAS_DIR_PROPERTY} directory
— for
+ * callers that extend a forked process's classpath rather than installing
a
+ * classloader. Empty when the property is unset or the directory is
+ * missing/has no jars.
+ */
+ public static List<Path> extraJars() {
+ Path dir = extrasDir();
+ if (dir == null || !Files.isDirectory(dir)) {
+ return Collections.emptyList();
+ }
+ List<Path> jars = new ArrayList<>();
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir,
"*.jar")) {
+ for (Path jar : stream) {
+ jars.add(jar);
+ }
+ } catch (Exception e) {
+ LOG.warn("Could not scan {}={}: {}", EXTRAS_DIR_PROPERTY, dir,
e.toString());
+ }
+ // Sort by file name so jar load order (classloader URL order /
forked-child
+ // classpath order, hence SPI precedence) is deterministic across
platforms
+ // and filesystems rather than depending on directory iteration order.
+ jars.sort(Comparator.comparing(jar -> jar.getFileName().toString()));
+ return jars;
+ }
+
+ /**
+ * Appends the {@link #extraJars()} (as absolute paths, joined with the
+ * platform path separator) to the given classpath string — for extending a
+ * forked process's {@code -cp} with the extras jars. Returns {@code
classpath}
+ * unchanged when the feature is off or the directory has no jars.
+ *
+ * @param classpath the base classpath to extend
+ * @return the classpath with any extras jars appended
+ */
+ public static String appendJarsToClasspath(String classpath) {
+ List<Path> jars = extraJars();
+ if (jars.isEmpty()) {
+ return classpath;
+ }
+ String separator = File.pathSeparator;
+ StringBuilder sb = new StringBuilder();
+ if (classpath != null && !classpath.isEmpty()) {
+ sb.append(classpath);
+ }
+ for (Path jar : jars) {
+ if (sb.length() > 0) {
+ sb.append(separator);
+ }
+ sb.append(jar.toAbsolutePath());
+ }
+ return sb.toString();
+ }
+
+ /** The configured extras directory, or {@code null} if the feature is
off. */
+ public static Path extrasDir() {
+ String prop = System.getProperty(EXTRAS_DIR_PROPERTY);
+ if (prop == null || prop.isBlank()) {
+ return null;
+ }
+ try {
+ return Path.of(prop.trim());
+ } catch (java.nio.file.InvalidPathException e) {
+ LOG.warn("Ignoring invalid {}: {}", EXTRAS_DIR_PROPERTY,
e.getMessage());
+ return null;
+ }
+ }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java
b/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java
new file mode 100644
index 0000000000..f57e5c4d64
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.net.URLClassLoader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.jar.JarEntry;
+import java.util.jar.JarOutputStream;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+public class TikaExtrasTest {
+
+ private static final String MARKER = "tika-extra-marker.txt";
+
+ @Test
+ public void offWhenPropertyUnset() throws Exception {
+ withProperty(null, () -> {
+ assertNull(TikaExtras.extrasDir(), "feature must be off with no
property");
+ assertTrue(TikaExtras.extraJars().isEmpty());
+ assertNull(TikaExtras.install());
+ });
+ }
+
+ @Test
+ public void emptyWhenDirMissing(@TempDir Path tmp) throws Exception {
+ Path missing = tmp.resolve("does-not-exist");
+ withProperty(missing.toString(), () -> {
+ assertTrue(TikaExtras.extraJars().isEmpty());
+ assertNull(TikaExtras.install());
+ });
+ }
+
+ @Test
+ public void loadsJarsWhenPropertySet(@TempDir Path tmp) throws Exception {
+ Path jar = tmp.resolve("extra.jar");
+ try (JarOutputStream jos = new
JarOutputStream(Files.newOutputStream(jar))) {
+ jos.putNextEntry(new JarEntry(MARKER));
+ jos.write("hi".getBytes(UTF_8));
+ jos.closeEntry();
+ }
+ ClassLoader prevCtx = Thread.currentThread().getContextClassLoader();
+ ClassLoader[] installed = new ClassLoader[1];
+ try {
+ withProperty(tmp.toString(), () -> {
+ assertEquals(1, TikaExtras.extraJars().size());
+ ClassLoader cl = TikaExtras.install();
+ installed[0] = cl;
+ assertNotNull(cl, "extras dir with a jar should install a
classloader");
+ assertNotNull(cl.getResource(MARKER), "the extra jar must be
on the classloader");
+ assertSame(cl, Thread.currentThread().getContextClassLoader());
+ });
+ } finally {
+ Thread.currentThread().setContextClassLoader(prevCtx);
+ // ServiceLoader.CONTEXT_CLASS_LOADER defaults to null and this is
the
+ // only test that sets it; reset to null rather than to the
thread's
+ // context loader so we don't leak global state into later tests.
+ ServiceLoader.setContextClassLoader(null);
+ // Close the URLClassLoader so it releases its handle on extra.jar
before
+ // @TempDir cleanup runs; otherwise the delete fails on Windows,
where an
+ // open file cannot be removed.
+ if (installed[0] instanceof URLClassLoader) {
+ ((URLClassLoader) installed[0]).close();
+ }
+ }
+ }
+
+ @Test
+ public void appendJarsToClasspathOffReturnsInput() throws Exception {
+ withProperty(null, () -> {
+ assertNull(TikaExtras.appendJarsToClasspath(null));
+ assertEquals("base", TikaExtras.appendJarsToClasspath("base"));
+ });
+ }
+
+ @Test
+ public void appendJarsToClasspathNoLeadingSeparator(@TempDir Path tmp)
throws Exception {
+ Path jar = tmp.resolve("extra.jar");
+ try (JarOutputStream jos = new
JarOutputStream(Files.newOutputStream(jar))) {
+ jos.putNextEntry(new JarEntry(MARKER));
+ jos.closeEntry();
+ }
+ String sep = System.getProperty("path.separator");
+ String abs = jar.toAbsolutePath().toString();
+ withProperty(tmp.toString(), () -> {
+ // null/empty base must not produce a leading separator
+ assertEquals(abs, TikaExtras.appendJarsToClasspath(null));
+ assertEquals(abs, TikaExtras.appendJarsToClasspath(""));
+ // non-empty base gets the separator between it and the jar
+ assertEquals("base" + sep + abs,
TikaExtras.appendJarsToClasspath("base"));
+ });
+ }
+
+ private interface Body {
+ void run() throws Exception;
+ }
+
+ private static void withProperty(String value, Body body) throws Exception
{
+ String prev = System.getProperty(TikaExtras.EXTRAS_DIR_PROPERTY);
+ try {
+ if (value == null) {
+ System.clearProperty(TikaExtras.EXTRAS_DIR_PROPERTY);
+ } else {
+ System.setProperty(TikaExtras.EXTRAS_DIR_PROPERTY, value);
+ }
+ body.run();
+ } finally {
+ if (prev == null) {
+ System.clearProperty(TikaExtras.EXTRAS_DIR_PROPERTY);
+ } else {
+ System.setProperty(TikaExtras.EXTRAS_DIR_PROPERTY, prev);
+ }
+ }
+ }
+}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java
index 50165dce1b..fdc661d83b 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java
@@ -34,6 +34,7 @@ import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.tika.config.TikaExtras;
import org.apache.tika.pipes.core.server.PipesServer;
import org.apache.tika.utils.ProcessUtils;
@@ -509,7 +510,8 @@ public class PerClientServerManager implements
ServerManager {
private Path writeArgFile() throws IOException {
Path argFile = tmpDir.resolve("jvm-args.txt");
- String classpath = System.getProperty("java.class.path");
+ // forward any tika.extras.dir jars to the forked PipesServer
+ String classpath =
TikaExtras.appendJarsToClasspath(System.getProperty("java.class.path"));
String normalizedClasspath = classpath.replace("\\", "/");
String content = "-cp\n\"" + normalizedClasspath + "\"\n";
Files.writeString(argFile, content, StandardCharsets.UTF_8);
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java
index ea18f31b0e..40b24cfb5d 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java
@@ -37,6 +37,7 @@ import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.tika.config.TikaExtras;
import org.apache.tika.pipes.core.server.PipesServer;
import org.apache.tika.utils.ProcessUtils;
@@ -497,7 +498,8 @@ public class SharedServerManager implements ServerManager {
private Path writeArgFile() throws IOException {
Path argFile = tmpDir.resolve("jvm-args.txt");
- String classpath = System.getProperty("java.class.path");
+ // forward any tika.extras.dir jars to the forked PipesServer
+ String classpath =
TikaExtras.appendJarsToClasspath(System.getProperty("java.class.path"));
String normalizedClasspath = classpath.replace("\\", "/");
String content = "-cp\n\"" + normalizedClasspath + "\"\n";
Files.writeString(argFile, content, StandardCharsets.UTF_8);
diff --git a/tika-server/docker-build/README.md
b/tika-server/docker-build/README.md
index b7307d5c51..61adad448f 100644
--- a/tika-server/docker-build/README.md
+++ b/tika-server/docker-build/README.md
@@ -215,6 +215,12 @@ As of 2.5.0.2, if you'd like to add extra jars from your
local `my-jars` directo
You may want to do this to add optional components, such as the tika-eval
metadata filter, or optional
dependencies such as jai-imageio-jpeg2000 (check license compatibility first!).
+Outside Docker (e.g. `java -jar tika-app.jar` or `tika-server-standard.jar`), point the
+`tika.extras.dir` system property at a directory of extra jars instead, e.g.
+`java -Dtika.extras.dir=my-jars -jar tika-server-standard.jar`. Either way the extra jars
+are also forwarded onto forked worker processes. The directory is a trusted code location
+whose contents run with the full privileges of the Tika process.
+
### Docker Compose Examples
There are a number of sample Docker Compose files included in the repos to
allow you to test some different scenarios.
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index 0a31493e25..1d1cddfacb 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -60,6 +60,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.Tika;
import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.config.TikaExtras;
import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaException;
@@ -120,6 +121,11 @@ public class TikaServerProcess {
}
public static void main(String[] args) throws Exception {
+ // Install any tika.extras.dir jars before loading components, so the
+ // server's in-process SPI discovery sees them too (forked PipesServer
+ // children get them via the classpath in PerClientServerManager /
+ // SharedServerManager writeArgFile).
+ TikaExtras.install();
LOG.info("Starting {} server", Tika.getString());
try {
Options options = getOptions();