This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4755-enable-extras-jars in repository https://gitbox.apache.org/repos/asf/tika.git
commit c377695ec28a164a6197b9b156e169e1d8a5deaf Author: tallison <[email protected]> AuthorDate: Sat Jun 6 16:30:06 2026 -0400 TIKA-4755 - extra jars --- CHANGES.txt | 9 ++ docs/modules/ROOT/pages/configuration/index.adoc | 27 +++++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 + .../java/org/apache/tika/config/TikaExtras.java | 123 +++++++++++++++++++++ .../org/apache/tika/config/TikaExtrasTest.java | 100 +++++++++++++++++ .../tika/pipes/core/PerClientServerManager.java | 6 + .../tika/pipes/core/SharedServerManager.java | 6 + tika-server/docker-build/README.md | 6 + 8 files changed, 279 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index 31c3876a0e..e827e1fab4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -9,6 +9,15 @@ Release 4.0.0-beta-1 - unreleased -x/--xml, the server /tika/xml and /rmeta/xml paths (or the X-Tika-Handler header), and the async CLI --handler x (TIKA-4663). + NEW FEATURES + + * tika-app and tika-server can load extra jars (additional EncodingDetectors, + Parsers, etc.) from the directory named by the -Dtika.extras.dir system + property, without repackaging the application. Off by default; the directory + is a trusted code location whose contents run with full process privileges. + The extra jars are also forwarded onto forked pipes/server worker processes, + so they are available where parsing actually happens (TIKA-4755). + Release 4.0.0-alpha-1 - 5/4/2026 diff --git a/docs/modules/ROOT/pages/configuration/index.adoc b/docs/modules/ROOT/pages/configuration/index.adoc index 37a176e0a3..3d96d7c5dd 100644 --- a/docs/modules/ROOT/pages/configuration/index.adoc +++ b/docs/modules/ROOT/pages/configuration/index.adoc @@ -28,6 +28,33 @@ content handlers, server behavior, and the Tika Pipes pipeline. NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the xref:migration-to-4x/index.adoc[Migration Guide] for details on converting to JSON. +== Adding extra jars (`tika.extras.dir`) + +To add extra components — additional ``EncodingDetector``s, ``Parser``s, or their +dependencies — without repackaging the application, drop their jars in a directory +and point the `tika.extras.dir` system property at it: + +[source,bash] +---- +java -Dtika.extras.dir=/path/to/extras -jar tika-app.jar ... +java -Dtika.extras.dir=/path/to/extras -jar tika-server-standard.jar ... +---- + +Every `*.jar` in that directory is added to the classpath that Tika's +service-loading scans, so SPI-registered components in those jars are picked up +automatically. The jars are also forwarded onto forked Pipes/server worker +processes, so they are available where parsing happens. + +This is *off by default* — nothing is loaded unless `tika.extras.dir` is set, and +the working directory is never used. Treat the directory as a trusted code +location: anything in it runs with the full privileges of the Tika process, so it +must not be writable by less-trusted principals (and, for a server, must not be +reachable by request handling). + +NOTE: In the Docker images you can instead mount a directory to `/tika-extras`. +PF4J-managed plugins are a separate mechanism — see +xref:pipes/plugins/index.adoc[Pipes Plugins]. + == Top-level JSON structure A `tika-config.json` is a single JSON object whose keys are the top-level sections diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 5bea9f0da6..abbe4c280f 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -69,6 +69,7 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.Tika; import org.apache.tika.async.cli.TikaAsyncCLI; import org.apache.tika.config.EmbeddedLimits; +import org.apache.tika.config.TikaExtras; import org.apache.tika.config.TimeoutLimits; import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.config.loader.TikaLoader; @@ -256,6 +257,7 @@ public class TikaCLI { } public static void main(String[] args) throws Exception { + TikaExtras.install(); TikaCLI cli = new TikaCLI(); if (cli.testForHelp(args)) { diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java b/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java new file mode 100644 index 0000000000..9e9be00d2a --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/config/TikaExtras.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import java.net.URL; +import java.net.URLClassLoader; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Opt-in mechanism for adding user-supplied "extras" jars (extra + * {@code EncodingDetector}s, {@code Parser}s, etc.) to Tika's SPI discovery + * without repackaging the application. + * + * <p><b>Off by default.</b> Nothing is loaded unless the + * {@value #EXTRAS_DIR_PROPERTY} system property points at a directory; then every + * {@code *.jar} in it is made visible to service-loading. There is no implicit + * directory, and the working directory is never used. + * + * <p><b>Security:</b> this is a trusted code directory — anything in it runs with + * the full privileges of the Tika process. Treat write access to it exactly like + * write access to {@code lib/}; it must not be writable by less-trusted principals + * (for servers, not reachable by request handling). Being opt-in keeps "we are + * now loading extra code" an explicit, auditable choice. + */ +public final class TikaExtras { + + /** System property naming the extras directory. Unset = feature off. */ + public static final String EXTRAS_DIR_PROPERTY = "tika.extras.dir"; + + private static final Logger LOG = LoggerFactory.getLogger(TikaExtras.class); + + private TikaExtras() { + } + + /** + * If {@value #EXTRAS_DIR_PROPERTY} is set, installs a classloader over the + * {@code *.jar} files in that directory as the thread + Tika + * {@link ServiceLoader} context classloader, so they join SPI discovery. + * No-op (returns {@code null}) when the property is unset or the directory is + * missing/empty. Call once, before any Tika component is loaded. + * + * @return the installed classloader, or {@code null} if extras are off/empty + */ + public static ClassLoader install() { + List<Path> jars = extraJars(); + if (jars.isEmpty()) { + return null; + } + List<URL> urls = new ArrayList<>(jars.size()); + for (Path jar : jars) { + try { + urls.add(jar.toUri().toURL()); + } catch (Exception e) { + LOG.warn("Skipping extra jar {}: {}", jar, e.toString()); + } + } + if (urls.isEmpty()) { + return null; + } + ClassLoader parent = Thread.currentThread().getContextClassLoader(); + if (parent == null) { + parent = TikaExtras.class.getClassLoader(); + } + URLClassLoader cl = new URLClassLoader(urls.toArray(new URL[0]), parent); + Thread.currentThread().setContextClassLoader(cl); + ServiceLoader.setContextClassLoader(cl); + LOG.info("{}: loaded {} extra jar(s): {}", EXTRAS_DIR_PROPERTY, urls.size(), jars); + return cl; + } + + /** + * The {@code *.jar} files in the {@value #EXTRAS_DIR_PROPERTY} directory — for + * callers that extend a forked process's classpath rather than installing a + * classloader. Empty when the property is unset or the directory is + * missing/has no jars. + */ + public static List<Path> extraJars() { + Path dir = extrasDir(); + if (dir == null || !Files.isDirectory(dir)) { + return Collections.emptyList(); + } + List<Path> jars = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*.jar")) { + for (Path jar : stream) { + jars.add(jar); + } + } catch (Exception e) { + LOG.warn("Could not scan {}={}: {}", EXTRAS_DIR_PROPERTY, dir, e.toString()); + } + return jars; + } + + /** The configured extras directory, or {@code null} if the feature is off. */ + public static Path extrasDir() { + String prop = System.getProperty(EXTRAS_DIR_PROPERTY); + if (prop == null || prop.isBlank()) { + return null; + } + return Path.of(prop.trim()); + } +} diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java new file mode 100644 index 0000000000..06f3bb1155 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/config/TikaExtrasTest.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.jar.JarEntry; +import java.util.jar.JarOutputStream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TikaExtrasTest { + + private static final String MARKER = "tika-extra-marker.txt"; + + @Test + public void offWhenPropertyUnset() throws Exception { + withProperty(null, () -> { + assertNull(TikaExtras.extrasDir(), "feature must be off with no property"); + assertTrue(TikaExtras.extraJars().isEmpty()); + assertNull(TikaExtras.install()); + }); + } + + @Test + public void emptyWhenDirMissing(@TempDir Path tmp) throws Exception { + Path missing = tmp.resolve("does-not-exist"); + withProperty(missing.toString(), () -> { + assertTrue(TikaExtras.extraJars().isEmpty()); + assertNull(TikaExtras.install()); + }); + } + + @Test + public void loadsJarsWhenPropertySet(@TempDir Path tmp) throws Exception { + Path jar = tmp.resolve("extra.jar"); + try (JarOutputStream jos = new JarOutputStream(Files.newOutputStream(jar))) { + jos.putNextEntry(new JarEntry(MARKER)); + jos.write("hi".getBytes(UTF_8)); + jos.closeEntry(); + } + ClassLoader prevCtx = Thread.currentThread().getContextClassLoader(); + try { + withProperty(tmp.toString(), () -> { + assertEquals(1, TikaExtras.extraJars().size()); + ClassLoader cl = TikaExtras.install(); + assertNotNull(cl, "extras dir with a jar should install a classloader"); + assertNotNull(cl.getResource(MARKER), "the extra jar must be on the classloader"); + assertSame(cl, Thread.currentThread().getContextClassLoader()); + }); + } finally { + Thread.currentThread().setContextClassLoader(prevCtx); + ServiceLoader.setContextClassLoader(prevCtx); + } + } + + private interface Body { + void run() throws Exception; + } + + private static void withProperty(String value, Body body) throws Exception { + String prev = System.getProperty(TikaExtras.EXTRAS_DIR_PROPERTY); + try { + if (value == null) { + System.clearProperty(TikaExtras.EXTRAS_DIR_PROPERTY); + } else { + System.setProperty(TikaExtras.EXTRAS_DIR_PROPERTY, value); + } + body.run(); + } finally { + if (prev == null) { + System.clearProperty(TikaExtras.EXTRAS_DIR_PROPERTY); + } else { + System.setProperty(TikaExtras.EXTRAS_DIR_PROPERTY, prev); + } + } + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java index 50165dce1b..35a9433463 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PerClientServerManager.java @@ -34,6 +34,7 @@ import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaExtras; import org.apache.tika.pipes.core.server.PipesServer; import org.apache.tika.utils.ProcessUtils; @@ -510,6 +511,11 @@ public class PerClientServerManager implements ServerManager { private Path writeArgFile() throws IOException { Path argFile = tmpDir.resolve("jvm-args.txt"); String classpath = System.getProperty("java.class.path"); + // forward any tika.extras.dir jars to the forked PipesServer + for (Path extra : TikaExtras.extraJars()) { + classpath = classpath + System.getProperty("path.separator") + + extra.toAbsolutePath(); + } String normalizedClasspath = classpath.replace("\\", "/"); String content = "-cp\n\"" + normalizedClasspath + "\"\n"; Files.writeString(argFile, content, StandardCharsets.UTF_8); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java index ea18f31b0e..65d654168e 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/SharedServerManager.java @@ -37,6 +37,7 @@ import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.TikaExtras; import org.apache.tika.pipes.core.server.PipesServer; import org.apache.tika.utils.ProcessUtils; @@ -498,6 +499,11 @@ public class SharedServerManager implements ServerManager { private Path writeArgFile() throws IOException { Path argFile = tmpDir.resolve("jvm-args.txt"); String classpath = System.getProperty("java.class.path"); + // forward any tika.extras.dir jars to the forked PipesServer + for (Path extra : TikaExtras.extraJars()) { + classpath = classpath + System.getProperty("path.separator") + + extra.toAbsolutePath(); + } String normalizedClasspath = classpath.replace("\\", "/"); String content = "-cp\n\"" + normalizedClasspath + "\"\n"; Files.writeString(argFile, content, StandardCharsets.UTF_8); diff --git a/tika-server/docker-build/README.md b/tika-server/docker-build/README.md index b7307d5c51..61adad448f 100644 --- a/tika-server/docker-build/README.md +++ b/tika-server/docker-build/README.md @@ -215,6 +215,12 @@ As of 2.5.0.2, if you'd like to add extra jars from your local `my-jars` directo You may want to do this to add optional components, such as the tika-eval metadata filter, or optional dependencies such as jai-imageio-jpeg2000 (check license compatibility first!). +Outside Docker (e.g. `java -jar tika-app.jar` or `tika-server-standard.jar`), point the +`tika.extras.dir` system property at a directory of extra jars instead, e.g. +`java -Dtika.extras.dir=my-jars -jar tika-server-standard.jar`. Either way the extra jars +are also forwarded onto forked worker processes. The directory is a trusted code location +whose contents run with the full privileges of the Tika process. + ### Docker Compose Examples There are a number of sample Docker Compose files included in the repos to allow you to test some different scenarios.
