This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4571 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8a6018d3010873663f3a6d495d9b917e24a4d2d1 Author: tallison <[email protected]> AuthorDate: Mon Dec 15 14:51:37 2025 -0500 TIKA-4571 -- add a replacement for ForkParser (and fix a rat test in tika-serialization :/) --- tika-pipes/pom.xml | 1 + .../org/apache/tika/pipes/api/fetcher/Fetcher.java | 18 +- tika-pipes/tika-pipes-fork-parser/pom.xml | 158 ++++++++ .../src/main/assembly/assembly.xml | 51 +++ .../apache/tika/pipes/fork/PipesForkParser.java | 326 +++++++++++++++ .../tika/pipes/fork/PipesForkParserConfig.java | 263 +++++++++++++ .../tika/pipes/fork/PipesForkParserException.java | 102 +++++ .../apache/tika/pipes/fork/PipesForkResult.java | 151 +++++++ .../tika/pipes/fork/PipesForkParserTest.java | 435 +++++++++++++++++++++ .../tika/pipes/fetcher/fs/FileSystemFetcher.java | 89 ++--- .../pipes/fetcher/fs/FileSystemFetcherConfig.java | 17 +- .../fetcher/fs/FileSystemFetcherRuntimeConfig.java | 54 --- .../fs/FileSystemFetcherRuntimeConfigTest.java | 184 --------- .../pipes/fetcher/fs/FileSystemFetcherTest.java | 115 +++++- tika-serialization/pom.xml | 9 + 15 files changed, 1662 insertions(+), 311 deletions(-) diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index 40ed5bbbf..e8366313d 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -36,6 +36,7 @@ <module>tika-pipes-reporter-commons</module> <module>tika-pipes-iterator-commons</module> <module>tika-pipes-plugins</module> + <module>tika-pipes-fork-parser</module> <module>tika-async-cli</module> <module>tika-pipes-integration-tests</module> </modules> diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java index d281130f1..1e49488d9 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java @@ -35,5 +35,21 @@ import 
org.apache.tika.plugins.TikaExtension; */ public interface Fetcher extends TikaExtension, ExtensionPoint { - TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; + /** + * Fetches a resource and returns it as a TikaInputStream. + * + * @param fetchKey the key identifying the resource to fetch (interpretation + * depends on the implementation, e.g., file path, URL, S3 key) + * @param metadata metadata object to be updated with resource information + * @param parseContext the parse context + * @return a TikaInputStream for reading the resource content + * @throws TikaException if a Tika-specific error occurs during fetching + * @throws IOException if an I/O error occurs during fetching + * @throws SecurityException if the fetchKey attempts to access a resource + * outside permitted boundaries (e.g., path traversal attack) + * @throws IllegalArgumentException if the fetchKey contains invalid characters + * (e.g., null bytes) + */ + TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) + throws TikaException, IOException; } diff --git a/tika-pipes/tika-pipes-fork-parser/pom.xml b/tika-pipes/tika-pipes-fork-parser/pom.xml new file mode 100644 index 000000000..712aba51b --- /dev/null +++ b/tika-pipes/tika-pipes-fork-parser/pom.xml @@ -0,0 +1,158 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes</artifactId> + <version>4.0.0-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>tika-pipes-fork-parser</artifactId> + + <name>Apache Tika pipes fork parser</name> + <description>A ForkParser implementation backed by PipesClient for parsing in forked JVM processes</description> + <url>https://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-api</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-file-system</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parsers-standard-package</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + 
<version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-pipes-file-system</artifactId> + <version>${project.version}</version> + <scope>test</scope> + <type>zip</type> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-api</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-engine</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <configuration> + <archive> + <manifestEntries> + <Automatic-Module-Name>org.apache.tika.pipes.fork</Automatic-Module-Name> + </manifestEntries> + </archive> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <executions> + <execution> + <id>copy-plugins</id> + <phase>process-test-resources</phase> + <goals> + <goal>copy</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/plugins</outputDirectory> + <artifactItems> + <artifactItem> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-file-system</artifactId> + <version>${project.version}</version> + <type>zip</type> + <overWrite>true</overWrite> + </artifactItem> + </artifactItems> + </configuration> + </execution> + <execution> + <id>copy-dependencies</id> + <phase>package</phase> + <goals> + <goal>copy-dependencies</goal> + </goals> + <configuration> + <outputDirectory>${project.build.directory}/lib</outputDirectory> + <includeScope>runtime</includeScope> + <stripVersion>false</stripVersion> + <overWriteReleases>false</overWriteReleases> + <overWriteSnapshots>false</overWriteSnapshots> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + 
<artifactId>maven-assembly-plugin</artifactId> + <configuration> + <descriptors> + <descriptor>src/main/assembly/assembly.xml</descriptor> + </descriptors> + <appendAssemblyId>false</appendAssemblyId> + </configuration> + <executions> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/assembly/assembly.xml b/tika-pipes/tika-pipes-fork-parser/src/main/assembly/assembly.xml new file mode 100644 index 000000000..37c48d403 --- /dev/null +++ b/tika-pipes/tika-pipes-fork-parser/src/main/assembly/assembly.xml @@ -0,0 +1,51 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.1" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.1 http://maven.apache.org/xsd/assembly-2.1.1.xsd"> + <id>bin</id> + <formats> + <format>zip</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + + <dependencySets> + <dependencySet> + <outputDirectory>lib</outputDirectory> + <useProjectArtifact>false</useProjectArtifact> + <unpack>false</unpack> + <scope>runtime</scope> + </dependencySet> + </dependencySets> + <fileSets> + <fileSet> + <directory>${project.build.directory}</directory> + <outputDirectory>/</outputDirectory> + <includes> + <include>*.jar</include> + </includes> + <excludes> + <exclude>*-sources.jar</exclude> + <exclude>*-javadoc.jar</exclude> + </excludes> + </fileSet> + <fileSet> + <directory>${project.build.directory}/plugins</directory> + <outputDirectory>plugins</outputDirectory> + </fileSet> + </fileSets> +</assembly> diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java new file mode 100644 index 000000000..1ccba9976 --- /dev/null +++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParser.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fork; + +import java.io.Closeable; +import java.io.IOException; +import java.io.StringWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.api.PipesResult; +import org.apache.tika.pipes.api.emitter.EmitKey; +import org.apache.tika.pipes.api.fetcher.FetchKey; +import org.apache.tika.pipes.core.PipesConfig; +import org.apache.tika.pipes.core.PipesException; +import org.apache.tika.pipes.core.PipesParser; + +/** + * A ForkParser implementation backed by {@link PipesParser}. + * <p> + * This parser runs parsing in forked JVM processes, providing isolation from + * crashes, memory leaks, and other issues that can occur during parsing. + * Multiple forked processes can be used for concurrent parsing. + * <p> + * Unlike the legacy ForkParser which streams SAX events between processes, + * this implementation uses the pipes infrastructure and returns parsed content + * in the metadata (via {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT}). + * <p> + * <strong>Thread Safety:</strong> This class is thread-safe. 
Multiple threads can + * call {@link #parse} concurrently, and requests will be distributed across the + * pool of forked processes. + * <p> + * <strong>Error Handling:</strong> + * <ul> + * <li>Application errors (initialization failures, config errors) throw + * {@link PipesForkParserException}</li> + * <li>Process crashes (OOM, timeout) are returned in the result - the next + * parse will automatically restart the forked process</li> + * <li>Per-document errors (fetch/parse exceptions) are returned in the result</li> + * </ul> + * <p> + * Example usage: + * <pre> + * PipesForkParserConfig config = new PipesForkParserConfig(); + * config.setHandlerConfig(new HandlerConfig(HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true)); + * + * try (PipesForkParser parser = new PipesForkParser(config)) { + * PipesForkResult result = parser.parse(Paths.get("/path/to/file.pdf")); + * for (Metadata m : result.getMetadataList()) { + * String content = m.get(TikaCoreProperties.TIKA_CONTENT); + * // process content and metadata + * } + * } + * </pre> + */ +public class PipesForkParser implements Closeable { + + public static final String DEFAULT_FETCHER_NAME = "fs"; + + private final PipesForkParserConfig config; + private final PipesParser pipesParser; + private final Path tikaConfigPath; + + /** + * Creates a new PipesForkParser with default configuration. + * + * @throws IOException if the temporary config file cannot be created + */ + public PipesForkParser() throws IOException { + this(new PipesForkParserConfig()); + } + + /** + * Creates a new PipesForkParser with the specified configuration. 
+ * + * @param config the configuration for this parser + * @throws IOException if the temporary config file cannot be created + */ + public PipesForkParser(PipesForkParserConfig config) throws IOException { + this.config = config; + this.tikaConfigPath = createTikaConfigFile(); + this.pipesParser = new PipesParser(config.getPipesConfig(), tikaConfigPath); + } + + /** + * Parse a file in a forked JVM process. + * + * @param path the path to the file to parse + * @return the parse result containing metadata and content + * @throws IOException if an I/O error occurs + * @throws InterruptedException if the parsing is interrupted + * @throws PipesException if a pipes infrastructure error occurs + * @throws PipesForkParserException if an application error occurs (initialization + * failure or configuration error) + */ + public PipesForkResult parse(Path path) + throws IOException, InterruptedException, PipesException, TikaException { + return parse(path, new Metadata(), new ParseContext()); + } + + /** + * Parse a file in a forked JVM process with the specified metadata. + * + * @param path the path to the file to parse + * @param metadata initial metadata (e.g., content type hint) + * @return the parse result containing metadata and content + * @throws IOException if an I/O error occurs + * @throws InterruptedException if the parsing is interrupted + * @throws PipesException if a pipes infrastructure error occurs + * @throws PipesForkParserException if an application error occurs (initialization + * failure or configuration error) + */ + public PipesForkResult parse(Path path, Metadata metadata) + throws IOException, InterruptedException, PipesException, TikaException { + return parse(path, metadata, new ParseContext()); + } + + /** + * Parse a file in a forked JVM process with the specified metadata and parse context. 
+ * + * @param path the path to the file to parse + * @param metadata initial metadata (e.g., content type hint) + * @param parseContext the parse context + * @return the parse result containing metadata and content + * @throws IOException if an I/O error occurs + * @throws InterruptedException if the parsing is interrupted + * @throws PipesException if a pipes infrastructure error occurs + * @throws PipesForkParserException if an application error occurs (initialization + * failure or configuration error) + */ + public PipesForkResult parse(Path path, Metadata metadata, ParseContext parseContext) + throws IOException, InterruptedException, PipesException, TikaException { + + String absolutePath = path.toAbsolutePath().toString(); + String id = absolutePath; + + FetchKey fetchKey = new FetchKey(config.getFetcherName(), absolutePath); + EmitKey emitKey = new EmitKey("", id); // Empty emitter name since we're using PASSBACK_ALL + + // Add handler config to parse context so server knows how to handle content + parseContext.set(HandlerConfig.class, config.getHandlerConfig()); + + FetchEmitTuple tuple = new FetchEmitTuple(id, fetchKey, emitKey, metadata, parseContext); + + PipesResult result = pipesParser.parse(tuple); + + // Check for application errors and throw if necessary + // Process crashes are NOT thrown - the next parse will restart the process + checkForApplicationError(result); + + return new PipesForkResult(result); + } + + /** + * Checks if the result represents an application error and throws an exception if so. + * <p> + * Application errors that cause exceptions: + * <ul> + * <li>Initialization failures (parser, fetcher, or emitter)</li> + * <li>Configuration errors (fetcher or emitter not found)</li> + * <li>Client unavailable within timeout</li> + * </ul> + * <p> + * Process crashes (OOM, timeout, unspecified crash) are NOT thrown as exceptions. + * The forked process will be automatically restarted on the next parse call. 
+ * Check {@link PipesForkResult#isProcessCrash()} to detect these cases. + * <p> + * Per-document errors (fetch exception, parse exception) are also NOT thrown. + * These are returned in the result so the caller can handle them appropriately + * (e.g., log and continue with the next file). + * + * @param result the pipes result to check + * @throws PipesForkParserException if the result represents an application error + */ + private void checkForApplicationError(PipesResult result) throws PipesForkParserException { + PipesResult.RESULT_STATUS status = result.status(); + + // Only throw for application errors that indicate infrastructure/config problems + // Process crashes and per-document errors are returned to the caller + switch (status) { + case FAILED_TO_INITIALIZE: + throw new PipesForkParserException(status, + "Failed to initialize parser" + + (result.message() != null ? ": " + result.message() : "")); + + case FETCHER_INITIALIZATION_EXCEPTION: + throw new PipesForkParserException(status, + "Failed to initialize fetcher" + + (result.message() != null ? ": " + result.message() : "")); + + case EMITTER_INITIALIZATION_EXCEPTION: + throw new PipesForkParserException(status, + "Failed to initialize emitter" + + (result.message() != null ? ": " + result.message() : "")); + + case FETCHER_NOT_FOUND: + throw new PipesForkParserException(status, + "Fetcher not found" + + (result.message() != null ? ": " + result.message() : "")); + + case EMITTER_NOT_FOUND: + throw new PipesForkParserException(status, + "Emitter not found" + + (result.message() != null ? ": " + result.message() : "")); + + case CLIENT_UNAVAILABLE_WITHIN_MS: + throw new PipesForkParserException(status, + "No client available within timeout" + + (result.message() != null ? 
": " + result.message() : "")); + + default: + // Process crashes (OOM, TIMEOUT, UNSPECIFIED_CRASH) - not thrown, + // next parse will restart the process automatically + // + // Per-document errors (FETCH_EXCEPTION, PARSE_EXCEPTION_NO_EMIT, etc.) - + // not thrown, caller can check result and decide how to handle + // + // Success states - obviously not thrown + break; + } + } + + @Override + public void close() throws IOException { + pipesParser.close(); + // Clean up temp config file + if (tikaConfigPath != null) { + Files.deleteIfExists(tikaConfigPath); + } + } + + /** + * Creates a temporary tika-config.json file for the forked process. + * This configures: + * - FileSystemFetcher as the fetcher + * - PASSBACK_ALL emit strategy (no emitter, return results to client) + */ + private Path createTikaConfigFile() throws IOException { + Path configFile = Files.createTempFile("tika-fork-config-", ".json"); + + String jsonConfig = generateJsonConfig(); + Files.writeString(configFile, jsonConfig); + + return configFile; + } + + private String generateJsonConfig() throws IOException { + PipesConfig pc = config.getPipesConfig(); + + ObjectMapper mapper = new ObjectMapper(); + mapper.enable(SerializationFeature.INDENT_OUTPUT); + + StringWriter writer = new StringWriter(); + try (JsonGenerator gen = mapper.getFactory().createGenerator(writer)) { + gen.writeStartObject(); + + // Fetchers section + gen.writeObjectFieldStart("fetchers"); + gen.writeObjectFieldStart(config.getFetcherName()); + gen.writeObjectFieldStart("file-system-fetcher"); + // No basePath - fetchKey will be treated as absolute path + // Set allowAbsolutePaths to suppress the security warning since this is intentional + gen.writeBooleanField("allowAbsolutePaths", true); + gen.writeEndObject(); // file-system-fetcher + gen.writeEndObject(); // fetcher name + gen.writeEndObject(); // fetchers + + // Pipes configuration section + gen.writeObjectFieldStart("pipes"); + gen.writeNumberField("numClients", 
pc.getNumClients()); + gen.writeNumberField("timeoutMillis", pc.getTimeoutMillis()); + gen.writeNumberField("startupTimeoutMillis", pc.getStartupTimeoutMillis()); + gen.writeNumberField("maxFilesProcessedPerProcess", pc.getMaxFilesProcessedPerProcess()); + + // Emit strategy - PASSBACK_ALL means no emitter, return results to client + gen.writeObjectFieldStart("emitStrategy"); + gen.writeStringField("type", "PASSBACK_ALL"); + gen.writeEndObject(); // emitStrategy + + // JVM args if specified + ArrayList<String> jvmArgs = pc.getForkedJvmArgs(); + if (jvmArgs != null && !jvmArgs.isEmpty()) { + gen.writeArrayFieldStart("forkedJvmArgs"); + for (String arg : jvmArgs) { + gen.writeString(arg); + } + gen.writeEndArray(); + } + + gen.writeEndObject(); // pipes + + // Plugin roots if specified + if (config.getPluginsDir() != null) { + gen.writeStringField("plugin-roots", config.getPluginsDir().toAbsolutePath().toString()); + } + + gen.writeEndObject(); // root + } + + return writer.toString(); + } +} diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java new file mode 100644 index 000000000..8ffa0b555 --- /dev/null +++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserConfig.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fork; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.pipes.api.HandlerConfig; +import org.apache.tika.pipes.core.PipesConfig; +import org.apache.tika.sax.BasicContentHandlerFactory; + +/** + * Configuration for {@link PipesForkParser}. + * <p> + * This provides a simplified configuration API that abstracts away the + * complexity of the pipes infrastructure. + */ +public class PipesForkParserConfig { + + private final PipesConfig pipesConfig; + private HandlerConfig handlerConfig; + private String fetcherName = PipesForkParser.DEFAULT_FETCHER_NAME; + private Path pluginsDir; + + public PipesForkParserConfig() { + this.pipesConfig = new PipesConfig(); + this.handlerConfig = new HandlerConfig(); + // Default to single client for simple fork parser use case + this.pipesConfig.setNumClients(1); + } + + /** + * Get the underlying PipesConfig for advanced configuration. + * + * @return the pipes configuration + */ + public PipesConfig getPipesConfig() { + return pipesConfig; + } + + /** + * Get the handler configuration that specifies how content should be handled. + * + * @return the handler configuration + */ + public HandlerConfig getHandlerConfig() { + return handlerConfig; + } + + /** + * Set the handler configuration. 
+ * + * @param handlerConfig the handler configuration + * @return this config for chaining + */ + public PipesForkParserConfig setHandlerConfig(HandlerConfig handlerConfig) { + this.handlerConfig = handlerConfig; + return this; + } + + /** + * Set the handler type (TEXT, HTML, XML, etc.). + * + * @param type the handler type + * @return this config for chaining + */ + public PipesForkParserConfig setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE type) { + this.handlerConfig.setType(type); + return this; + } + + /** + * Set the parse mode (RMETA for recursive metadata, CONCATENATE for single document). + * + * @param parseMode the parse mode + * @return this config for chaining + */ + public PipesForkParserConfig setParseMode(HandlerConfig.PARSE_MODE parseMode) { + this.handlerConfig.setParseMode(parseMode); + return this; + } + + /** + * Set the write limit for content extraction. + * + * @param writeLimit the maximum characters to extract (-1 for unlimited) + * @return this config for chaining + */ + public PipesForkParserConfig setWriteLimit(int writeLimit) { + this.handlerConfig.setWriteLimit(writeLimit); + return this; + } + + /** + * Set the maximum number of embedded resources to process. + * + * @param maxEmbeddedResources the maximum embedded resources (-1 for unlimited) + * @return this config for chaining + */ + public PipesForkParserConfig setMaxEmbeddedResources(int maxEmbeddedResources) { + this.handlerConfig.setMaxEmbeddedResources(maxEmbeddedResources); + return this; + } + + /** + * Get the fetcher name used for file system fetching. + * + * @return the fetcher name + */ + public String getFetcherName() { + return fetcherName; + } + + /** + * Set the fetcher name. + * + * @param fetcherName the fetcher name + * @return this config for chaining + */ + public PipesForkParserConfig setFetcherName(String fetcherName) { + this.fetcherName = fetcherName; + return this; + } + + /** + * Set the timeout in milliseconds for parsing operations. 
+ * + * @param timeoutMillis the timeout in milliseconds + * @return this config for chaining + */ + public PipesForkParserConfig setTimeoutMillis(long timeoutMillis) { + pipesConfig.setTimeoutMillis(timeoutMillis); + return this; + } + + /** + * Set the JVM arguments for the forked process. + * + * @param jvmArgs the JVM arguments (e.g., "-Xmx512m") + * @return this config for chaining + */ + public PipesForkParserConfig setJvmArgs(List<String> jvmArgs) { + pipesConfig.setForkedJvmArgs(new ArrayList<>(jvmArgs)); + return this; + } + + /** + * Add a JVM argument for the forked process. + * + * @param arg the JVM argument to add + * @return this config for chaining + */ + public PipesForkParserConfig addJvmArg(String arg) { + pipesConfig.getForkedJvmArgs().add(arg); + return this; + } + + /** + * Set the Java executable path. + * + * @param javaPath path to the java executable + * @return this config for chaining + */ + public PipesForkParserConfig setJavaPath(String javaPath) { + pipesConfig.setJavaPath(javaPath); + return this; + } + + /** + * Set the maximum number of files to process before restarting the forked process. + * This helps prevent memory leaks from accumulating. + * + * @param maxFiles the maximum files per process (-1 for unlimited) + * @return this config for chaining + */ + public PipesForkParserConfig setMaxFilesPerProcess(int maxFiles) { + pipesConfig.setMaxFilesProcessedPerProcess(maxFiles); + return this; + } + + /** + * <b>EXPERT:</b> Set the number of forked JVM processes (clients) to use for parsing. + * <p> + * This enables concurrent parsing across multiple forked processes. Each client + * is an independent JVM that can parse documents in parallel. When multiple threads + * call {@link PipesForkParser#parse}, requests are distributed across the pool + * of forked processes. + * <p> + * <b>When to use:</b> Set this higher than 1 when you need to parse many documents + * concurrently and have sufficient CPU cores and memory. 
Each forked process + * consumes memory independently (based on your JVM args like -Xmx). + * <p> + * <b>Default:</b> 1 (single forked process, suitable for simple sequential use) + * + * @param numClients the number of forked JVM processes (must be >= 1) + * @return this config for chaining + * @throws IllegalArgumentException if numClients is less than 1 + */ + public PipesForkParserConfig setNumClients(int numClients) { + if (numClients < 1) { + throw new IllegalArgumentException("numClients must be >= 1"); + } + pipesConfig.setNumClients(numClients); + return this; + } + + /** + * Get the number of forked JVM processes configured. + * + * @return the number of clients + */ + public int getNumClients() { + return pipesConfig.getNumClients(); + } + + /** + * Set the startup timeout in milliseconds. + * + * @param startupTimeoutMillis the startup timeout + * @return this config for chaining + */ + public PipesForkParserConfig setStartupTimeoutMillis(long startupTimeoutMillis) { + pipesConfig.setStartupTimeoutMillis(startupTimeoutMillis); + return this; + } + + /** + * Get the plugins directory. + * + * @return the plugins directory, or null if not set + */ + public Path getPluginsDir() { + return pluginsDir; + } + + /** + * Set the plugins directory where plugin zips are located. + * This directory should contain the tika-pipes-file-system zip + * and any other required plugins. 
+ * + * @param pluginsDir the plugins directory + * @return this config for chaining + */ + public PipesForkParserConfig setPluginsDir(Path pluginsDir) { + this.pluginsDir = pluginsDir; + return this; + } +} diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserException.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserException.java new file mode 100644 index 000000000..5f32c5421 --- /dev/null +++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkParserException.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fork; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.pipes.api.PipesResult; + +/** + * Exception thrown when {@link PipesForkParser} encounters an application error. 
+ * <p>
+ * It signals application-level errors that point at an infrastructure
+ * or configuration problem:
+ * <ul>
+ *   <li>Initialization failures (parser, fetcher, or emitter initialization)</li>
+ *   <li>Configuration errors (fetcher or emitter not found)</li>
+ *   <li>Client unavailable (no forked process available within timeout)</li>
+ * </ul>
+ * <p>
+ * The following are NOT thrown as exceptions:
+ * <ul>
+ *   <li>Process crashes (OOM, timeout) - returned in result, next parse
+ *       will automatically restart the forked process</li>
+ *   <li>Per-document failures (fetch exception, parse exception) - returned
+ *       in result so caller can handle gracefully</li>
+ * </ul>
+ *
+ * @see PipesForkResult#isProcessCrash()
+ * @see PipesForkResult#isApplicationError()
+ */
+public class PipesForkParserException extends TikaException {
+
+    /** The result status this exception was created with; never reassigned. */
+    private final PipesResult.RESULT_STATUS status;
+
+    /**
+     * Builds an exception carrying a status and a message, without a cause.
+     *
+     * @param status the result status that caused this exception
+     * @param message the error message
+     */
+    public PipesForkParserException(PipesResult.RESULT_STATUS status, String message) {
+        super(message);
+        this.status = status;
+    }
+
+    /**
+     * Builds an exception carrying a status, a message, and the underlying cause.
+     *
+     * @param status the result status that caused this exception
+     * @param message the error message
+     * @param cause the underlying cause
+     */
+    public PipesForkParserException(PipesResult.RESULT_STATUS status, String message,
+                                    Throwable cause) {
+        super(message, cause);
+        this.status = status;
+    }
+
+    /**
+     * Returns the status passed to the constructor.
+     *
+     * @return the result status
+     */
+    public PipesResult.RESULT_STATUS getStatus() {
+        return status;
+    }
+
+    /**
+     * Tells whether the status is one of the three initialization-failure
+     * statuses (general, fetcher, or emitter initialization).
+     *
+     * @return true if initialization failed
+     */
+    public boolean isInitializationFailure() {
+        if (status == PipesResult.RESULT_STATUS.FAILED_TO_INITIALIZE) {
+            return true;
+        }
+        if (status == PipesResult.RESULT_STATUS.FETCHER_INITIALIZATION_EXCEPTION) {
+            return true;
+        }
+        return status == PipesResult.RESULT_STATUS.EMITTER_INITIALIZATION_EXCEPTION;
+    }
+
+    /**
+     * Tells whether the status reports a missing fetcher or a missing emitter.
+     *
+     * @return true if there was a configuration error
+     */
+    public boolean isConfigurationError() {
+        if (status == PipesResult.RESULT_STATUS.FETCHER_NOT_FOUND) {
+            return true;
+        }
+        return status == PipesResult.RESULT_STATUS.EMITTER_NOT_FOUND;
+    }
+}
diff --git a/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkResult.java b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkResult.java
new file mode 100644
index 000000000..b1dde3e07
--- /dev/null
+++ b/tika-pipes/tika-pipes-fork-parser/src/main/java/org/apache/tika/pipes/fork/PipesForkResult.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fork;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.pipes.api.PipesResult;
+
+/**
+ * Result from parsing a file with {@link PipesForkParser}.
+ * <p>
+ * This wraps the {@link PipesResult} and provides convenient access to
+ * the parsed content and metadata.
+ * <p>
+ * Content is available in the metadata via {@link TikaCoreProperties#TIKA_CONTENT}.
+ */
+public class PipesForkResult {
+
+    private final PipesResult pipesResult;
+
+    /**
+     * Wrap the given pipes result.
+     *
+     * @param pipesResult the result to wrap; must not be null
+     * @throws NullPointerException if pipesResult is null
+     */
+    public PipesForkResult(PipesResult pipesResult) {
+        // Fail at construction rather than on the first accessor call.
+        this.pipesResult = Objects.requireNonNull(pipesResult, "pipesResult must not be null");
+    }
+
+    /**
+     * Get the result status.
+     *
+     * @return the result status
+     */
+    public PipesResult.RESULT_STATUS getStatus() {
+        return pipesResult.status();
+    }
+
+    /**
+     * Check if the parsing was successful.
+     *
+     * @return true if parsing succeeded
+     */
+    public boolean isSuccess() {
+        return pipesResult.isSuccess();
+    }
+
+    /**
+     * Check if there was a process crash (OOM, timeout, etc.).
+     *
+     * @return true if the forked process crashed
+     */
+    public boolean isProcessCrash() {
+        return pipesResult.isProcessCrash();
+    }
+
+    /**
+     * Check if there was an application error.
+     *
+     * @return true if there was an application-level error
+     */
+    public boolean isApplicationError() {
+        return pipesResult.isApplicationError();
+    }
+
+    /**
+     * Get the list of metadata objects from parsing.
+     * <p>
+     * In RMETA mode, there will be one metadata object per document
+     * (container plus embedded documents).
+     * <p>
+     * In CONCATENATE mode, there will be a single metadata object.
+     * <p>
+     * Content is available via {@link TikaCoreProperties#TIKA_CONTENT}.
+     *
+     * @return the list of metadata objects, or empty list if none; never null
+     */
+    public List<Metadata> getMetadataList() {
+        if (pipesResult.emitData() == null) {
+            return Collections.emptyList();
+        }
+        List<Metadata> metadataList = pipesResult.emitData().getMetadataList();
+        // Keep the documented "empty list if none" contract even if the
+        // emit data itself carries no list.
+        return metadataList == null ? Collections.emptyList() : metadataList;
+    }
+
+    /**
+     * Get the content from the first (or only) metadata object.
+     * <p>
+     * This is a convenience method for simple use cases.
+     *
+     * @return the content, or null if not available
+     */
+    public String getContent() {
+        Metadata first = getMetadata();
+        return first == null ? null : first.get(TikaCoreProperties.TIKA_CONTENT);
+    }
+
+    /**
+     * Get the first (or only) metadata object.
+     *
+     * @return the metadata, or null if not available
+     */
+    public Metadata getMetadata() {
+        List<Metadata> metadataList = getMetadataList();
+        return metadataList.isEmpty() ? null : metadataList.get(0);
+    }
+
+    /**
+     * Get any error message associated with the result.
+     *
+     * @return the error message, or null if none
+     */
+    public String getMessage() {
+        return pipesResult.message();
+    }
+
+    /**
+     * Get the underlying PipesResult for advanced access.
+     *
+     * @return the pipes result
+     */
+    public PipesResult getPipesResult() {
+        return pipesResult;
+    }
+
+    @Override
+    public String toString() {
+        return "PipesForkResult{" +
+                "status=" + getStatus() +
+                ", metadataCount=" + getMetadataList().size() +
+                ", message=" + getMessage() +
+                '}';
+    }
+}
diff --git a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
new file mode 100644
index 000000000..75761f828
--- /dev/null
+++ b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
@@ -0,0 +1,435 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fork;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.api.HandlerConfig;
+import org.apache.tika.pipes.api.PipesResult;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+
+/**
+ * Tests for {@link PipesForkParser}: each test writes its input into a
+ * per-test temp directory and parses it through a forked process using the
+ * plugins found under {@code target/plugins}.
+ */
+public class PipesForkParserTest {
+
+    private static final Path PLUGINS_DIR = Paths.get("target/plugins");
+
+    @TempDir
+    Path tempDir;
+
+    @BeforeAll
+    static void checkPluginsDir() {
+        // Only warns; the tests still run and will report their own failures.
+        if (!Files.isDirectory(PLUGINS_DIR)) {
+            System.err.println("WARNING: Plugins directory not found at " + PLUGINS_DIR.toAbsolutePath() +
+                    ". Tests may fail. Run 'mvn process-test-resources' first.");
+        }
+    }
+
+    /**
+     * Writes a zip into the temp directory.
+     *
+     * @param zipName file name of the zip to create
+     * @param entries alternating entry name and entry content (UTF-8 text)
+     * @return path of the created zip
+     * @throws IllegalArgumentException if entries does not hold complete (name, content) pairs
+     * @throws IOException if the zip cannot be written
+     */
+    private Path createZipWithEmbeddedFiles(String zipName, String... entries) throws IOException {
+        // Check up front; otherwise an odd count fails mid-write with an
+        // ArrayIndexOutOfBoundsException and leaves a partial zip behind.
+        if (entries.length % 2 != 0) {
+            throw new IllegalArgumentException(
+                    "entries must be (name, content) pairs, got " + entries.length + " values");
+        }
+        Path zipPath = tempDir.resolve(zipName);
+        try (OutputStream fos = Files.newOutputStream(zipPath);
+                ZipOutputStream zos = new ZipOutputStream(fos)) {
+            for (int i = 0; i < entries.length; i += 2) {
+                zos.putNextEntry(new ZipEntry(entries[i]));
+                zos.write(entries[i + 1].getBytes(StandardCharsets.UTF_8));
+                zos.closeEntry();
+            }
+        }
+        return zipPath;
+    }
+
+    @Test
+    public void testParseTextFile() throws Exception {
+        // Create a simple test file
+        Path testFile = tempDir.resolve("test.txt");
+        String content = "Hello, this is a test document.\nIt has multiple lines.";
+        Files.writeString(testFile, content);
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000)
+                .addJvmArg("-Xmx256m");
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testFile);
+
+            assertTrue(result.isSuccess(), "Parse should succeed. Status: " + result.getStatus() +
+                    ", message: " + result.getMessage());
+            assertFalse(result.isProcessCrash(), "Should not be a process crash");
+
+            List<Metadata> metadataList = result.getMetadataList();
+            assertNotNull(metadataList, "Metadata list should not be null");
+            assertFalse(metadataList.isEmpty(), "Metadata list should not be empty");
+
+            String extractedContent = result.getContent();
+            assertNotNull(extractedContent, "Content should not be null");
+            assertTrue(extractedContent.contains("Hello"), "Content should contain 'Hello'");
+            assertTrue(extractedContent.contains("test document"), "Content should contain 'test document'");
+        }
+    }
+
+    @Test
+    public void testParseWithMetadata() throws Exception {
+        // Create a simple HTML file
+        Path testFile = tempDir.resolve("test.html");
+        String html = "<html><head><title>Test Title</title></head>" +
+                "<body><p>Test paragraph content.</p></body></html>";
+        Files.writeString(testFile, html);
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            Metadata initialMetadata = new Metadata();
+            PipesForkResult result = parser.parse(testFile, initialMetadata);
+
+            assertTrue(result.isSuccess(), "Parse should succeed");
+
+            Metadata metadata = result.getMetadata();
+            assertNotNull(metadata, "Metadata should not be null");
+
+            String extractedContent = result.getContent();
+            assertNotNull(extractedContent, "Content should not be null");
+            assertTrue(extractedContent.contains("Test paragraph"), "Content should contain paragraph text");
+        }
+    }
+
+    @Test
+    public void testParseMultipleFiles() throws Exception {
+        // Create multiple test files
+        Path testFile1 = tempDir.resolve("test1.txt");
+        Path testFile2 = tempDir.resolve("test2.txt");
+        Files.writeString(testFile1, "Content of first file");
+        Files.writeString(testFile2, "Content of second file");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result1 = parser.parse(testFile1);
+            assertTrue(result1.isSuccess());
+            // Assert non-null first so a missing body is reported as an
+            // assertion failure rather than a NullPointerException.
+            assertNotNull(result1.getContent(), "Content of first file should not be null");
+            assertTrue(result1.getContent().contains("first file"));
+
+            PipesForkResult result2 = parser.parse(testFile2);
+            assertTrue(result2.isSuccess());
+            assertNotNull(result2.getContent(), "Content of second file should not be null");
+            assertTrue(result2.getContent().contains("second file"));
+        }
+    }
+
+    @Test
+    public void testConcatenateMode() throws Exception {
+        Path testZip = createZipWithEmbeddedFiles("test_with_embedded.zip",
+                "embedded1.txt", "Content from first embedded file",
+                "embedded2.txt", "Content from second embedded file");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.CONCATENATE)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testZip);
+
+            assertTrue(result.isSuccess(), "Parse should succeed");
+
+            // In CONCATENATE mode, there should be exactly one metadata object
+            // even though the zip contains multiple embedded files
+            List<Metadata> metadataList = result.getMetadataList();
+            assertEquals(1, metadataList.size(), "CONCATENATE mode should return single metadata");
+
+            // The content should contain text from both embedded files
+            String content = result.getContent();
+            assertNotNull(content);
+            assertTrue(content.contains("first embedded"),
+                    "Content should contain text from first embedded file");
+            assertTrue(content.contains("second embedded"),
+                    "Content should contain text from second embedded file");
+        }
+    }
+
+    @Test
+    public void testRmetaModeWithEmbedded() throws Exception {
+        Path testZip = createZipWithEmbeddedFiles("test_rmeta_embedded.zip",
+                "file1.txt", "First file content",
+                "file2.txt", "Second file content");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testZip);
+
+            assertTrue(result.isSuccess(), "Parse should succeed");
+
+            // In RMETA mode, there should be multiple metadata objects:
+            // one for the container (zip) and one for each embedded file
+            List<Metadata> metadataList = result.getMetadataList();
+            assertTrue(metadataList.size() >= 3,
+                    "RMETA mode should return metadata for container + embedded files, got: " +
+                            metadataList.size());
+        }
+    }
+
+    @Test
+    public void testDefaultConfigMatchesExplicitRmeta() throws Exception {
+        Path testZip = createZipWithEmbeddedFiles("test_default_config.zip",
+                "file1.txt", "First file content",
+                "file2.txt", "Second file content");
+
+        // Parse with explicit RMETA config
+        PipesForkParserConfig explicitConfig = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        int explicitMetadataCount;
+        try (PipesForkParser parser = new PipesForkParser(explicitConfig)) {
+            PipesForkResult result = parser.parse(testZip);
+            assertTrue(result.isSuccess());
+            explicitMetadataCount = result.getMetadataList().size();
+        }
+
+        // Parse with default config (only pluginsDir set) - should produce same results
+        PipesForkParserConfig defaultConfig = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR);
+        try (PipesForkParser parser = new PipesForkParser(defaultConfig)) {
+            PipesForkResult result = parser.parse(testZip);
+
+            assertTrue(result.isSuccess(), "Parse with default config should succeed");
+            assertEquals(explicitMetadataCount, result.getMetadataList().size(),
+                    "Default config should produce same metadata count as explicit RMETA config");
+        }
+    }
+
+    @Test
+    public void testTextVsXhtmlHandlerType() throws Exception {
+        // Create an HTML file to parse
+        Path testFile = tempDir.resolve("test_handler.html");
+        String html = "<html><head><title>Test Title</title></head>" +
+                "<body><p>Paragraph one.</p><p>Paragraph two.</p></body></html>";
+        Files.writeString(testFile, html);
+
+        // Parse with TEXT handler - should get plain text without markup
+        PipesForkParserConfig textConfig = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        String textContent;
+        try (PipesForkParser parser = new PipesForkParser(textConfig)) {
+            PipesForkResult result = parser.parse(testFile);
+            assertTrue(result.isSuccess(), "TEXT parse should succeed");
+            textContent = result.getContent();
+            assertNotNull(textContent, "TEXT content should not be null");
+            // TEXT mode should NOT contain HTML tags
+            assertFalse(textContent.contains("<p>"), "TEXT content should not contain <p> tags");
+            assertFalse(textContent.contains("<html>"), "TEXT content should not contain <html> tags");
+            assertTrue(textContent.contains("Paragraph one"), "TEXT content should contain text");
+        }
+
+        // Parse with XML handler - should get XHTML markup
+        PipesForkParserConfig xmlConfig = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.XML)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setTimeoutMillis(60000);
+
+        String xmlContent;
+        try (PipesForkParser parser = new PipesForkParser(xmlConfig)) {
+            PipesForkResult result = parser.parse(testFile);
+            assertTrue(result.isSuccess(), "XML parse should succeed");
+            xmlContent = result.getContent();
+            assertNotNull(xmlContent, "XML content should not be null");
+            // XML mode SHOULD contain markup
+            assertTrue(xmlContent.contains("<p>") || xmlContent.contains("<p "),
+                    "XML content should contain <p> tags");
+            assertTrue(xmlContent.contains("Paragraph one"), "XML content should contain text");
+        }
+
+        // The XML content should be longer due to markup
+        assertTrue(xmlContent.length() > textContent.length(),
+                "XML content should be longer than TEXT content due to markup");
+    }
+
+    @Test
+    public void testWriteLimit() throws Exception {
+        // Create a file with more content than the write limit
+        Path testFile = tempDir.resolve("longfile.txt");
+        StringBuilder longContent = new StringBuilder();
+        for (int i = 0; i < 1000; i++) {
+            longContent.append("This is line ").append(i).append(" of the test document.\n");
+        }
+        Files.writeString(testFile, longContent.toString());
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+                .setParseMode(HandlerConfig.PARSE_MODE.RMETA)
+                .setWriteLimit(100) // Limit to 100 characters
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testFile);
+
+            // Note: behavior depends on throwOnWriteLimitReached setting
+            // With default (true), this may result in an exception being recorded
+            assertNotNull(result);
+        }
+    }
+
+    @Test
+    public void testDefaultConfiguration() throws Exception {
+        Path testFile = tempDir.resolve("default.txt");
+        Files.writeString(testFile, "Testing default configuration");
+
+        // Use default configuration (only pluginsDir set)
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR);
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testFile);
+            assertTrue(result.isSuccess());
+            assertNotNull(result.getContent());
+        }
+    }
+
+    @Test
+    public void testFileNotFoundReturnsFetchException() throws Exception {
+        // Try to parse a file that doesn't exist
+        Path nonExistentFile = tempDir.resolve("does_not_exist.txt");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            // This should NOT throw an exception - fetch failures are returned in the result
+            PipesForkResult result = parser.parse(nonExistentFile);
+
+            // The result should indicate a fetch exception, not success
+            assertFalse(result.isSuccess(), "Should not succeed for non-existent file");
+            assertFalse(result.isProcessCrash(), "Should not be a process crash");
+            assertEquals(PipesResult.RESULT_STATUS.FETCH_EXCEPTION, result.getStatus(),
+                    "Should be a FETCH_EXCEPTION");
+            assertNotNull(result.getMessage(), "Should have an error message");
+        }
+    }
+
+    @Test
+    public void testFetchExceptionDoesNotPreventNextParse() throws Exception {
+        // First try a non-existent file, then try a real file
+        Path nonExistentFile = tempDir.resolve("does_not_exist.txt");
+        Path realFile = tempDir.resolve("real_file.txt");
+        Files.writeString(realFile, "This file exists");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            // First parse - should fail with fetch exception
+            PipesForkResult result1 = parser.parse(nonExistentFile);
+            assertEquals(PipesResult.RESULT_STATUS.FETCH_EXCEPTION, result1.getStatus());
+
+            // Second parse - should succeed
+            PipesForkResult result2 = parser.parse(realFile);
+            assertTrue(result2.isSuccess(), "Should succeed for existing file");
+            assertTrue(result2.getContent().contains("This file exists"));
+        }
+    }
+
+    @Test
+    public void testParseSuccessWithExceptionStatus() throws Exception {
+        // A plain text file parsed with the same config that other tests
+        // in this class already expect to succeed.
+        Path testFile = tempDir.resolve("parse_with_warning.txt");
+        Files.writeString(testFile, "Simple content");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testFile);
+
+            // Assert success first so the status check below cannot be skipped.
+            assertTrue(result.isSuccess(), "Parse should succeed. Status: " + result.getStatus() +
+                    ", message: " + result.getMessage());
+
+            // Could be PARSE_SUCCESS, PARSE_SUCCESS_WITH_EXCEPTION, or EMIT_SUCCESS_PASSBACK
+            PipesResult.RESULT_STATUS status = result.getStatus();
+            assertTrue(
+                    status == PipesResult.RESULT_STATUS.PARSE_SUCCESS ||
+                    status == PipesResult.RESULT_STATUS.PARSE_SUCCESS_WITH_EXCEPTION ||
+                    status == PipesResult.RESULT_STATUS.EMIT_SUCCESS_PASSBACK,
+                    "Success status should be one of the success types");
+        }
+    }
+
+    @Test
+    public void testResultCategorization() throws Exception {
+        // Test that we can properly categorize results
+        Path testFile = tempDir.resolve("categorize.txt");
+        Files.writeString(testFile, "Test categorization");
+
+        PipesForkParserConfig config = new PipesForkParserConfig()
+                .setPluginsDir(PLUGINS_DIR)
+                .setTimeoutMillis(60000);
+
+        try (PipesForkParser parser = new PipesForkParser(config)) {
+            PipesForkResult result = parser.parse(testFile);
+
+            // At least one of these should be true
+            boolean hasCategory = result.isSuccess() || result.isProcessCrash() || result.isApplicationError();
+            assertTrue(hasCategory, "Result should have a valid category");
+
+            // These should be mutually exclusive
+            int trueCount = 0;
+            if (result.isSuccess()) {
+                trueCount++;
+            }
+            if (result.isProcessCrash()) {
+                trueCount++;
+            }
+            if (result.isApplicationError()) {
+                trueCount++;
+            }
+            assertEquals(1, trueCount, "Exactly one category should be true");
+        }
+    }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index d1f6a8e16..34ba51c9b 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -24,13 +24,10 @@ import java.nio.file.Paths; import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.attribute.FileTime; import java.util.Date; -import java.util.Optional; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.config.ConfigContainer; -import org.apache.tika.config.JsonConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -69,47 +66,26 @@ public class FileSystemFetcher extends AbstractTikaExtension implements Fetcher } @Override - public TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws IOException, TikaException { + public TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) + throws IOException, TikaException { if (fetchKey.contains("\u0000")) { - throw new IllegalArgumentException("Path must not contain 'u0000'. " + - "Please review the life decisions that led you to requesting " + - "a file name with this character in it."); + throw new IllegalArgumentException("Path must not contain 'u0000'. 
" + + "Please review the life decisions that led you to requesting " + + "a file name with this character in it."); } FileSystemFetcherConfig config = defaultFileSystemFetcherConfig; - ConfigContainer configContainer = parseContext.get(ConfigContainer.class); - if (configContainer != null) { - Optional<JsonConfig> configJson = configContainer.get(getExtensionConfig().id()); - if (configJson.isPresent()) { - try { - // Check if basePath is present in runtime config - this is not allowed for security - if (configJson.get().json().contains("\"basePath\"")) { - throw new TikaConfigException( - "Cannot change 'basePath' at runtime for security reasons. " + - "basePath can only be set during initialization."); - } - - // Load runtime config (excludes basePath for security) - FileSystemFetcherRuntimeConfig runtimeConfig = - FileSystemFetcherRuntimeConfig.load(configJson.get().json()); - - // Merge runtime config into default config while preserving basePath - config = new FileSystemFetcherConfig() - .setBasePath(defaultFileSystemFetcherConfig.getBasePath()) - .setExtractFileSystemMetadata(runtimeConfig.isExtractFileSystemMetadata()); - } catch (TikaConfigException e) { - throw new IOException("Failed to load runtime config", e); - } - } - } - Path p = null; - if (! 
StringUtils.isBlank(config.getBasePath())) { + Path p; + if (StringUtils.isBlank(config.getBasePath())) { + // No basePath - treat fetchKey as absolute path + p = Paths.get(fetchKey); + } else { Path basePath = Paths.get(config.getBasePath()); if (!Files.isDirectory(basePath)) { throw new IOException("BasePath is not a directory: " + basePath); } p = basePath.resolve(fetchKey); if (!p.toRealPath().startsWith(basePath.toRealPath())) { - throw new IllegalArgumentException( + throw new SecurityException( "fetchKey must resolve to be a descendant of the 'basePath'"); } } @@ -143,38 +119,39 @@ public class FileSystemFetcher extends AbstractTikaExtension implements Fetcher metadata.set(property, new Date(fileTime.toMillis())); } - private void checkConfig(FileSystemFetcherConfig fetcherConfig) throws TikaConfigException { + private void checkConfig(FileSystemFetcherConfig fetcherConfig) + throws TikaConfigException { String basePath = fetcherConfig.getBasePath(); if (basePath == null || basePath.isBlank()) { - LOG.warn("'basePath' has not been set. " + - "This means that client code or clients can read from any file that this " + - "process has permissions to read. If you are running tika-server, make " + - "absolutely certain that you've locked down " + - "access to tika-server and file-permissions for the tika-server process."); + if (!fetcherConfig.isAllowAbsolutePaths()) { + throw new TikaConfigException( + "'basePath' must be set, or 'allowAbsolutePaths' must be true. " + + "Without basePath, clients can read any file this process " + + "has access to. Set 'allowAbsolutePaths: true' to explicitly " + + "allow this behavior and accept the security risks."); + } return; } - if (basePath.toString().startsWith("http://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. 
" + - " Please use the tika-fetcher-http module for http calls"); - } else if (basePath.toString().startsWith("ftp://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + - " Please consider contributing an ftp fetcher module"); - } else if (basePath.toString().startsWith("s3://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + - " Please use the tika-fetcher-s3 module"); + if (basePath.startsWith("http://")) { + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + "Please use the tika-fetcher-http module for http calls"); + } else if (basePath.startsWith("ftp://")) { + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + "Please consider contributing an ftp fetcher module"); + } else if (basePath.startsWith("s3://")) { + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + "Please use the tika-fetcher-s3 module"); } if (basePath.contains("\u0000")) { throw new TikaConfigException( - "base path must not contain \u0000. " + "Seriously, what were you thinking?"); + "base path must not contain \u0000. 
Seriously, what were you thinking?"); + } + } - static boolean isDescendant(Path root, Path descendant) { - return descendant.toAbsolutePath().normalize() - .startsWith(root.toAbsolutePath().normalize()); - } - @Override public String toString() { return "FileSystemFetcher{" + "defaultFileSystemFetcherConfig=" + defaultFileSystemFetcherConfig + ", pluginConfig=" + pluginConfig + '}'; diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherConfig.java index fcf2e5d5e..7ee64e38d 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherConfig.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherConfig.java @@ -37,7 +37,8 @@ public class FileSystemFetcherConfig { } private String basePath; - private boolean extractFileSystemMetadata; + private boolean extractFileSystemMetadata = false; + private boolean allowAbsolutePaths = false; public boolean isExtractFileSystemMetadata() { return extractFileSystemMetadata; @@ -56,4 +57,18 @@ public class FileSystemFetcherConfig { this.basePath = basePath; return this; } + + /** + * If true, allows fetchKey to be an absolute path when basePath is not set. + * When basePath is not set and this is false, the configuration is rejected with a TikaConfigException. + * Use this when you intentionally want to allow fetching from any path. 
+ */ + public boolean isAllowAbsolutePaths() { + return allowAbsolutePaths; + } + + public FileSystemFetcherConfig setAllowAbsolutePaths(boolean allowAbsolutePaths) { + this.allowAbsolutePaths = allowAbsolutePaths; + return this; + } } diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfig.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfig.java deleted file mode 100644 index ffadf9822..000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfig.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.fetcher.fs; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -import org.apache.tika.exception.TikaConfigException; - -/** - * Runtime configuration for FileSystemFetcher. - * Only includes fields that are safe to update at runtime. - * basePath is intentionally excluded for security reasons. 
- */ -public class FileSystemFetcherRuntimeConfig { - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public static FileSystemFetcherRuntimeConfig load(final String json) - throws TikaConfigException { - try { - return OBJECT_MAPPER.readValue(json, - FileSystemFetcherRuntimeConfig.class); - } catch (JsonProcessingException e) { - throw new TikaConfigException( - "Failed to parse FileSystemFetcherRuntimeConfig from JSON", e); - } - } - - private boolean extractFileSystemMetadata; - - public boolean isExtractFileSystemMetadata() { - return extractFileSystemMetadata; - } - - public FileSystemFetcherRuntimeConfig setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { - this.extractFileSystemMetadata = extractFileSystemMetadata; - return this; - } -} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfigTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfigTest.java deleted file mode 100644 index c1be6c535..000000000 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherRuntimeConfigTest.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.fetcher.fs; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Locale; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -import org.apache.tika.config.ConfigContainer; -import org.apache.tika.metadata.FileSystem; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.plugins.ExtensionConfig; - -/** - * Tests runtime configuration of FileSystemFetcher via ConfigContainer and ParseContext. 
- */ -public class FileSystemFetcherRuntimeConfigTest { - - @Test - public void testRuntimeConfigViaParseContext(@TempDir Path tempDir) throws Exception { - // Create a test file - Path testFile = tempDir.resolve("test.txt"); - Files.writeString(testFile, "test content"); - - // Create fetcher with default config (no extractFileSystemMetadata) - String defaultConfig = String.format(Locale.ROOT, "{\"basePath\":\"%s\"}", - tempDir.toString().replace("\\", "\\\\")); - ExtensionConfig pluginConfig = new ExtensionConfig("test-fetcher", "test", defaultConfig); - FileSystemFetcher fetcher = new FileSystemFetcher(pluginConfig); - - // Fetch without runtime config - should not extract file system metadata - Metadata metadata1 = new Metadata(); - ParseContext context1 = new ParseContext(); - try (InputStream is = fetcher.fetch("test.txt", metadata1, context1)) { - assertNotNull(is); - } - assertNull(metadata1.get(FileSystem.CREATED), - "Without extractFileSystemMetadata, should not have CREATED metadata"); - - // Now create runtime config with extractFileSystemMetadata=true - // Note: basePath is NOT included for security reasons - String runtimeConfig = "{\"extractFileSystemMetadata\":true}"; - - ConfigContainer configContainer = new ConfigContainer(); - configContainer.set("test-fetcher", runtimeConfig); - - ParseContext context2 = new ParseContext(); - context2.set(ConfigContainer.class, configContainer); - - // Fetch with runtime config - should extract file system metadata - Metadata metadata2 = new Metadata(); - try (InputStream is = fetcher.fetch("test.txt", metadata2, context2)) { - assertNotNull(is); - } - assertNotNull(metadata2.get(FileSystem.CREATED), - "With extractFileSystemMetadata=true, should have CREATED metadata"); - assertNotNull(metadata2.get(FileSystem.MODIFIED), - "With extractFileSystemMetadata=true, should have MODIFIED metadata"); - } - - @Test - public void testRuntimeConfigCannotOverrideBasePath(@TempDir Path tempDir) throws Exception { - // 
Create two directories with different files - Path dir1 = tempDir.resolve("dir1"); - Path dir2 = tempDir.resolve("dir2"); - Files.createDirectories(dir1); - Files.createDirectories(dir2); - - Path file1 = dir1.resolve("test.txt"); - Files.writeString(file1, "content from dir1"); - - // Create fetcher with dir1 as default basePath - String defaultConfig = String.format(Locale.ROOT, "{\"basePath\":\"%s\"}", - dir1.toString().replace("\\", "\\\\")); - ExtensionConfig pluginConfig = new ExtensionConfig("test-fetcher", "test", defaultConfig); - FileSystemFetcher fetcher = new FileSystemFetcher(pluginConfig); - - // Fetch from default basePath (dir1) - Metadata metadata1 = new Metadata(); - ParseContext context1 = new ParseContext(); - try (InputStream is = fetcher.fetch("test.txt", metadata1, context1)) { - String content = new String(is.readAllBytes(), StandardCharsets.UTF_8); - assertEquals("content from dir1", content); - } - - // Try to override basePath at runtime to point to dir2 - // This should throw an exception for security reasons - String runtimeConfig = String.format(Locale.ROOT, "{\"basePath\":\"%s\"}", - dir2.toString().replace("\\", "\\\\")); - ConfigContainer configContainer = new ConfigContainer(); - configContainer.set("test-fetcher", runtimeConfig); - - ParseContext context2 = new ParseContext(); - context2.set(ConfigContainer.class, configContainer); - - // Fetch with runtime config - should throw exception - Metadata metadata2 = new Metadata(); - IOException exception = assertThrows(IOException.class, () -> { - fetcher.fetch("test.txt", metadata2, context2); - }); - assertTrue(exception.getCause() != null && - exception.getCause().getMessage().contains("Cannot change 'basePath' at runtime"), - "Should throw exception when attempting to change basePath at runtime"); - } - - @Test - public void testConfigContainerNotPresent(@TempDir Path tempDir) throws Exception { - // Create a test file - Path testFile = tempDir.resolve("test.txt"); - 
Files.writeString(testFile, "test content"); - - // Create fetcher with default config - String defaultConfig = String.format(Locale.ROOT, "{\"basePath\":\"%s\"}", - tempDir.toString().replace("\\", "\\\\")); - ExtensionConfig pluginConfig = new ExtensionConfig("test-fetcher", "test", defaultConfig); - FileSystemFetcher fetcher = new FileSystemFetcher(pluginConfig); - - // Fetch with ParseContext that has no ConfigContainer - should use default config - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - // Don't set ConfigContainer in context - - try (InputStream is = fetcher.fetch("test.txt", metadata, context)) { - assertNotNull(is); - String content = new String(is.readAllBytes(), StandardCharsets.UTF_8); - assertEquals("test content", content); - } - } - - @Test - public void testConfigContainerWithDifferentId(@TempDir Path tempDir) throws Exception { - // Create a test file - Path testFile = tempDir.resolve("test.txt"); - Files.writeString(testFile, "test content"); - - // Create fetcher with default config - String defaultConfig = String.format(Locale.ROOT, "{\"basePath\":\"%s\"}", - tempDir.toString().replace("\\", "\\\\")); - ExtensionConfig pluginConfig = new ExtensionConfig("test-fetcher", "test", defaultConfig); - FileSystemFetcher fetcher = new FileSystemFetcher(pluginConfig); - - // Create ConfigContainer with config for a different fetcher ID - ConfigContainer configContainer = new ConfigContainer(); - configContainer.set("different-fetcher", "{\"basePath\":\"/some/other/path\"}"); - - ParseContext context = new ParseContext(); - context.set(ConfigContainer.class, configContainer); - - // Fetch - should use default config since runtime config is for different ID - Metadata metadata = new Metadata(); - try (InputStream is = fetcher.fetch("test.txt", metadata, context)) { - assertNotNull(is); - String content = new String(is.readAllBytes(), StandardCharsets.UTF_8); - assertEquals("test content", content); - } - } -} diff 
--git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java index 8c3254503..e485844dc 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java @@ -16,42 +16,127 @@ */ package org.apache.tika.pipes.fetcher.fs; -import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; +import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.fetcher.Fetcher; import org.apache.tika.plugins.ExtensionConfig; public class FileSystemFetcherTest { + private static final ObjectMapper MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private Fetcher createFetcher(Path basePath, Boolean allowAbsolutePaths) + throws TikaConfigException { + ObjectNode config = MAPPER.createObjectNode(); + if (basePath != null) { + config.put("basePath", basePath.toAbsolutePath().toString()); + } + if (allowAbsolutePaths != null) { + config.put("allowAbsolutePaths", allowAbsolutePaths); + } + ExtensionConfig pluginConfig = new ExtensionConfig("test", "test", config.toString()); + 
return new FileSystemFetcherFactory().buildExtension(pluginConfig); + } + @Test - public void testDescendant() throws Exception { + public void testNullByte() throws Exception { + assertThrows(TikaConfigException.class, () -> { + ObjectNode config = MAPPER.createObjectNode(); + config.put("basePath", "bad\u0000path"); + ExtensionConfig pluginConfig = new ExtensionConfig("test", "test", config.toString()); + new FileSystemFetcherFactory().buildExtension(pluginConfig); + }); + } - Path root = Paths.get("/ab/cd/"); - Path descendant = root.resolve("ef/gh/ij.pdf"); - assertTrue(FileSystemFetcher.isDescendant(root, descendant)); + @Test + public void testPathTraversalBlocked() throws Exception { + // Create a subdirectory as basePath and a file outside it + Path basePath = tempDir.resolve("allowed"); + Files.createDirectories(basePath); + + Path fileInBase = basePath.resolve("safe.txt"); + Files.writeString(fileInBase, "safe content"); + + Path fileOutsideBase = tempDir.resolve("secret.txt"); + Files.writeString(fileOutsideBase, "secret content"); + + // Create fetcher with basePath set to the subdirectory + Fetcher fetcher = createFetcher(basePath, null); - descendant = Paths.get("/cd/ef.pdf"); - assertFalse(FileSystemFetcher.isDescendant(root, descendant)); + // Valid path within basePath should work + try (TikaInputStream tis = fetcher.fetch("safe.txt", new Metadata(), new ParseContext())) { + assertNotNull(tis); + } - descendant = root.resolve("../../ij.pdf"); - assertFalse(FileSystemFetcher.isDescendant(root, descendant)); + // Path traversal attempt should be rejected + assertThrows(SecurityException.class, () -> { + fetcher.fetch("../secret.txt", new Metadata(), new ParseContext()); + }); } @Test - public void testNullByte() throws Exception { + public void testDeepPathTraversalBlocked() throws Exception { + // Create nested directories + Path basePath = tempDir.resolve("a/b/c"); + Files.createDirectories(basePath); + + Path fileInBase = 
basePath.resolve("file.txt"); + Files.writeString(fileInBase, "nested content"); + + Path fileOutsideBase = tempDir.resolve("outside.txt"); + Files.writeString(fileOutsideBase, "outside content"); + + Fetcher fetcher = createFetcher(basePath, null); + + // Deep path traversal should be rejected + assertThrows(SecurityException.class, () -> { + fetcher.fetch("../../../outside.txt", new Metadata(), new ParseContext()); + }); + + // Even deeper traversal should be rejected + assertThrows(SecurityException.class, () -> { + fetcher.fetch("../../../../../../../../etc/passwd", new Metadata(), new ParseContext()); + }); + } + + @Test + public void testAllowAbsolutePathsRequired() throws Exception { + // Without basePath and without allowAbsolutePaths, should throw assertThrows(TikaConfigException.class, () -> { - ExtensionConfig pluginConfig = new ExtensionConfig("test", "test", - "{ \"basePath\":\"bad\\u0000path\"}"); - Fetcher f = new FileSystemFetcherFactory().buildExtension(pluginConfig); + createFetcher(null, null); }); } + + @Test + public void testAllowAbsolutePathsWorks() throws Exception { + // Create a file to fetch + Path testFile = tempDir.resolve("test.txt"); + Files.writeString(testFile, "test content"); + + // With allowAbsolutePaths=true and no basePath, should work + Fetcher fetcher = createFetcher(null, true); + + // Fetch using absolute path + try (TikaInputStream tis = fetcher.fetch( + testFile.toAbsolutePath().toString(), new Metadata(), new ParseContext())) { + assertNotNull(tis); + } + } } diff --git a/tika-serialization/pom.xml b/tika-serialization/pom.xml index 186146bc1..e9401b73c 100644 --- a/tika-serialization/pom.xml +++ b/tika-serialization/pom.xml @@ -90,6 +90,15 @@ </dependencies> <build> <plugins> + <plugin> + <groupId>org.apache.rat</groupId> + <artifactId>apache-rat-plugin</artifactId> + <configuration> + <excludes> + <exclude>**/test-documents/**</exclude> + </excludes> + </configuration> + </plugin> <plugin> 
<groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId>
