This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch haystack-pipes-parsemode in repository https://gitbox.apache.org/repos/asf/tika.git
commit 210fe3a77d9dea77799b0a7b278816e53bd98770 Author: tballison <[email protected]> AuthorDate: Wed May 13 12:46:25 2026 -0400 improve parsemode configuration --- docs/modules/ROOT/pages/pipes/parse-modes.adoc | 7 ++++--- .../tika/pipes/core/server/ConnectionHandler.java | 2 +- .../apache/tika/pipes/core/server/PipesServer.java | 6 ++++-- .../apache/tika/pipes/core/server/PipesWorker.java | 8 +++++++- .../tika/pipes/core/server/ServerProtocolIO.java | 19 ++++++++++++++----- .../tika/pipes/core/server/SharedServerResources.java | 1 + 6 files changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc index 2a1af6a593..6e5f47fa4e 100644 --- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc +++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc @@ -20,7 +20,8 @@ :toclevels: 3 Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted. -The parse mode is set on the `ParseContext` or configured in `PipesConfig`. +The parse mode is configured in the `pipes` section of the JSON config, or overridden per-request +in the `parseContext` field of a `FetchEmitTuple`. == Available Parse Modes @@ -87,7 +88,7 @@ to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`). [source,json] ---- { - "parseContext": { + "pipes": { "parseMode": "CONCATENATE" } } @@ -125,7 +126,7 @@ useful for: [source,json] ---- { - "parseContext": { + "pipes": { "parseMode": "CONTENT_ONLY" } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java index 3162f0922a..c6f802e516 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java @@ -230,7 +230,7 @@ public class ConnectionHandler implements Runnable, Closeable { resources.getEmitStrategy(), resources.getEmitterManager(), threshold); return new PipesWorker(fetchEmitTuple, mergedContext, resources.getAutoDetectParser(), resources.getEmitterManager(), fetchHandler, parseHandler, emitHandler, - resources.getDefaultMetadataWriteLimiterFactory()); + resources.getDefaultMetadataWriteLimiterFactory(), pipesConfig.getParseMode()); } private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext mergedContext, diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index fb7a74551f..1fd9df1a2a 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -61,6 +61,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.core.EmitStrategy; import org.apache.tika.pipes.core.EmitStrategyConfig; @@ -403,7 +404,8 @@ public class PipesServer implements AutoCloseable { long threshold = (thresholdBytes != null) ? thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES; EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold); return new PipesWorker(fetchEmitTuple, mergedContext, autoDetectParser, emitterManager, - fetchHandler, parseHandler, emitHandler, defaultMetadataWriteLimiterFactory); + fetchHandler, parseHandler, emitHandler, defaultMetadataWriteLimiterFactory, + pipesConfig.getParseMode()); } private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext mergedContext, @@ -550,7 +552,7 @@ public class PipesServer implements AutoCloseable { if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null) { mergedContext.set(EmbeddedDocumentExtractorFactory.class, new UnpackExtractorFactory()); } - // Overlay request's values (request takes precedence) + // Request-level values override config defaults mergedContext.copyFrom(requestContext); return mergedContext; } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index 136853e5d2..a76defc641 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -69,10 +69,12 @@ class PipesWorker implements Callable<PipesResult> { private final ParseHandler parseHandler; private final EmitHandler emitHandler; private final MetadataWriteLimiterFactory defaultMetadataWriteLimiterFactory; + private final ParseMode defaultParseMode; public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext parseContext, AutoDetectParser autoDetectParser, EmitterManager emitterManager, FetchHandler fetchHandler, ParseHandler parseHandler, - EmitHandler emitHandler, MetadataWriteLimiterFactory defaultMetadataWriteLimiterFactory) { + EmitHandler emitHandler, MetadataWriteLimiterFactory defaultMetadataWriteLimiterFactory, + ParseMode defaultParseMode) { this.fetchEmitTuple = fetchEmitTuple; this.parseContext = parseContext; this.autoDetectParser = autoDetectParser; @@ -81,6 +83,7 @@ class PipesWorker implements Callable<PipesResult> { this.parseHandler = parseHandler; this.emitHandler = emitHandler; this.defaultMetadataWriteLimiterFactory = defaultMetadataWriteLimiterFactory; + this.defaultParseMode = defaultParseMode; } @Override @@ -607,6 +610,9 @@ class PipesWorker implements Callable<PipesResult> { } ParseMode parseMode = parseContext.get(ParseMode.class); + if (parseMode == null) { + parseMode = defaultParseMode; + } UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); // For UNPACK mode, automatically set up byte extraction diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java index 3d71f87457..531db0036f 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java @@ -20,6 +20,9 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -45,6 +48,8 @@ import org.apache.tika.utils.StringUtils; */ public class ServerProtocolIO { + private static final Logger LOG = LoggerFactory.getLogger(ServerProtocolIO.class); + private final DataInputStream input; private final DataOutputStream output; @@ -122,12 +127,16 @@ public class ServerProtocolIO { UnpackConfig unpackConfig = requestContext.get(UnpackConfig.class); ParseMode parseMode = requestContext.get(ParseMode.class); + // Warn (don't throw) when UnpackConfig has an emitter but ParseMode is not UNPACK. + // The global parse-context may include UnpackConfig as a default for UNPACK pipe runs, + // but the /rmeta and /tika endpoints explicitly set RMETA mode and PipesWorker correctly + // ignores UnpackConfig for non-UNPACK modes. Throwing here would crash the child process. if (unpackConfig != null && !StringUtils.isBlank(unpackConfig.getEmitter()) - && parseMode != ParseMode.UNPACK) { - throw new TikaConfigException( - "FetchEmitTuple has UnpackConfig with emitter '" + unpackConfig.getEmitter() + - "' but ParseMode is " + parseMode + ". " + - "To extract embedded bytes, set ParseMode.UNPACK in the ParseContext."); + && parseMode != null && parseMode != ParseMode.UNPACK) { + LOG.warn("FetchEmitTuple has UnpackConfig with emitter '{}' but ParseMode is {}. " + + "UnpackConfig will be ignored. " + + "To extract embedded bytes, set ParseMode.UNPACK in the ParseContext.", + unpackConfig.getEmitter(), parseMode); } } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java index e92f3455cd..cb5f8412a6 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java @@ -158,6 +158,7 @@ public class SharedServerResources { if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null) { mergedContext.set(EmbeddedDocumentExtractorFactory.class, new UnpackExtractorFactory()); } + // Request-level values override config defaults mergedContext.copyFrom(requestContext); return mergedContext; }
