This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4626 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 006efccb319edc6aeb8893ad9d9b8057e9b0edb4 Author: tallison <[email protected]> AuthorDate: Wed Jan 21 06:37:49 2026 -0500 TIKA-4626 - swap in tika-pipes for /tika and /rmeta endpoints --- .../org/apache/tika/pipes/core/PipesClient.java | 3 + .../tika/pipes/core/server/ParseHandler.java | 8 + .../apache/tika/pipes/core/server/PipesServer.java | 2 + .../apache/tika/pipes/core/server/PipesWorker.java | 12 +- .../apache/tika/server/core/TikaServerProcess.java | 12 +- .../server/core/resource/PipesParsingHelper.java | 117 +++----- .../core/resource/RecursiveMetadataResource.java | 38 +-- .../org/apache/tika/server/core/CXFTestBase.java | 59 +++- .../server/core/benchmark/TikaServerBenchmark.java | 312 ++++++++++----------- tika-server/tika-server-standard/pom.xml | 2 +- 10 files changed, 269 insertions(+), 296 deletions(-) diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java index e79c35ccfe..053856bbeb 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesClient.java @@ -423,7 +423,9 @@ public class PipesClient implements Closeable { int len = serverTuple.input.readInt(); byte[] bytes = new byte[len]; serverTuple.input.readFully(bytes); + writeAck(); + try (ObjectInputStream objectInputStream = new ObjectInputStream(UnsynchronizedByteArrayInputStream .builder() .setByteArray(bytes) @@ -492,6 +494,7 @@ public class PipesClient implements Closeable { } } socket.setSoTimeout((int) pipesConfig.getSocketTimeoutMs()); + socket.setTcpNoDelay(true); // Disable Nagle's algorithm to avoid ~40ms delays on small writes serverTuple = new ServerTuple(process, serverSocket, socket, new DataInputStream(socket.getInputStream()), new DataOutputStream(socket.getOutputStream()), tmpDir); waitForStartup(); diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index af3e75f50a..a28b2c15dc 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -34,6 +34,7 @@ import org.apache.tika.digest.Digester; import org.apache.tika.digest.SkipContainerDocumentDigest; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; import org.apache.tika.io.TikaInputStream; @@ -221,11 +222,15 @@ class ParseHandler { //queue better be empty. we deserve an exception if not intermediateResult.add(metadata); countDownLatch.await(); + boolean writeLimitReached = false; try { autoDetectParser.parse(stream, handler, metadata, parseContext); } catch (SAXException e) { containerException = ExceptionUtils.getStackTrace(e); LOG.warn("sax problem:" + fetchEmitTuple.getId(), e); + if (WriteLimitReachedException.isWriteLimitReached(e)) { + writeLimitReached = true; + } } catch (EncryptedDocumentException e) { containerException = ExceptionUtils.getStackTrace(e); LOG.warn("encrypted document:" + fetchEmitTuple.getId(), e); @@ -240,6 +245,9 @@ class ParseHandler { if (containerException != null) { metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, containerException); } + if (writeLimitReached) { + metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, true); + } if (LOG.isTraceEnabled()) { LOG.trace("timer -- parse only time: {} ms", System.currentTimeMillis() - start); } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index 66d5d9ae50..a97e8557a9 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -165,6 +165,7 @@ public class PipesServer implements AutoCloseable { LOG.debug("pipesClientId={}: connecting to client on port={}", pipesClientId, port); Socket socket = new Socket(); socket.connect(new InetSocketAddress(InetAddress.getLoopbackAddress(), port), PipesClient.SOCKET_CONNECT_TIMEOUT_MS); + socket.setTcpNoDelay(true); // Disable Nagle's algorithm to avoid ~40ms delays on small writes DataInputStream dis = new DataInputStream(socket.getInputStream()); DataOutputStream dos = new DataOutputStream(socket.getOutputStream()); @@ -443,6 +444,7 @@ public class PipesServer implements AutoCloseable { int length = input.readInt(); byte[] bytes = new byte[length]; input.readFully(bytes); + try (ObjectInputStream objectInputStream = new ObjectInputStream( UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) { return (FetchEmitTuple) objectInputStream.readObject(); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index b779388127..d8315cfd9f 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -18,8 +18,6 @@ package org.apache.tika.pipes.core.server; import java.io.Closeable; import java.io.IOException; -import java.time.Duration; -import java.time.Instant; import java.util.List; import java.util.concurrent.Callable; @@ -71,23 +69,15 @@ class PipesWorker implements Callable<PipesResult> { @Override public PipesResult call() 
throws Exception { - Instant start = Instant.now(); - - if (LOG.isTraceEnabled()) { - LOG.trace("timer -- got fetcher: {}ms", Duration.between(start, Instant.now()).toMillis()); - } - start = Instant.now(); MetadataListAndEmbeddedBytes parseData = null; try { //this can be null if there is a fetch exception ParseDataOrPipesResult parseDataResult = parseFromTuple(); + if (parseDataResult.pipesResult != null) { return parseDataResult.pipesResult; } - if (LOG.isTraceEnabled()) { - LOG.trace("timer -- to parse: {} ms", Duration.between(start, Instant.now()).toMillis()); - } parseData = parseDataResult.parseDataResult; if (parseData == null || metadataIsEmpty(parseData.getMetadataList())) { diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index 0b7cd9cc5f..4edf08bc9a 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -420,7 +420,7 @@ public class TikaServerProcess { * Initializes the PipesParsingHelper for pipes-based parsing with process isolation. * <p> * The PipesParser will be configured with PASSBACK_ALL emit strategy so that - * parsed content is returned directly instead of being emitted to an external emitter. + * parsed results are returned through the socket connection. * <p> * If no config file is provided, a minimal default configuration will be created. * The plugin-roots will default to a "plugins" directory at the same level as the server jar. 
@@ -430,25 +430,23 @@ public class TikaServerProcess { * @throws Exception if pipes initialization fails */ private static PipesParsingHelper initPipesParsingHelper(TikaServerConfig tikaServerConfig) throws Exception { - TikaJsonConfig tikaJsonConfig; + // Load or create config Path configPath; - if (tikaServerConfig.hasConfigFile()) { configPath = tikaServerConfig.getConfigPath(); - tikaJsonConfig = TikaJsonConfig.load(configPath); } else { - // Create minimal config - will use defaults configPath = createDefaultConfig(); - tikaJsonConfig = TikaJsonConfig.load(configPath); } + TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(configPath); + // Load or create PipesConfig with defaults PipesConfig pipesConfig = tikaJsonConfig.deserialize("pipes", PipesConfig.class); if (pipesConfig == null) { pipesConfig = new PipesConfig(); } - // Force PASSBACK_ALL strategy so results are returned to us (not emitted) + // Use PASSBACK_ALL strategy: results are returned through the socket pipesConfig.setEmitStrategy(new EmitStrategyConfig(EmitStrategy.PASSBACK_ALL)); // Create PipesParser diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java index 290a4fc0b1..51f85d72ec 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java @@ -40,8 +40,6 @@ import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.emitter.EmitData; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.fetcher.FetchKey; -import org.apache.tika.pipes.core.EmitStrategy; -import org.apache.tika.pipes.core.EmitStrategyConfig; import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.pipes.core.PipesException; import 
org.apache.tika.pipes.core.PipesParser; @@ -76,22 +74,22 @@ public class PipesParsingHelper { private final PipesParser pipesParser; private final PipesConfig pipesConfig; - private final Path tempDirectory; + private final Path inputTempDirectory; public PipesParsingHelper(PipesParser pipesParser, PipesConfig pipesConfig) { this.pipesParser = pipesParser; this.pipesConfig = pipesConfig; - // Determine temp directory + // Determine input temp directory String configTempDir = pipesConfig.getTempDirectory(); if (configTempDir != null && !configTempDir.isBlank()) { - this.tempDirectory = Paths.get(configTempDir); - if (!Files.isDirectory(this.tempDirectory)) { + this.inputTempDirectory = Paths.get(configTempDir); + if (!Files.isDirectory(this.inputTempDirectory)) { throw new IllegalArgumentException( "Configured tempDirectory does not exist or is not a directory: " + configTempDir); } } else { - this.tempDirectory = null; // Use system default + this.inputTempDirectory = null; // Use system default } } @@ -108,29 +106,29 @@ public class PipesParsingHelper { */ public List<Metadata> parse(InputStream inputStream, Metadata metadata, ParseContext parseContext, ParseMode parseMode) throws IOException { - Path tempFile = null; + Path inputTempFile = null; + String requestId = UUID.randomUUID().toString(); + try { // Write input stream to temp file - tempFile = createTempFile(); - Files.copy(inputStream, tempFile, StandardCopyOption.REPLACE_EXISTING); + inputTempFile = createInputTempFile(); + Files.copy(inputStream, inputTempFile, StandardCopyOption.REPLACE_EXISTING); // Set parse mode in context parseContext.set(ParseMode.class, parseMode); - // Set emit strategy override to PASSBACK_ALL - we want results returned, not emitted - parseContext.set(EmitStrategyConfig.class, new EmitStrategyConfig(EmitStrategy.PASSBACK_ALL)); + // Create FetchEmitTuple - use NO_EMIT since we're using PASSBACK_ALL + FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, 
inputTempFile.toAbsolutePath().toString()); - // Create FetchEmitTuple - FetchKey fetchKey = new FetchKey(DEFAULT_FETCHER_ID, tempFile.toAbsolutePath().toString()); FetchEmitTuple tuple = new FetchEmitTuple( - UUID.randomUUID().toString(), + requestId, fetchKey, EmitKey.NO_EMIT, metadata, parseContext ); - // Execute parse via pipes + // Execute parse via pipes - results will be passed back through socket PipesResult result = pipesParser.parse(tuple); // Process result @@ -142,20 +140,19 @@ public class PipesParsingHelper { } catch (PipesException e) { throw new TikaServerParseException(e); } finally { - // Clean up temp file - if (tempFile != null) { + // Clean up input temp file + if (inputTempFile != null) { try { - Files.deleteIfExists(tempFile); + Files.deleteIfExists(inputTempFile); } catch (IOException e) { - LOG.warn("Failed to delete temp file: {}", tempFile, e); + LOG.warn("Failed to delete input temp file: {}", inputTempFile, e); } } } } /** - * Processes the PipesResult and extracts metadata list. - * Throws appropriate exceptions for error states. + * Processes the PipesResult and returns the metadata list. 
*/ private List<Metadata> processResult(PipesResult result) { if (result.isProcessCrash()) { @@ -183,66 +180,22 @@ public class PipesParsingHelper { Response.Status.INTERNAL_SERVER_ERROR); } - // Success cases + // Get metadata from result EmitData emitData = result.emitData(); - if (emitData == null) { - LOG.debug("Parse returned null emitData, status: {}", result.status()); - // Check if there's an exception message in the result - String message = result.message(); - if (message != null && !message.isEmpty()) { - // Create metadata with exception info - Metadata metadata = new Metadata(); - metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, message); - return Collections.singletonList(metadata); - } - return Collections.emptyList(); + if (emitData != null && emitData.getMetadataList() != null) { + return emitData.getMetadataList(); } - List<Metadata> metadataList = emitData.getMetadataList(); - if (metadataList == null) { - return Collections.emptyList(); + // Empty result + LOG.debug("Parse returned empty result, status: {}", result.status()); + String message = result.message(); + if (message != null && !message.isEmpty()) { + Metadata errorMetadata = new Metadata(); + errorMetadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, message); + return Collections.singletonList(errorMetadata); } - // Handle parse success with exception - always add exception info to metadata - // This includes PARSE_SUCCESS_WITH_EXCEPTION, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_SUCCESS_PASSBACK - String stackTrace = emitData.getContainerStackTrace(); - boolean hasException = stackTrace != null && !stackTrace.isEmpty(); - - if (hasException && !metadataList.isEmpty()) { - // Check if this was a WriteLimitReached exception and set the flag - checkWriteLimitReached(metadataList, stackTrace); - // Add the stack trace to the metadata if not already set by pipes - Metadata firstMetadata = metadataList.get(0); - if (firstMetadata.get(TikaCoreProperties.CONTAINER_EXCEPTION) == null) { - 
firstMetadata.set(TikaCoreProperties.CONTAINER_EXCEPTION, stackTrace); - } - } - - return metadataList; - } - - /** - * Checks if the parse result was due to write limit being reached. - * This is a "soft" exception that should still return HTTP 200. - * If detected from stack trace but not in metadata, sets the metadata flag. - */ - private boolean checkWriteLimitReached(List<Metadata> metadataList, String stackTrace) { - if (metadataList.isEmpty()) { - return false; - } - Metadata metadata = metadataList.get(0); - // Check metadata flag (set by RecursiveParserWrapper or CompositeParser) - String flagValue = metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED); - if ("true".equals(flagValue)) { - return true; - } - // Also check stack trace for WriteLimitReachedException - if (stackTrace != null && stackTrace.contains("WriteLimitReachedException")) { - // Set the metadata flag if not already set (for consistency) - metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true"); - return true; - } - return false; + return Collections.emptyList(); } /** @@ -265,13 +218,13 @@ public class PipesParsingHelper { } /** - * Creates a temp file in the configured temp directory. + * Creates a temp file for input in the configured temp directory. 
*/ - private Path createTempFile() throws IOException { - if (tempDirectory != null) { - return Files.createTempFile(tempDirectory, "tika-server-", ".tmp"); + private Path createInputTempFile() throws IOException { + if (inputTempDirectory != null) { + return Files.createTempFile(inputTempDirectory, "tika-server-input-", ".tmp"); } else { - return Files.createTempFile("tika-server-", ".tmp"); + return Files.createTempFile("tika-server-input-", ".tmp"); } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index 2ea92c72cb..eda085354b 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -17,7 +17,6 @@ package org.apache.tika.server.core.resource; import static org.apache.tika.server.core.resource.TikaResource.fillMetadata; -import static org.apache.tika.server.core.resource.TikaResource.getTikaLoader; import static org.apache.tika.server.core.resource.TikaResource.getWriteLimit; import static org.apache.tika.server.core.resource.TikaResource.setupContentHandlerFactory; import static org.apache.tika.server.core.resource.TikaResource.setupContentHandlerFactoryIfNeeded; @@ -42,7 +41,6 @@ import org.slf4j.LoggerFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -55,7 +53,12 @@ public class RecursiveMetadataResource { protected static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.XML; private 
static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class); - public static List<Metadata> parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders, + /** + * Parses content and returns metadata list. + * Metadata filtering is done in the child process, so no filtering needed here. + */ + public static List<Metadata> parseMetadata(TikaInputStream tis, Metadata metadata, + MultivaluedMap<String, String> httpHeaders, ServerHandlerConfig handlerConfig) throws Exception { @@ -68,10 +71,8 @@ public class RecursiveMetadataResource { setupContentHandlerFactory(context, handlerConfig.type().toString(), handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached()); - List<Metadata> metadataList = TikaResource.parseWithPipes(tis, metadata, context, ParseMode.RMETA); - MetadataFilter metadataFilter = context.get(MetadataFilter.class, getTikaLoader().loadMetadataFilters()); - metadataFilter.filter(metadataList); - return metadataList; + // Filtering is done in child process, no need to filter again + return TikaResource.parseWithPipes(tis, metadata, context, ParseMode.RMETA); } static ServerHandlerConfig buildHandlerConfig(MultivaluedMap<String, String> httpHeaders, String handlerTypeName, ParseMode parseMode) { @@ -113,10 +114,9 @@ public class RecursiveMetadataResource { @Path("form{" + HANDLER_TYPE_PARAM + " : (\\w+)?}") public Response getMetadataFromMultipart(Attachment att, @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName) throws Exception { try (TikaInputStream tis = TikaInputStream.get(att.getObject(InputStream.class))) { - return Response - .ok(parseMetadataToMetadataList(tis, new Metadata(), att.getHeaders(), - buildHandlerConfig(att.getHeaders(), handlerTypeName, ParseMode.RMETA))) - .build(); + List<Metadata> metadataList = parseMetadata(tis, new Metadata(), att.getHeaders(), + buildHandlerConfig(att.getHeaders(), handlerTypeName, ParseMode.RMETA)); + return Response.ok(new 
MetadataList(metadataList)).build(); } } @@ -153,9 +153,8 @@ public class RecursiveMetadataResource { setupContentHandlerFactoryIfNeeded(context, handlerConfig.type().toString(), handlerConfig.writeLimit(), handlerConfig.throwOnWriteLimitReached()); + // Filtering is done in child process, no need to filter again List<Metadata> metadataList = TikaResource.parseWithPipes(tis, metadata, context, ParseMode.RMETA); - MetadataFilter metadataFilter = context.get(MetadataFilter.class, getTikaLoader().loadMetadataFilters()); - metadataFilter.filter(metadataList); return new MetadataList(metadataList); } @@ -188,16 +187,9 @@ public class RecursiveMetadataResource { public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(is)) { - return Response - .ok(parseMetadataToMetadataList(tis, metadata, httpHeaders.getRequestHeaders(), - buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, ParseMode.RMETA))) - .build(); + List<Metadata> metadataList = parseMetadata(tis, metadata, httpHeaders.getRequestHeaders(), + buildHandlerConfig(httpHeaders.getRequestHeaders(), handlerTypeName, ParseMode.RMETA)); + return Response.ok(new MetadataList(metadataList)).build(); } } - - private MetadataList parseMetadataToMetadataList(TikaInputStream tis, Metadata metadata, - MultivaluedMap<String, String> httpHeaders, ServerHandlerConfig handlerConfig) - throws Exception { - return new MetadataList(parseMetadata(tis, metadata, httpHeaders, handlerConfig)); - } } diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index 1a1b9cacfe..d73c002546 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@ -181,16 +181,18 @@ public abstract class CXFTestBase { public void setUp() throws Exception { Path tmp = Files.createTempFile("tika-server-test-", ".json"); try { + // Copy tika config to temp file first + Files.copy(getTikaConfigInputStream(), tmp, StandardCopyOption.REPLACE_EXISTING); + InputStream pipesConfigInputStream = getPipesConfigInputStream(); if (pipesConfigInputStream != null) { - this.pipesConfigPath = Files.createTempFile("tika-server-pipes-", ".json"); - Files.copy(pipesConfigInputStream, this.pipesConfigPath, StandardCopyOption.REPLACE_EXISTING); + // Test provided its own pipes config - merge in PASSBACK_ALL emit strategy + this.pipesConfigPath = mergePassbackAllStrategy(pipesConfigInputStream); } else { - // Create a default pipes config for tests - this.pipesConfigPath = createDefaultTestConfig(); + // Create a default pipes config, merging metadata-filters from tika config + this.pipesConfigPath = createDefaultTestConfig(tmp); } - Files.copy(getTikaConfigInputStream(), tmp, StandardCopyOption.REPLACE_EXISTING); this.tika = TikaLoader.load(tmp); // Initialize PipesParsingHelper for pipes-based parsing @@ -231,12 +233,53 @@ public abstract class CXFTestBase { server = sf.create(); } + /** + * Merges PASSBACK_ALL emit strategy into a pipes config. + * This ensures the child process uses PASSBACK_ALL regardless of what's in the config file. 
+ */ + private Path mergePassbackAllStrategy(InputStream pipesConfigInputStream) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + com.fasterxml.jackson.databind.node.ObjectNode root = (com.fasterxml.jackson.databind.node.ObjectNode) mapper.readTree(pipesConfigInputStream); + + // Get or create pipes section + com.fasterxml.jackson.databind.node.ObjectNode pipes = (com.fasterxml.jackson.databind.node.ObjectNode) root.get("pipes"); + if (pipes == null) { + pipes = mapper.createObjectNode(); + root.set("pipes", pipes); + } + + // Set emit strategy to PASSBACK_ALL + com.fasterxml.jackson.databind.node.ObjectNode emitStrategy = mapper.createObjectNode(); + emitStrategy.put("type", "PASSBACK_ALL"); + pipes.set("emitStrategy", emitStrategy); + + Path tempConfig = Files.createTempFile("tika-server-pipes-", ".json"); + mapper.writerWithDefaultPrettyPrinter().writeValue(tempConfig.toFile(), root); + return tempConfig; + } + /** * Creates a default test config with pipes configuration. + * If the tika config contains metadata-filters, they are merged into the pipes config. 
+ * + * @param tikaConfigPath path to the tika config (may contain metadata-filters) */ - private Path createDefaultTestConfig() throws IOException { + private Path createDefaultTestConfig(Path tikaConfigPath) throws IOException { Path pluginsDir = Paths.get("target/plugins").toAbsolutePath(); + // Read tika config to check for metadata-filters + String metadataFiltersJson = ""; + try { + ObjectMapper mapper = new ObjectMapper(); + JsonNode tikaConfig = mapper.readTree(tikaConfigPath.toFile()); + JsonNode metadataFilters = tikaConfig.get("metadata-filters"); + if (metadataFilters != null && !metadataFilters.isEmpty()) { + metadataFiltersJson = ",\n \"metadata-filters\": " + mapper.writeValueAsString(metadataFilters); + } + } catch (Exception e) { + LOG.debug("Could not read metadata-filters from tika config: {}", e.getMessage()); + } + String configJson = String.format(Locale.ROOT, """ { "fetchers": { @@ -250,9 +293,9 @@ public abstract class CXFTestBase { "numClients": 2, "timeoutMillis": 60000 }, - "plugin-roots": "%s" + "plugin-roots": "%s"%s } - """, pluginsDir.toString().replace("\\", "/")); + """, pluginsDir.toString().replace("\\", "/"), metadataFiltersJson); Path tempConfig = Files.createTempFile("tika-test-default-config-", ".json"); Files.writeString(tempConfig, configJson); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/benchmark/TikaServerBenchmark.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/benchmark/TikaServerBenchmark.java index bbaed1080b..795a22d09f 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/benchmark/TikaServerBenchmark.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/benchmark/TikaServerBenchmark.java @@ -60,8 +60,8 @@ import java.util.concurrent.atomic.AtomicInteger; * --async Async mode: all requests sent immediately (stress test) * * Size mode options: - * --small-kb=N Size of small files in KB 
(default: 1) - * --large-kb=N Size of large files in KB (default: 100) + * --small-times=N Number of paragraph repetitions for small output (default: 10) + * --large-times=N Number of paragraph repetitions for large output (default: 1000) * * Sleep mode options: * --short-ms=N Short sleep duration in ms (default: 10) @@ -70,24 +70,33 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class TikaServerBenchmark { - private static final String MOCK_XML_SIZE_TEMPLATE = """ - <?xml version="1.0" encoding="UTF-8" ?> - <mock> - <metadata action="add" name="author">Benchmark Test</metadata> - <metadata action="add" name="title">Performance Test Document</metadata> - <write element="p">%s</write> - </mock> - """; + // Template with both sleep (parse time) and output size (times) + // Format args: sleepMs, times + // Padding added to avoid zip bomb detection (need >10KB input for 1MB output at 100:1 ratio) + private static final String MOCK_XML_TEMPLATE; - private static final String MOCK_XML_SLEEP_TEMPLATE = """ + static { + StringBuilder sb = new StringBuilder(); + sb.append(""" <?xml version="1.0" encoding="UTF-8" ?> <mock> <metadata action="add" name="author">Benchmark Test</metadata> - <metadata action="add" name="title">Sleep Test Document</metadata> + <metadata action="add" name="title">Performance Test Document</metadata> <hang millis="%d" heavy="false" interruptible="false" /> - <write element="p">Test content after sleep</write> + <write element="p" times="%d">Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore.</write> + <!-- Padding to increase input size and avoid zip bomb detection: + """); + // Add ~12KB of padding (120 lines of 100 chars each) + String paddingLine = "PADDING: Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt.\n"; + for (int i = 0; i < 120; i++) { + sb.append(paddingLine); + } + sb.append(""" + --> </mock> - """; + """); + MOCK_XML_TEMPLATE = sb.toString(); + } private final String baseUrl; private final String endpoint; @@ -95,14 +104,13 @@ public class TikaServerBenchmark { private final int count; private final int warmupCount; private final int repeats; - private final String mode; private final boolean syncMode; - // Size mode params - private final int smallSizeKb; - private final int largeSizeKb; + // Output size params (times = number of paragraph repetitions) + private final int smallTimes; + private final int largeTimes; - // Sleep mode params + // Parse time params private final int shortSleepMs; private final int longSleepMs; @@ -110,22 +118,24 @@ public class TikaServerBenchmark { private final ExecutorService httpExecutor; private final ExecutorService taskExecutor; - private byte[] smallContent; - private byte[] largeContent; + // 2x2 matrix: [short/long sleep] x [small/large output] + private byte[] shortSmallContent; // short parse, small output + private byte[] shortLargeContent; // short parse, large output + private byte[] longSmallContent; // long parse, small output + private byte[] longLargeContent; // long parse, large output public TikaServerBenchmark(String baseUrl, String endpoint, int threads, int count, - int warmupCount, int repeats, String mode, boolean syncMode, - int smallSizeKb, int largeSizeKb, int shortSleepMs, int longSleepMs) { + int warmupCount, int repeats, boolean syncMode, + int smallTimes, int largeTimes, int shortSleepMs, int longSleepMs) { this.baseUrl = baseUrl; this.endpoint = endpoint; this.threads = threads; this.count = 
count; this.warmupCount = warmupCount; this.repeats = repeats; - this.mode = mode; this.syncMode = syncMode; - this.smallSizeKb = smallSizeKb; - this.largeSizeKb = largeSizeKb; + this.smallTimes = smallTimes; + this.largeTimes = largeTimes; this.shortSleepMs = shortSleepMs; this.longSleepMs = longSleepMs; @@ -143,54 +153,33 @@ public class TikaServerBenchmark { } private void generateTestContent() { - if ("sleep".equals(mode)) { - smallContent = generateSleepMockXml(shortSleepMs); - largeContent = generateSleepMockXml(longSleepMs); - } else { - smallContent = generateSizeMockXml(smallSizeKb * 1024); - largeContent = generateSizeMockXml(largeSizeKb * 1024); - } - } - - private byte[] generateSizeMockXml(int targetSizeBytes) { - StringBuilder content = new StringBuilder(); - String baseText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + - "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " + - "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. 
"; - - while (content.length() < targetSizeBytes) { - content.append(baseText); - } - - String xml = String.format(Locale.ROOT, MOCK_XML_SIZE_TEMPLATE, - content.substring(0, Math.min(content.length(), targetSizeBytes))); - return xml.getBytes(StandardCharsets.UTF_8); + // 2x2 matrix of test content + shortSmallContent = generateMockXml(shortSleepMs, smallTimes); + shortLargeContent = generateMockXml(shortSleepMs, largeTimes); + longSmallContent = generateMockXml(longSleepMs, smallTimes); + longLargeContent = generateMockXml(longSleepMs, largeTimes); } - private byte[] generateSleepMockXml(int sleepMs) { - String xml = String.format(Locale.ROOT, MOCK_XML_SLEEP_TEMPLATE, sleepMs); + private byte[] generateMockXml(int sleepMs, int times) { + String xml = String.format(Locale.ROOT, MOCK_XML_TEMPLATE, sleepMs, times); return xml.getBytes(StandardCharsets.UTF_8); } public void run() throws Exception { System.out.println("=".repeat(70)); - System.out.println("Tika Server Performance Benchmark"); + System.out.println("Tika Server Performance Benchmark (2x2 Matrix)"); System.out.println("=".repeat(70)); System.out.println(); System.out.printf(Locale.ROOT, "Target URL: %s%s%n", baseUrl, endpoint); System.out.printf(Locale.ROOT, "Threads: %d%n", threads); System.out.printf(Locale.ROOT, "Requests/test: %d%n", count); System.out.printf(Locale.ROOT, "Repeats: %d%n", repeats); - System.out.printf(Locale.ROOT, "Mode: %s%n", mode); System.out.printf(Locale.ROOT, "Request mode: %s%n", syncMode ? 
"sync (realistic)" : "async (stress test)"); - - if ("sleep".equals(mode)) { - System.out.printf(Locale.ROOT, "Short sleep: %d ms%n", shortSleepMs); - System.out.printf(Locale.ROOT, "Long sleep: %d ms%n", longSleepMs); - } else { - System.out.printf(Locale.ROOT, "Small size: %d KB%n", smallSizeKb); - System.out.printf(Locale.ROOT, "Large size: %d KB%n", largeSizeKb); - } + System.out.println(); + System.out.println("Test Matrix:"); + System.out.printf(Locale.ROOT, " Parse time: short=%dms, long=%dms%n", shortSleepMs, longSleepMs); + System.out.printf(Locale.ROOT, " Output size: small=%d times (~%dKB), large=%d times (~%dKB)%n", + smallTimes, smallTimes * 100 / 1024, largeTimes, largeTimes * 100 / 1024); System.out.println(); // Check server is reachable @@ -201,31 +190,34 @@ public class TikaServerBenchmark { } System.out.println("Server is reachable."); - // Verify MockParser is being used (only for sleep mode) - if ("sleep".equals(mode)) { - if (!verifyMockParserInUse()) { - System.err.println("ERROR: MockParser is NOT being used by the server!"); - System.err.println("The tika-core test jar must be on the server's classpath."); - System.err.println("If using java -jar, the test jar must be in the manifest Class-Path."); - System.err.println("Try running with: java -cp 'tika-server.jar:lib/*' org.apache.tika.server.core.TikaServerCli"); - System.exit(1); - } - System.out.println("MockParser verified - sleep mode will work correctly."); + // Verify MockParser is being used + if (!verifyMockParserInUse()) { + System.err.println("ERROR: MockParser is NOT being used by the server!"); + System.err.println("The tika-core test jar must be on the server's classpath."); + System.err.println("If using java -jar, the test jar must be in the manifest Class-Path."); + System.err.println("Try running with: java -cp 'tika-server.jar:lib/*' org.apache.tika.server.core.TikaServerCli"); + System.exit(1); } + System.out.println("MockParser verified."); System.out.println(); // 
Warmup System.out.printf(Locale.ROOT, "Warming up with %d requests...%n", warmupCount); - runBenchmark(smallContent, warmupCount, "warmup", getSmallLabel()); + runBenchmark(shortSmallContent, warmupCount, "warmup", "warmup"); System.out.println("Warmup complete."); System.out.println(); - String firstLabel = getSmallLabel(); - String secondLabel = getLargeLabel(); + // Labels for the 2x2 matrix + String shortSmallLabel = String.format(Locale.ROOT, "short-%dms/small-%d", shortSleepMs, smallTimes); + String shortLargeLabel = String.format(Locale.ROOT, "short-%dms/large-%d", shortSleepMs, largeTimes); + String longSmallLabel = String.format(Locale.ROOT, "long-%dms/small-%d", longSleepMs, smallTimes); + String longLargeLabel = String.format(Locale.ROOT, "long-%dms/large-%d", longSleepMs, largeTimes); // Collect results across all repeats - List<BenchmarkResult> firstResults = new ArrayList<>(); - List<BenchmarkResult> secondResults = new ArrayList<>(); + List<BenchmarkResult> shortSmallResults = new ArrayList<>(); + List<BenchmarkResult> shortLargeResults = new ArrayList<>(); + List<BenchmarkResult> longSmallResults = new ArrayList<>(); + List<BenchmarkResult> longLargeResults = new ArrayList<>(); // Per-benchmark warmup count (10 requests per thread) int perBenchmarkWarmup = threads * 10; @@ -238,67 +230,74 @@ public class TikaServerBenchmark { System.out.println("*".repeat(70)); } - // First test (small/short) - System.out.println("-".repeat(70)); - System.out.printf(Locale.ROOT, "Running %s benchmark (%d requests)%n", firstLabel.toUpperCase(Locale.ROOT), count); - System.out.println("-".repeat(70)); - // Warmup for this benchmark (10 requests per thread, not counted) - System.out.printf(Locale.ROOT, " Per-benchmark warmup (%d requests)...%n", perBenchmarkWarmup); - runBenchmark(smallContent, perBenchmarkWarmup, "warmup", firstLabel); - BenchmarkResult firstResult = runBenchmark(smallContent, count, "first", firstLabel); - firstResults.add(firstResult); - 
printResults(firstResult, firstLabel); - System.out.println(); - - // Second test (large/long) - System.out.println("-".repeat(70)); - System.out.printf(Locale.ROOT, "Running %s benchmark (%d requests)%n", secondLabel.toUpperCase(Locale.ROOT), count); - System.out.println("-".repeat(70)); - // Warmup for this benchmark (10 requests per thread, not counted) - System.out.printf(Locale.ROOT, " Per-benchmark warmup (%d requests)...%n", perBenchmarkWarmup); - runBenchmark(largeContent, perBenchmarkWarmup, "warmup", secondLabel); - BenchmarkResult secondResult = runBenchmark(largeContent, count, "second", secondLabel); - secondResults.add(secondResult); - printResults(secondResult, secondLabel); + // Test 1: short parse, small output + shortSmallResults.add(runSingleBenchmark(shortSmallContent, perBenchmarkWarmup, shortSmallLabel)); + + // Test 2: short parse, large output + shortLargeResults.add(runSingleBenchmark(shortLargeContent, perBenchmarkWarmup, shortLargeLabel)); + + // Test 3: long parse, small output + longSmallResults.add(runSingleBenchmark(longSmallContent, perBenchmarkWarmup, longSmallLabel)); + + // Test 4: long parse, large output + longLargeResults.add(runSingleBenchmark(longLargeContent, perBenchmarkWarmup, longLargeLabel)); } // Calculate aggregated results - BenchmarkResult firstAgg = aggregateResults(firstResults); - BenchmarkResult secondAgg = aggregateResults(secondResults); + BenchmarkResult shortSmallAgg = aggregateResults(shortSmallResults); + BenchmarkResult shortLargeAgg = aggregateResults(shortLargeResults); + BenchmarkResult longSmallAgg = aggregateResults(longSmallResults); + BenchmarkResult longLargeAgg = aggregateResults(longLargeResults); - // Summary + // Summary - 2x2 Matrix format System.out.println(); - System.out.println("=".repeat(70)); + System.out.println("=".repeat(90)); if (repeats > 1) { System.out.printf(Locale.ROOT, "SUMMARY (averaged over %d repeats)%n", repeats); } else { System.out.println("SUMMARY"); } - 
System.out.println("=".repeat(70)); - System.out.printf(Locale.ROOT, "%-20s %18s %18s%n", "Metric", firstLabel, secondLabel); - System.out.println("-".repeat(70)); - System.out.printf(Locale.ROOT, "%-20s %18.2f %18.2f%n", "Throughput (req/s)", firstAgg.throughput, secondAgg.throughput); - System.out.printf(Locale.ROOT, "%-20s %18.2f %18.2f%n", "Avg Latency (ms)", firstAgg.avgLatencyMs, secondAgg.avgLatencyMs); - System.out.printf(Locale.ROOT, "%-20s %18.2f %18.2f%n", "P50 Latency (ms)", firstAgg.p50LatencyMs, secondAgg.p50LatencyMs); - System.out.printf(Locale.ROOT, "%-20s %18.2f %18.2f%n", "P95 Latency (ms)", firstAgg.p95LatencyMs, secondAgg.p95LatencyMs); - System.out.printf(Locale.ROOT, "%-20s %18.2f %18.2f%n", "P99 Latency (ms)", firstAgg.p99LatencyMs, secondAgg.p99LatencyMs); - System.out.printf(Locale.ROOT, "%-20s %18d %18d%n", "Success Count", firstAgg.successCount, secondAgg.successCount); - System.out.printf(Locale.ROOT, "%-20s %18d %18d%n", "Error Count", firstAgg.errorCount, secondAgg.errorCount); - System.out.println("=".repeat(70)); + System.out.println("=".repeat(90)); + + // Throughput matrix + System.out.println(); + System.out.println("THROUGHPUT (req/s):"); + System.out.printf(Locale.ROOT, "%-20s %20s %20s%n", "", "small-" + smallTimes, "large-" + largeTimes); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "short-" + shortSleepMs + "ms", shortSmallAgg.throughput, shortLargeAgg.throughput); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "long-" + longSleepMs + "ms", longSmallAgg.throughput, longLargeAgg.throughput); + + // Latency matrix + System.out.println(); + System.out.println("AVG LATENCY (ms):"); + System.out.printf(Locale.ROOT, "%-20s %20s %20s%n", "", "small-" + smallTimes, "large-" + largeTimes); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "short-" + shortSleepMs + "ms", shortSmallAgg.avgLatencyMs, shortLargeAgg.avgLatencyMs); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "long-" + 
longSleepMs + "ms", longSmallAgg.avgLatencyMs, longLargeAgg.avgLatencyMs); - // Output CSV-friendly line for easy comparison + // P95 Latency matrix System.out.println(); - System.out.println("CSV format (for comparison):"); - System.out.printf(Locale.ROOT, "mode,threads,repeats,%s_throughput,%s_p50,%s_p95,%s_throughput,%s_p50,%s_p95%n", - firstLabel, firstLabel, firstLabel, secondLabel, secondLabel, secondLabel); - System.out.printf(Locale.ROOT, "%s,%d,%d,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f%n", - mode, threads, repeats, - firstAgg.throughput, firstAgg.p50LatencyMs, firstAgg.p95LatencyMs, - secondAgg.throughput, secondAgg.p50LatencyMs, secondAgg.p95LatencyMs); + System.out.println("P95 LATENCY (ms):"); + System.out.printf(Locale.ROOT, "%-20s %20s %20s%n", "", "small-" + smallTimes, "large-" + largeTimes); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "short-" + shortSleepMs + "ms", shortSmallAgg.p95LatencyMs, shortLargeAgg.p95LatencyMs); + System.out.printf(Locale.ROOT, "%-20s %20.2f %20.2f%n", "long-" + longSleepMs + "ms", longSmallAgg.p95LatencyMs, longLargeAgg.p95LatencyMs); + + System.out.println(); + System.out.println("=".repeat(90)); shutdown(); } + private BenchmarkResult runSingleBenchmark(byte[] content, int perBenchmarkWarmup, String label) throws Exception { + System.out.println("-".repeat(70)); + System.out.printf(Locale.ROOT, "Running %s benchmark (%d requests)%n", label.toUpperCase(Locale.ROOT), count); + System.out.println("-".repeat(70)); + System.out.printf(Locale.ROOT, " Per-benchmark warmup (%d requests)...%n", perBenchmarkWarmup); + runBenchmark(content, perBenchmarkWarmup, "warmup", label); + BenchmarkResult result = runBenchmark(content, count, "test", label); + printResults(result, label); + System.out.println(); + return result; + } + private BenchmarkResult aggregateResults(List<BenchmarkResult> results) { if (results.size() == 1) { return results.get(0); @@ -314,14 +313,6 @@ public class TikaServerBenchmark { return new 
BenchmarkResult(avgThroughput, avgLatency, avgP50, avgP95, avgP99, avgMax, totalSuccess, totalErrors); } - private String getSmallLabel() { - return "sleep".equals(mode) ? "short-sleep" : "small-files"; - } - - private String getLargeLabel() { - return "sleep".equals(mode) ? "long-sleep" : "large-files"; - } - private boolean checkServerHealth() { try { HttpRequest request = HttpRequest.newBuilder() @@ -348,6 +339,7 @@ public class TikaServerBenchmark { .uri(URI.create(baseUrl + "/rmeta")) .header("Content-Type", "application/mock+xml") .header("Accept", "application/json") + .header("writeLimit", "-1") .PUT(HttpRequest.BodyPublishers.ofString(testXml)) .timeout(Duration.ofSeconds(10)) .build(); @@ -391,8 +383,8 @@ public class TikaServerBenchmark { AtomicInteger errorCount = new AtomicInteger(0); AtomicInteger completedCount = new AtomicInteger(0); - // Calculate appropriate timeout based on content - int timeoutSeconds = "sleep".equals(mode) ? Math.max(60, longSleepMs / 1000 + 30) : 60; + // Calculate appropriate timeout based on longest possible sleep time + int timeoutSeconds = Math.max(60, longSleepMs / 1000 + 30); // Divide requests among threads int requestsPerThread = requestCount / threads; @@ -451,8 +443,8 @@ public class TikaServerBenchmark { AtomicInteger errorCount = new AtomicInteger(0); AtomicInteger completedCount = new AtomicInteger(0); - // Calculate appropriate timeout based on content - int timeoutSeconds = "sleep".equals(mode) ? 
Math.max(60, longSleepMs / 1000 + 30) : 60; + // Calculate appropriate timeout based on longest possible sleep time + int timeoutSeconds = Math.max(60, longSleepMs / 1000 + 30); long startTime = System.nanoTime(); @@ -608,15 +600,14 @@ public class TikaServerBenchmark { String url = "http://localhost:9998"; String endpoint = "/tika"; int threads = 4; - int count = 1000; + int count = 100; int warmup = 100; int repeats = 1; - String mode = "size"; boolean syncMode = true; // default to sync (realistic) - int smallKb = 1; - int largeKb = 100; + int smallTimes = 10; + int largeTimes = 10000; int shortMs = 10; - int longMs = 5000; + int longMs = 500; for (String arg : args) { if (arg.startsWith("--url=")) { @@ -631,16 +622,14 @@ public class TikaServerBenchmark { warmup = Integer.parseInt(arg.substring(9)); } else if (arg.startsWith("--repeats=")) { repeats = Integer.parseInt(arg.substring(10)); - } else if (arg.startsWith("--mode=")) { - mode = arg.substring(7); } else if (arg.equals("--sync")) { syncMode = true; } else if (arg.equals("--async")) { syncMode = false; - } else if (arg.startsWith("--small-kb=")) { - smallKb = Integer.parseInt(arg.substring(11)); - } else if (arg.startsWith("--large-kb=")) { - largeKb = Integer.parseInt(arg.substring(11)); + } else if (arg.startsWith("--small-times=")) { + smallTimes = Integer.parseInt(arg.substring(14)); + } else if (arg.startsWith("--large-times=")) { + largeTimes = Integer.parseInt(arg.substring(14)); } else if (arg.startsWith("--short-ms=")) { shortMs = Integer.parseInt(arg.substring(11)); } else if (arg.startsWith("--long-ms=")) { @@ -651,13 +640,8 @@ public class TikaServerBenchmark { } } - if (!mode.equals("size") && !mode.equals("sleep")) { - System.err.println("Invalid mode: " + mode + ". 
Must be 'size' or 'sleep'."); - System.exit(1); - } - TikaServerBenchmark benchmark = new TikaServerBenchmark( - url, endpoint, threads, count, warmup, repeats, mode, syncMode, smallKb, largeKb, shortMs, longMs); + url, endpoint, threads, count, warmup, repeats, syncMode, smallTimes, largeTimes, shortMs, longMs); try { benchmark.run(); @@ -669,7 +653,9 @@ public class TikaServerBenchmark { } private static void printHelp() { - System.out.println("Tika Server Performance Benchmark"); + System.out.println("Tika Server Performance Benchmark (2x2 Matrix)"); + System.out.println(); + System.out.println("Runs a 2x2 matrix of tests: [short/long parse time] x [small/large output]"); System.out.println(); System.out.println("Usage: java TikaServerBenchmark [options]"); System.out.println(); @@ -677,30 +663,28 @@ public class TikaServerBenchmark { System.out.println(" --url=URL Base URL of tika-server (default: http://localhost:9998)"); System.out.println(" --endpoint=PATH Endpoint to test: /tika or /rmeta (default: /tika)"); System.out.println(" --threads=N Number of client threads (default: 4)"); - System.out.println(" --count=N Number of requests per test (default: 1000)"); + System.out.println(" --count=N Number of requests per test (default: 100)"); System.out.println(" --warmup=N Number of initial warmup requests (default: 100)"); System.out.println(" --repeats=N Number of times to repeat the benchmark (default: 1)"); - System.out.println(" --mode=MODE Test mode: 'size' or 'sleep' (default: size)"); System.out.println(" --sync Synchronous: each thread waits for response before next request (default)"); System.out.println(" --async Asynchronous: all requests sent immediately (stress test)"); System.out.println(); - System.out.println("Size mode options (tests I/O throughput):"); - System.out.println(" --small-kb=N Size of small files in KB (default: 1)"); - System.out.println(" --large-kb=N Size of large files in KB (default: 100)"); + System.out.println("Parse time 
options:");     System.out.println("   --short-ms=N     Short parse/sleep duration in ms (default: 10)");     System.out.println("   --long-ms=N      Long parse/sleep duration in ms (default: 500)");     System.out.println();    - System.out.println("Sleep mode options (tests process forking overhead):");    - System.out.println("   --short-ms=N     Short sleep duration in ms (default: 10)");    - System.out.println("   --long-ms=N      Long sleep duration in ms (default: 5000)");    + System.out.println("Output size options:");    + System.out.println("   --small-times=N  Paragraph repetitions for small output (default: 10, ~1KB)");    + System.out.println("   --large-times=N  Paragraph repetitions for large output (default: 10000, ~1MB)");     System.out.println();    - System.out.println("Note: Each benchmark also runs a per-benchmark warmup of 10*threads requests");    - System.out.println("        that is not counted towards the statistics.");    + System.out.println("Note: Each of the 4 benchmarks runs a per-benchmark warmup of 10*threads requests.");     System.out.println();     System.out.println("Examples:");    - System.out.println("   # Realistic test with 4 threads (default sync mode)");    - System.out.println("   java TikaServerBenchmark --mode=sleep --threads=4 --short-ms=100 --long-ms=1000");    + System.out.println("   # Default 2x2 matrix test");    + System.out.println("   java TikaServerBenchmark --threads=4");     System.out.println();    - System.out.println("   # Stress test with async mode");    - System.out.println("   java TikaServerBenchmark --mode=sleep --threads=4 --async --count=500");    + System.out.println("   # Custom parse times and output sizes");    + System.out.println("   java TikaServerBenchmark --short-ms=10 --long-ms=1000 --small-times=10 --large-times=5000");     System.out.println();     System.out.println("   # Test /rmeta endpoint with 3 repeats for more stable results");    - System.out.println("   java TikaServerBenchmark --endpoint=/rmeta --mode=size --repeats=3");    + System.out.println("   java TikaServerBenchmark --endpoint=/rmeta --repeats=3"); diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index 2cfb43d7ec..82db4d689a 100644 --- a/tika-server/tika-server-standard/pom.xml +++ b/tika-server/tika-server-standard/pom.xml @@ -125,7 +125,7 @@ <executions> <execution> <id>unpack-plugins</id> - <phase>prepare-package</phase> + <phase>process-test-resources</phase> <goals> <goal>unpack</goal> </goals>
