This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4517 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 22e2d89d8281bb348b120febcbb66e8ee7cced7c Author: tallison <[email protected]> AuthorDate: Tue Oct 14 19:46:52 2025 -0400 TIKA-4517 -- improve async cli --- .../main/java/org/apache/tika/cli/AsyncHelper.java | 42 +++----- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 30 ++---- .../java/org/apache/tika/cli/AsyncHelperTest.java | 31 ++++++ .../apache/tika/async/cli/SimpleAsyncConfig.java | 10 +- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 114 +++++++++++++++++---- .../apache/tika/async/cli/AsyncCliParserTest.java | 5 +- .../tika/async/cli/TikaConfigAsyncWriterTest.java | 7 +- 7 files changed, 162 insertions(+), 77 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java index a9cc2330c..f8189cf69 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java +++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java @@ -17,45 +17,27 @@ package org.apache.tika.cli; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class AsyncHelper { + + private static final String TIKA_CONFIG_KEY = "--config="; + public static String[] translateArgs(String[] args) { List<String> argList = new ArrayList<>(); - if (args.length == 2) { - if (args[0].startsWith("-Z")) { - argList.add("-Z"); - argList.add("-i"); - argList.add(args[1]); - argList.add("-o"); - argList.add(args[1]); - return argList.toArray(new String[0]); - } else if (args[0].startsWith("-") || args[1].startsWith("-")) { - argList.add(args[0]); - argList.add(args[1]); - return argList.toArray(new String[0]); + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + if (arg.startsWith(TIKA_CONFIG_KEY)) { + String c = arg.substring(TIKA_CONFIG_KEY.length()); + argList.add("-c"); + argList.add(c); + } else if (arg.equals("-a")) { + //do nothing } else { - argList.add("-i"); - argList.add(args[0]); - argList.add("-o"); - argList.add(args[1]); - return argList.toArray(new String[0]); - } - } - if (args.length == 3) { - if (args[0].equals("-Z") && ! args[1].startsWith("-") && ! args[2].startsWith("-")) { - argList.add("-Z"); - argList.add("-i"); - argList.add(args[1]); - argList.add("-o"); - argList.add(args[2]); - return argList.toArray(new String[0]); + argList.add(args[i]); } } - argList.addAll(Arrays.asList(args)); - argList.remove("-a"); return argList.toArray(new String[0]); } } diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 7706c0f59..28a9b29c7 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -276,11 +276,11 @@ public class TikaCLI { Files.copy(TikaCLI.class.getResourceAsStream("/tika-config-default-single-file.xml"), tmpConfig, StandardCopyOption.REPLACE_EXISTING); List<String> argList = new ArrayList<>(); + argList.add("-c"); + argList.add(tmpConfig.toAbsolutePath().toString()); for (String arg : args) { argList.add(arg); } - argList.add("-c"); - argList.add(tmpConfig.toAbsolutePath().toString()); TikaAsyncCLI.main(argList.toArray(new String[0])); } finally { if (tmpConfig != null) { @@ -625,27 +625,15 @@ public class TikaCLI { out.println(" Specify two directories as args with no other args:"); out.println(" java -jar tika-app.jar <inputDirectory> <outputDirectory>"); out.println(); - out.println("Batch Options:"); - out.println(" -i or --inputDir Input directory"); - out.println(" -o or --outputDir Output directory"); - out.println(" -numConsumers Number of processing threads"); - out.println(" -bc Batch config file"); - out.println(" -maxRestarts Maximum number of times the "); - out.println(" watchdog process will restart the forked process."); - out.println(" -timeoutThresholdMillis Number of milliseconds allowed to a parse"); - out.println(" before the process is terminated and restarted"); - out.println(" -fileList List of files to process, with"); - out.println(" paths relative to the input directory"); - out.println(" -includeFilePat Regular expression to determine which"); - out.println(" files to process, e.g. \"(?i)\\.pdf\""); - out.println(" -excludeFilePat Regular expression to determine which"); - out.println(" files to avoid processing, e.g. \"(?i)\\.pdf\""); - out.println(" -maxFileSizeBytes Skip files longer than this value"); + out.println("Batch/Pipes Options:"); + out.println(" -i Input directory"); + out.println(" -o Output directory"); + out.println(" -n Number of forked processes"); + out.println(" -X -Xmx in the forked processes"); + out.println(" -T Timeout in milliseconds"); + out.println(" -Z Recursively unpack all the attachments, too"); out.println(); - out.println(" Control the type of output with -x, -h, -t and/or -J."); out.println(); - out.println(" To modify forked process jvm args, prepend \"J\" as in:"); - out.println(" -JXmx4g or -JDlog4j.configuration=file:log4j.xml."); } private void version() { diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java new file mode 100644 index 000000000..8b1d79d10 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.cli; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import org.junit.jupiter.api.Test; + +public class AsyncHelperTest { + + @Test + public void testBasic() throws Exception { + String[] args = new String[]{"-a", "--config=blah.xml", "-i", "input.docx", "-o", "output/dir"}; + String[] expected = new String[]{"-c", "blah.xml", "-i", "input.docx", "-o", "output/dir"}; + assertArrayEquals(expected, AsyncHelper.translateArgs(args)); + } +} diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java index 0c3987165..e8c48f663 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java @@ -16,6 +16,8 @@ */ package org.apache.tika.async.cli; +import org.apache.tika.sax.BasicContentHandlerFactory; + class SimpleAsyncConfig { private String inputDir; @@ -26,10 +28,11 @@ class SimpleAsyncConfig { private String fileList; private String tikaConfig;//path to the tikaConfig file to be used in the forked process private boolean extractBytes; + private final BasicContentHandlerFactory.HANDLER_TYPE handlerType; //TODO -- switch to a builder public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, - String tikaConfig, boolean extractBytes) { + String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType, boolean extractBytes) { this.inputDir = inputDir; this.outputDir = outputDir; this.numClients = numClients; @@ -37,6 +40,7 @@ class SimpleAsyncConfig { this.xmx = xmx; this.fileList = fileList; this.tikaConfig = tikaConfig; + this.handlerType = handlerType; this.extractBytes = extractBytes; } @@ -71,4 +75,8 @@ class SimpleAsyncConfig { public boolean isExtractBytes() { return extractBytes; } + + public BasicContentHandlerFactory.HANDLER_TYPE getHandlerType() { + return handlerType; + } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index fe4377213..8dff25ab3 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -33,11 +33,13 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.FetchEmitTuple; +import org.apache.tika.pipes.core.HandlerConfig; import org.apache.tika.pipes.core.async.AsyncProcessor; import org.apache.tika.pipes.core.emitter.EmitKey; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.fetcher.FetchKey; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.utils.StringUtils; public class TikaAsyncCLI { @@ -49,11 +51,11 @@ public class TikaAsyncCLI { Options options = new Options(); options.addOption("i", "inputDir", true, "input directory"); options.addOption("o", "outputDir", true, "output directory"); - options.addOption("n", "numClients", true, "number of forked clients"); - options.addOption("x", "Xmx", true, "heap for the forked clients in usual jvm heap amount, e.g. -x 1g"); + options.addOption("X", "Xmx", true, "heap for the forked clients in usual jvm heap amount, e.g. -x 1g"); options.addOption("?", "help", false, "this help message"); - options.addOption("t", "timeoutMs", true, "timeout for each parse in milliseconds"); + options.addOption("T", "timeoutMs", true, "timeout for each parse in milliseconds"); + options.addOption("h", "handlerType", true, "handler type: t=text, h=html, x=xml, b=body, i=ignore"); options.addOption("l", "fileList", true, "file list"); options.addOption("c", "config", true, "tikaConfig to inherit from -- " + "commandline options will not overwrite existing iterators, emitters, fetchers and async"); @@ -72,12 +74,12 @@ public class TikaAsyncCLI { private static void processCommandLine(String[] args) throws Exception { if (args.length == 1) { - processWithTikaConfig(PipesIterator.build(Paths.get(args[0])), Paths.get(args[0]), false); + processWithTikaConfig(PipesIterator.build(Paths.get(args[0])), Paths.get(args[0]), null); return; } if (args.length == 2 && args[0].equals("-c")) { - processWithTikaConfig(PipesIterator.build(Paths.get(args[1])), Paths.get(args[1]), false); + processWithTikaConfig(PipesIterator.build(Paths.get(args[1])), Paths.get(args[1]), null); return; } SimpleAsyncConfig simpleAsyncConfig = parseCommandLine(args); @@ -88,7 +90,7 @@ public class TikaAsyncCLI { TikaConfigAsyncWriter tikaConfigAsyncWriter = new TikaConfigAsyncWriter(simpleAsyncConfig); tikaConfigAsyncWriter.write(tikaConfig); PipesIterator pipesIterator = buildPipesIterator(tikaConfig, simpleAsyncConfig); - processWithTikaConfig(pipesIterator, tikaConfig, simpleAsyncConfig.isExtractBytes()); + processWithTikaConfig(pipesIterator, tikaConfig, simpleAsyncConfig); } finally { if (tikaConfig != null) { Files.delete(tikaConfig); @@ -103,23 +105,24 @@ public class TikaAsyncCLI { } Path p = Paths.get(simpleAsyncConfig.getInputDir()); if (Files.isRegularFile(p)) { - return new SingleFilePipesIterator(p.getFileName().toString(), simpleAsyncConfig.isExtractBytes()); + return new SingleFilePipesIterator(p.getFileName().toString()); } return PipesIterator.build(tikaConfig); } //not private for testing purposes - static SimpleAsyncConfig parseCommandLine(String[] args) throws ParseException, IOException { + static SimpleAsyncConfig parseCommandLine(String[] args) throws TikaConfigException, ParseException, IOException { if (args.length == 2 && ! args[0].startsWith("-")) { return new SimpleAsyncConfig(args[0], args[1], null, - null, null, null, null, false); + null, null, null, null, + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false); } Options options = getOptions(); CommandLineParser cliParser = new DefaultParser(); - CommandLine line = cliParser.parse(options, args); + CommandLine line = cliParser.parse(options, args, true); if (line.hasOption("help")) { usage(options); } @@ -130,6 +133,7 @@ public class TikaAsyncCLI { Integer numClients = null; String fileList = null; String tikaConfig = null; + BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; boolean extractBytes = false; if (line.hasOption("i")) { inputDir = line.getOptionValue("i"); @@ -137,11 +141,11 @@ public class TikaAsyncCLI { if (line.hasOption("o")) { outputDir = line.getOptionValue("o"); } - if (line.hasOption("x")) { - xmx = line.getOptionValue("x"); + if (line.hasOption("X")) { + xmx = line.getOptionValue("X"); } - if (line.hasOption("t")) { - timeoutMs = Long.parseLong(line.getOptionValue("t")); + if (line.hasOption("T")) { + timeoutMs = Long.parseLong(line.getOptionValue("T")); } if (line.hasOption("n")) { numClients = Integer.parseInt(line.getOptionValue("n")); @@ -155,18 +159,71 @@ public class TikaAsyncCLI { if (line.hasOption("Z")) { extractBytes = true; } + if (line.hasOption('h')) { + handlerType = getHandlerType(line.getOptionValue('h')); + } + if (line.getArgList().size() > 2) { + throw new TikaConfigException("Can't have more than 2 unknown args: " + line.getArgList()); + } + + if (line.getArgList().size() == 2) { + if (inputDir != null || outputDir != null) { + throw new TikaConfigException("Can only set inputDir and outputDir once. Extra args: " + line.getArgList()); + } + String inString = line.getArgList().get(0); + String outString = line.getArgList().get(1); + if (inString.startsWith("-") || outString.startsWith("-")) { + throw new TikaConfigException("Found an unknown arg in one of the last two args: " + line.getArgList()); + } + Path p = Paths.get(inString); + if (! Files.isDirectory(p) && ! Files.isRegularFile(p)) { + throw new TikaConfigException("Input file/dir must exist: " + p); + } + inputDir = inString; + outputDir = outString; + } else if (line.getArgList().size() == 1) { + if (inputDir != null) { + throw new TikaConfigException("Can only set inputDir once. Extra args: " + line.getArgList()); + } + String inString = line.getArgList().get(0); + if (inString.startsWith("-")) { + throw new TikaConfigException("Found an unknown arg in one of the last arg: " + inString); + } + Path inputPath = Paths.get(inString); + if (! Files.isDirectory(inputPath) && ! Files.isRegularFile(inputPath)) { + throw new TikaConfigException("Input file/dir must exist: " + inputPath); + } + inputDir = inString; + if (Files.isRegularFile(inputPath)) { + outputDir = Paths.get(".").toAbsolutePath().toString(); + } else { + outputDir = Paths.get("output").toAbsolutePath().toString(); + } + } return new SimpleAsyncConfig(inputDir, outputDir, - numClients, timeoutMs, xmx, fileList, tikaConfig, extractBytes); + numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType, extractBytes); + } + + private static BasicContentHandlerFactory.HANDLER_TYPE getHandlerType(String t) throws TikaConfigException { + return switch (t) { + case "x" -> BasicContentHandlerFactory.HANDLER_TYPE.XML; + case "h" -> BasicContentHandlerFactory.HANDLER_TYPE.HTML; + case "b" -> BasicContentHandlerFactory.HANDLER_TYPE.BODY; + case "i" -> BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; + case "t" -> BasicContentHandlerFactory.HANDLER_TYPE.TEXT; + default -> throw new TikaConfigException("Can't understand " + t + " as a handler type. Must be one of: x(ml), h(tml), b(ody), i(gnore), t(ext)"); + }; } - private static void processWithTikaConfig(PipesIterator pipesIterator, Path tikaConfigPath, boolean extractBytes) throws Exception { + private static void processWithTikaConfig(PipesIterator pipesIterator, Path tikaConfigPath, SimpleAsyncConfig asyncConfig) throws Exception { long start = System.currentTimeMillis(); try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath, pipesIterator)) { for (FetchEmitTuple t : pipesIterator) { - configureExtractBytes(t, extractBytes); + configureExtractBytes(t, asyncConfig); + configureHandler(t, asyncConfig); boolean offered = processor.offer(t, TIMEOUT_MS); if (!offered) { throw new TimeoutException("timed out waiting to add a fetch emit tuple"); @@ -186,8 +243,23 @@ public class TikaAsyncCLI { } } - private static void configureExtractBytes(FetchEmitTuple t, boolean extractBytes) { - if (! extractBytes) { + private static void configureHandler(FetchEmitTuple t, SimpleAsyncConfig asyncConfig) { + if (asyncConfig == null) { + return; + } + if (asyncConfig.getHandlerType() == BasicContentHandlerFactory.HANDLER_TYPE.TEXT) { + return; + } + HandlerConfig handlerConfig = new HandlerConfig(asyncConfig.getHandlerType(), HandlerConfig.PARSE_MODE.RMETA, + -1, -1, false); + t.getParseContext().set(HandlerConfig.class, handlerConfig); + } + + private static void configureExtractBytes(FetchEmitTuple t, SimpleAsyncConfig asyncConfig) { + if (asyncConfig == null) { + return; + } + if (!asyncConfig.isExtractBytes()) { return; } ParseContext parseContext = t.getParseContext(); @@ -213,11 +285,9 @@ public class TikaAsyncCLI { private static class SingleFilePipesIterator extends PipesIterator { private final String fName; - private final boolean extractBytes; - public SingleFilePipesIterator(String string, boolean extractBytes) { + public SingleFilePipesIterator(String string) { super(); this.fName = string; - this.extractBytes = extractBytes; } @Override diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java index 4e38aac9c..9d3941cd8 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java @@ -21,6 +21,8 @@ import static org.junit.jupiter.api.Assertions.assertNull; import org.junit.jupiter.api.Test; +import org.apache.tika.sax.BasicContentHandlerFactory; + public class AsyncCliParserTest { @Test @@ -69,13 +71,14 @@ public class AsyncCliParserTest { @Test public void testAll() throws Exception { SimpleAsyncConfig simpleAsyncConfig = TikaAsyncCLI.parseCommandLine( - new String[]{"-i", "input", "-o", "output", "-n", "5", "-t", "30000", "-x", "1g"}); + new String[]{"-i", "input", "-o", "output", "-n", "5", "-T", "30000", "-X", "1g", "-h", "x"}); assertEquals("input", simpleAsyncConfig.getInputDir()); assertEquals("output", simpleAsyncConfig.getOutputDir()); assertNull(simpleAsyncConfig.getFileList()); assertEquals(5, simpleAsyncConfig.getNumClients()); assertEquals(30000L, simpleAsyncConfig.getTimeoutMs()); assertEquals("1g", simpleAsyncConfig.getXmx()); + assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML, simpleAsyncConfig.getHandlerType()); } //TODO -- test for file list with and without inputDir diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java index adafdafd6..7db2dd133 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java @@ -33,6 +33,7 @@ import org.w3c.dom.Node; import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.utils.XMLReaderUtils; public class TikaConfigAsyncWriterTest { @@ -42,7 +43,8 @@ public class TikaConfigAsyncWriterTest { public void testBasic(@TempDir Path dir) throws Exception { Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-parsers.xml").toURI()); SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4, - 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false); + 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false); Path target = dir.resolve("combined.xml"); TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig); writer.write(target); @@ -56,7 +58,8 @@ public class TikaConfigAsyncWriterTest { public void testDontOverwriteEmitters(@TempDir Path dir) throws Exception { Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-emitters.xml").toURI()); SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4, - 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false); + 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false); Path target = dir.resolve("combined.xml"); TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig); writer.write(target);
