This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a0b088749 TIKA-4517 -- improve async cli (#2365)
a0b088749 is described below
commit a0b088749c451d65af1580a5fa59ffcda5bbc710
Author: Tim Allison <[email protected]>
AuthorDate: Tue Oct 14 20:07:15 2025 -0400
TIKA-4517 -- improve async cli (#2365)
---
.../main/java/org/apache/tika/cli/AsyncHelper.java | 42 +++-----
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 30 ++----
.../java/org/apache/tika/cli/AsyncHelperTest.java | 31 ++++++
.../apache/tika/async/cli/SimpleAsyncConfig.java | 10 +-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 114 +++++++++++++++++----
.../apache/tika/async/cli/AsyncCliParserTest.java | 5 +-
.../tika/async/cli/TikaConfigAsyncWriterTest.java | 7 +-
7 files changed, 162 insertions(+), 77 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index a9cc2330c..f8189cf69 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -17,45 +17,27 @@
package org.apache.tika.cli;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
public class AsyncHelper {
+
+ private static final String TIKA_CONFIG_KEY = "--config=";
+
public static String[] translateArgs(String[] args) {
List<String> argList = new ArrayList<>();
- if (args.length == 2) {
- if (args[0].startsWith("-Z")) {
- argList.add("-Z");
- argList.add("-i");
- argList.add(args[1]);
- argList.add("-o");
- argList.add(args[1]);
- return argList.toArray(new String[0]);
- } else if (args[0].startsWith("-") || args[1].startsWith("-")) {
- argList.add(args[0]);
- argList.add(args[1]);
- return argList.toArray(new String[0]);
+ for (int i = 0; i < args.length; i++) {
+ String arg = args[i];
+ if (arg.startsWith(TIKA_CONFIG_KEY)) {
+ String c = arg.substring(TIKA_CONFIG_KEY.length());
+ argList.add("-c");
+ argList.add(c);
+ } else if (arg.equals("-a")) {
+ //do nothing
} else {
- argList.add("-i");
- argList.add(args[0]);
- argList.add("-o");
- argList.add(args[1]);
- return argList.toArray(new String[0]);
- }
- }
- if (args.length == 3) {
- if (args[0].equals("-Z") && ! args[1].startsWith("-") && !
args[2].startsWith("-")) {
- argList.add("-Z");
- argList.add("-i");
- argList.add(args[1]);
- argList.add("-o");
- argList.add(args[2]);
- return argList.toArray(new String[0]);
+ argList.add(args[i]);
}
}
- argList.addAll(Arrays.asList(args));
- argList.remove("-a");
return argList.toArray(new String[0]);
}
}
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 7706c0f59..28a9b29c7 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -276,11 +276,11 @@ public class TikaCLI {
Files.copy(TikaCLI.class.getResourceAsStream("/tika-config-default-single-file.xml"),
tmpConfig, StandardCopyOption.REPLACE_EXISTING);
List<String> argList = new ArrayList<>();
+ argList.add("-c");
+ argList.add(tmpConfig.toAbsolutePath().toString());
for (String arg : args) {
argList.add(arg);
}
- argList.add("-c");
- argList.add(tmpConfig.toAbsolutePath().toString());
TikaAsyncCLI.main(argList.toArray(new String[0]));
} finally {
if (tmpConfig != null) {
@@ -625,27 +625,15 @@ public class TikaCLI {
out.println(" Specify two directories as args with no other args:");
out.println(" java -jar tika-app.jar <inputDirectory>
<outputDirectory>");
out.println();
- out.println("Batch Options:");
- out.println(" -i or --inputDir Input directory");
- out.println(" -o or --outputDir Output directory");
- out.println(" -numConsumers Number of processing
threads");
- out.println(" -bc Batch config file");
- out.println(" -maxRestarts Maximum number of times
the ");
- out.println(" watchdog process will
restart the forked process.");
- out.println(" -timeoutThresholdMillis Number of milliseconds
allowed to a parse");
- out.println(" before the process is
terminated and restarted");
- out.println(" -fileList List of files to process,
with");
- out.println(" paths relative to the
input directory");
- out.println(" -includeFilePat Regular expression to
determine which");
- out.println(" files to process, e.g.
\"(?i)\\.pdf\"");
- out.println(" -excludeFilePat Regular expression to
determine which");
- out.println(" files to avoid processing,
e.g. \"(?i)\\.pdf\"");
- out.println(" -maxFileSizeBytes Skip files longer than
this value");
+ out.println("Batch/Pipes Options:");
+ out.println(" -i Input directory");
+ out.println(" -o Output directory");
+ out.println(" -n Number of forked
processes");
+ out.println(" -X -Xmx in the forked
processes");
+ out.println(" -T Timeout in milliseconds");
+ out.println(" -Z Recursively unpack all the
attachments, too");
out.println();
- out.println(" Control the type of output with -x, -h, -t and/or
-J.");
out.println();
- out.println(" To modify forked process jvm args, prepend \"J\" as
in:");
- out.println(" -JXmx4g or -JDlog4j.configuration=file:log4j.xml.");
}
private void version() {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
new file mode 100644
index 000000000..8b1d79d10
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+
+import org.junit.jupiter.api.Test;
+
+public class AsyncHelperTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ String[] args = new String[]{"-a", "--config=blah.xml", "-i",
"input.docx", "-o", "output/dir"};
+ String[] expected = new String[]{"-c", "blah.xml", "-i", "input.docx",
"-o", "output/dir"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+}
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 0c3987165..e8c48f663 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.async.cli;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+
class SimpleAsyncConfig {
private String inputDir;
@@ -26,10 +28,11 @@ class SimpleAsyncConfig {
private String fileList;
private String tikaConfig;//path to the tikaConfig file to be used in the
forked process
private boolean extractBytes;
+ private final BasicContentHandlerFactory.HANDLER_TYPE handlerType;
//TODO -- switch to a builder
public SimpleAsyncConfig(String inputDir, String outputDir, Integer
numClients, Long timeoutMs, String xmx, String fileList,
- String tikaConfig, boolean extractBytes) {
+ String tikaConfig,
BasicContentHandlerFactory.HANDLER_TYPE handlerType, boolean extractBytes) {
this.inputDir = inputDir;
this.outputDir = outputDir;
this.numClients = numClients;
@@ -37,6 +40,7 @@ class SimpleAsyncConfig {
this.xmx = xmx;
this.fileList = fileList;
this.tikaConfig = tikaConfig;
+ this.handlerType = handlerType;
this.extractBytes = extractBytes;
}
@@ -71,4 +75,8 @@ class SimpleAsyncConfig {
public boolean isExtractBytes() {
return extractBytes;
}
+
+ public BasicContentHandlerFactory.HANDLER_TYPE getHandlerType() {
+ return handlerType;
+ }
}
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index fe4377213..8dff25ab3 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -33,11 +33,13 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.core.FetchEmitTuple;
+import org.apache.tika.pipes.core.HandlerConfig;
import org.apache.tika.pipes.core.async.AsyncProcessor;
import org.apache.tika.pipes.core.emitter.EmitKey;
import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.core.fetcher.FetchKey;
import org.apache.tika.pipes.core.pipesiterator.PipesIterator;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.utils.StringUtils;
public class TikaAsyncCLI {
@@ -49,11 +51,11 @@ public class TikaAsyncCLI {
Options options = new Options();
options.addOption("i", "inputDir", true, "input directory");
options.addOption("o", "outputDir", true, "output directory");
-
options.addOption("n", "numClients", true, "number of forked clients");
- options.addOption("x", "Xmx", true, "heap for the forked clients in
usual jvm heap amount, e.g. -x 1g");
+ options.addOption("X", "Xmx", true, "heap for the forked clients in
usual jvm heap amount, e.g. -x 1g");
options.addOption("?", "help", false, "this help message");
- options.addOption("t", "timeoutMs", true, "timeout for each parse in
milliseconds");
+ options.addOption("T", "timeoutMs", true, "timeout for each parse in
milliseconds");
+ options.addOption("h", "handlerType", true, "handler type: t=text,
h=html, x=xml, b=body, i=ignore");
options.addOption("l", "fileList", true, "file list");
options.addOption("c", "config", true, "tikaConfig to inherit from --
" +
"commandline options will not overwrite existing iterators,
emitters, fetchers and async");
@@ -72,12 +74,12 @@ public class TikaAsyncCLI {
private static void processCommandLine(String[] args) throws Exception {
if (args.length == 1) {
- processWithTikaConfig(PipesIterator.build(Paths.get(args[0])),
Paths.get(args[0]), false);
+ processWithTikaConfig(PipesIterator.build(Paths.get(args[0])),
Paths.get(args[0]), null);
return;
}
if (args.length == 2 && args[0].equals("-c")) {
- processWithTikaConfig(PipesIterator.build(Paths.get(args[1])),
Paths.get(args[1]), false);
+ processWithTikaConfig(PipesIterator.build(Paths.get(args[1])),
Paths.get(args[1]), null);
return;
}
SimpleAsyncConfig simpleAsyncConfig = parseCommandLine(args);
@@ -88,7 +90,7 @@ public class TikaAsyncCLI {
TikaConfigAsyncWriter tikaConfigAsyncWriter = new
TikaConfigAsyncWriter(simpleAsyncConfig);
tikaConfigAsyncWriter.write(tikaConfig);
PipesIterator pipesIterator = buildPipesIterator(tikaConfig,
simpleAsyncConfig);
- processWithTikaConfig(pipesIterator, tikaConfig,
simpleAsyncConfig.isExtractBytes());
+ processWithTikaConfig(pipesIterator, tikaConfig,
simpleAsyncConfig);
} finally {
if (tikaConfig != null) {
Files.delete(tikaConfig);
@@ -103,23 +105,24 @@ public class TikaAsyncCLI {
}
Path p = Paths.get(simpleAsyncConfig.getInputDir());
if (Files.isRegularFile(p)) {
- return new SingleFilePipesIterator(p.getFileName().toString(),
simpleAsyncConfig.isExtractBytes());
+ return new SingleFilePipesIterator(p.getFileName().toString());
}
return PipesIterator.build(tikaConfig);
}
//not private for testing purposes
- static SimpleAsyncConfig parseCommandLine(String[] args) throws
ParseException, IOException {
+ static SimpleAsyncConfig parseCommandLine(String[] args) throws
TikaConfigException, ParseException, IOException {
if (args.length == 2 && ! args[0].startsWith("-")) {
return new SimpleAsyncConfig(args[0], args[1], null,
- null, null, null, null, false);
+ null, null, null, null,
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false);
}
Options options = getOptions();
CommandLineParser cliParser = new DefaultParser();
- CommandLine line = cliParser.parse(options, args);
+ CommandLine line = cliParser.parse(options, args, true);
if (line.hasOption("help")) {
usage(options);
}
@@ -130,6 +133,7 @@ public class TikaAsyncCLI {
Integer numClients = null;
String fileList = null;
String tikaConfig = null;
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType =
BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
boolean extractBytes = false;
if (line.hasOption("i")) {
inputDir = line.getOptionValue("i");
@@ -137,11 +141,11 @@ public class TikaAsyncCLI {
if (line.hasOption("o")) {
outputDir = line.getOptionValue("o");
}
- if (line.hasOption("x")) {
- xmx = line.getOptionValue("x");
+ if (line.hasOption("X")) {
+ xmx = line.getOptionValue("X");
}
- if (line.hasOption("t")) {
- timeoutMs = Long.parseLong(line.getOptionValue("t"));
+ if (line.hasOption("T")) {
+ timeoutMs = Long.parseLong(line.getOptionValue("T"));
}
if (line.hasOption("n")) {
numClients = Integer.parseInt(line.getOptionValue("n"));
@@ -155,18 +159,71 @@ public class TikaAsyncCLI {
if (line.hasOption("Z")) {
extractBytes = true;
}
+ if (line.hasOption('h')) {
+ handlerType = getHandlerType(line.getOptionValue('h'));
+ }
+ if (line.getArgList().size() > 2) {
+ throw new TikaConfigException("Can't have more than 2 unknown
args: " + line.getArgList());
+ }
+
+ if (line.getArgList().size() == 2) {
+ if (inputDir != null || outputDir != null) {
+ throw new TikaConfigException("Can only set inputDir and
outputDir once. Extra args: " + line.getArgList());
+ }
+ String inString = line.getArgList().get(0);
+ String outString = line.getArgList().get(1);
+ if (inString.startsWith("-") || outString.startsWith("-")) {
+ throw new TikaConfigException("Found an unknown arg in one of
the last two args: " + line.getArgList());
+ }
+ Path p = Paths.get(inString);
+ if (! Files.isDirectory(p) && ! Files.isRegularFile(p)) {
+ throw new TikaConfigException("Input file/dir must exist: " +
p);
+ }
+ inputDir = inString;
+ outputDir = outString;
+ } else if (line.getArgList().size() == 1) {
+ if (inputDir != null) {
+ throw new TikaConfigException("Can only set inputDir once.
Extra args: " + line.getArgList());
+ }
+ String inString = line.getArgList().get(0);
+ if (inString.startsWith("-")) {
+ throw new TikaConfigException("Found an unknown arg in one of
the last arg: " + inString);
+ }
+ Path inputPath = Paths.get(inString);
+ if (! Files.isDirectory(inputPath) && !
Files.isRegularFile(inputPath)) {
+ throw new TikaConfigException("Input file/dir must exist: " +
inputPath);
+ }
+ inputDir = inString;
+ if (Files.isRegularFile(inputPath)) {
+ outputDir = Paths.get(".").toAbsolutePath().toString();
+ } else {
+ outputDir = Paths.get("output").toAbsolutePath().toString();
+ }
+ }
return new SimpleAsyncConfig(inputDir, outputDir,
- numClients, timeoutMs, xmx, fileList, tikaConfig,
extractBytes);
+ numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType,
extractBytes);
+ }
+
+ private static BasicContentHandlerFactory.HANDLER_TYPE
getHandlerType(String t) throws TikaConfigException {
+ return switch (t) {
+ case "x" -> BasicContentHandlerFactory.HANDLER_TYPE.XML;
+ case "h" -> BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+ case "b" -> BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+ case "i" -> BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
+ case "t" -> BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+ default -> throw new TikaConfigException("Can't understand " + t +
" as a handler type. Must be one of: x(ml), h(tml), b(ody), i(gnore), t(ext)");
+ };
}
- private static void processWithTikaConfig(PipesIterator pipesIterator,
Path tikaConfigPath, boolean extractBytes) throws Exception {
+ private static void processWithTikaConfig(PipesIterator pipesIterator,
Path tikaConfigPath, SimpleAsyncConfig asyncConfig) throws Exception {
long start = System.currentTimeMillis();
try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath,
pipesIterator)) {
for (FetchEmitTuple t : pipesIterator) {
- configureExtractBytes(t, extractBytes);
+ configureExtractBytes(t, asyncConfig);
+ configureHandler(t, asyncConfig);
boolean offered = processor.offer(t, TIMEOUT_MS);
if (!offered) {
throw new TimeoutException("timed out waiting to add a
fetch emit tuple");
@@ -186,8 +243,23 @@ public class TikaAsyncCLI {
}
}
- private static void configureExtractBytes(FetchEmitTuple t, boolean
extractBytes) {
- if (! extractBytes) {
+ private static void configureHandler(FetchEmitTuple t, SimpleAsyncConfig
asyncConfig) {
+ if (asyncConfig == null) {
+ return;
+ }
+ if (asyncConfig.getHandlerType() ==
BasicContentHandlerFactory.HANDLER_TYPE.TEXT) {
+ return;
+ }
+ HandlerConfig handlerConfig = new
HandlerConfig(asyncConfig.getHandlerType(), HandlerConfig.PARSE_MODE.RMETA,
+ -1, -1, false);
+ t.getParseContext().set(HandlerConfig.class, handlerConfig);
+ }
+
+ private static void configureExtractBytes(FetchEmitTuple t,
SimpleAsyncConfig asyncConfig) {
+ if (asyncConfig == null) {
+ return;
+ }
+ if (!asyncConfig.isExtractBytes()) {
return;
}
ParseContext parseContext = t.getParseContext();
@@ -213,11 +285,9 @@ public class TikaAsyncCLI {
private static class SingleFilePipesIterator extends PipesIterator {
private final String fName;
- private final boolean extractBytes;
- public SingleFilePipesIterator(String string, boolean extractBytes) {
+ public SingleFilePipesIterator(String string) {
super();
this.fName = string;
- this.extractBytes = extractBytes;
}
@Override
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index 4e38aac9c..9d3941cd8 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.Test;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+
public class AsyncCliParserTest {
@Test
@@ -69,13 +71,14 @@ public class AsyncCliParserTest {
@Test
public void testAll() throws Exception {
SimpleAsyncConfig simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(
- new String[]{"-i", "input", "-o", "output", "-n", "5", "-t",
"30000", "-x", "1g"});
+ new String[]{"-i", "input", "-o", "output", "-n", "5", "-T",
"30000", "-X", "1g", "-h", "x"});
assertEquals("input", simpleAsyncConfig.getInputDir());
assertEquals("output", simpleAsyncConfig.getOutputDir());
assertNull(simpleAsyncConfig.getFileList());
assertEquals(5, simpleAsyncConfig.getNumClients());
assertEquals(30000L, simpleAsyncConfig.getTimeoutMs());
assertEquals("1g", simpleAsyncConfig.getXmx());
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML,
simpleAsyncConfig.getHandlerType());
}
//TODO -- test for file list with and without inputDir
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
index adafdafd6..7db2dd133 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
@@ -33,6 +33,7 @@ import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.utils.XMLReaderUtils;
public class TikaConfigAsyncWriterTest {
@@ -42,7 +43,8 @@ public class TikaConfigAsyncWriterTest {
public void testBasic(@TempDir Path dir) throws Exception {
Path p =
Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-parsers.xml").toURI());
SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input",
"output", 4,
- 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false);
+ 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(),
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false);
Path target = dir.resolve("combined.xml");
TikaConfigAsyncWriter writer = new
TikaConfigAsyncWriter(simpleAsyncConfig);
writer.write(target);
@@ -56,7 +58,8 @@ public class TikaConfigAsyncWriterTest {
public void testDontOverwriteEmitters(@TempDir Path dir) throws Exception {
Path p =
Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-emitters.xml").toURI());
SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input",
"output", 4,
- 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false);
+ 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(),
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false);
Path target = dir.resolve("combined.xml");
TikaConfigAsyncWriter writer = new
TikaConfigAsyncWriter(simpleAsyncConfig);
writer.write(target);