This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e3564ecdf TIKA-4340 -- remove tika-batch from tika-app (#2029)
e3564ecdf is described below
commit e3564ecdfcd84518c25aa06f81fb77c5f76850b9
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 30 09:24:08 2024 -0400
TIKA-4340 -- remove tika-batch from tika-app (#2029)
---
tika-app/pom.xml | 5 -
.../batch/DigestingAutoDetectParserFactory.java | 51 -----
.../batch/builders/AppParserFactoryBuilder.java | 85 --------
.../apache/tika/cli/BatchCommandLineBuilder.java | 230 ---------------------
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 18 +-
.../main/resources/log4j2_batch_process.properties | 29 ---
.../src/main/resources/tika-app-batch-config.xml | 141 -------------
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 63 +++++-
.../tika/cli/TikaCLIBatchCommandLineTest.java | 210 -------------------
.../tika/cli/TikaCLIBatchIntegrationTest.java | 217 -------------------
.../tika/extractor/TestEmbeddedDocumentUtil.java | 56 -----
tika-pipes/tika-async-cli/pom.xml | 14 +-
.../apache/tika/async/cli/SimpleAsyncConfig.java | 60 ++++++
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 102 ++++++++-
.../tika/async/cli/TikaConfigAsyncWriter.java | 159 ++++++++++++++
.../apache/tika/async/cli/AsyncCliParserTest.java | 82 ++++++++
16 files changed, 485 insertions(+), 1037 deletions(-)
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index b84df212f..3476ff2c9 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -63,11 +63,6 @@
<artifactId>tika-xmp</artifactId>
<version>${project.version}</version>
</dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-batch</artifactId>
- <version>${project.version}</version>
- </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-emitter-fs</artifactId>
diff --git
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
b/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
deleted file mode 100644
index 1e072fa1b..000000000
---
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.batch;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.Parser;
-
-public class DigestingAutoDetectParserFactory extends ParserFactory {
-
- private DigestingParser.Digester digester = null;
-
-
- @Override
- public Parser getParser(TikaConfig config) {
- Parser p = new AutoDetectParser(config);
- if (digester == null) {
- return p;
- }
- boolean skipContainerDocument = false;
- if (config
- .getAutoDetectParserConfig()
- .getDigesterFactory() != null) {
- skipContainerDocument = config
- .getAutoDetectParserConfig()
- .getDigesterFactory()
- .isSkipContainerDocument();
- }
- return new DigestingParser(p, digester, skipContainerDocument);
- }
-
- public void setDigester(DigestingParser.Digester digester) {
- this.digester = digester;
- }
-}
diff --git
a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
deleted file mode 100644
index 09f4d434c..000000000
---
a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.batch.builders;
-
-import java.util.Locale;
-import java.util.Map;
-
-import org.w3c.dom.Node;
-
-import org.apache.tika.batch.DigestingAutoDetectParserFactory;
-import org.apache.tika.batch.ParserFactory;
-import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.digestutils.BouncyCastleDigester;
-import org.apache.tika.parser.digestutils.CommonsDigester;
-import org.apache.tika.util.ClassLoaderUtil;
-import org.apache.tika.util.XMLDOMUtil;
-
-public class AppParserFactoryBuilder implements IParserFactoryBuilder {
-
- @Override
- public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
- Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node,
runtimeAttrs);
- String className = localAttrs.get("class");
- ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class,
className);
-
- if (localAttrs.containsKey("parseRecursively")) {
- String bString = localAttrs
- .get("parseRecursively")
- .toLowerCase(Locale.ENGLISH);
- if (bString.equals("true")) {
- pf.setParseRecursively(true);
- } else if (bString.equals("false")) {
- pf.setParseRecursively(false);
- } else {
- throw new RuntimeException("parseRecursively must have value
of \"true\" or \"false\": " + bString);
- }
- }
- if (pf instanceof DigestingAutoDetectParserFactory) {
- DigestingParser.Digester d = buildDigester(localAttrs);
- ((DigestingAutoDetectParserFactory) pf).setDigester(d);
- }
- return pf;
- }
-
- private DigestingParser.Digester buildDigester(Map<String, String>
localAttrs) {
-
- String readLimitString = localAttrs.get("digestMarkLimit");
- if (readLimitString == null) {
- throw new IllegalArgumentException("Must specify
\"digestMarkLimit\" for " + "the DigestingAutoDetectParserFactory");
- }
- int readLimit = -1;
-
- try {
- readLimit = Integer.parseInt(readLimitString);
- } catch (NumberFormatException e) {
- throw new IllegalArgumentException("Parameter \"digestMarkLimit\"
must be a parseable int: " + readLimitString);
- }
- String digestString = localAttrs.get("digest");
- try {
- return new CommonsDigester(readLimit, digestString);
- } catch (IllegalArgumentException commonsException) {
- try {
- return new BouncyCastleDigester(readLimit, digestString);
- } catch (IllegalArgumentException bcException) {
- throw new IllegalArgumentException("Tried both CommonsDigester
(" + commonsException.getMessage() + ") and BouncyCastleDigester (" +
bcException.getMessage() + ")",
- bcException);
- }
- }
- }
-}
diff --git
a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
deleted file mode 100644
index 9379f9d45..000000000
--- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.cli;
-
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.lang3.SystemUtils;
-
-/**
- * This takes a TikaCLI commandline and builds the full commandline for
- * org.apache.tika.batch.fs.FSBatchProcessCLI.
- * <p>
- * The "default" batch config file that this relies on
- * if no batch config file is specified on the commandline
- * is: tika-batch/src/main/resources/.../default-tika-batch-config.xml
- */
-class BatchCommandLineBuilder {
-
- static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
-
- protected static String[] build(String[] args) throws IOException {
-
- Map<String, String> processArgs = new LinkedHashMap<>();
- Map<String, String> jvmOpts = new LinkedHashMap<>();
- //take the args, and divide them into process args and options for
- //the forked jvm process (i.e. log files, etc)
- mapifyArgs(args, processArgs, jvmOpts);
-
- //now modify processArgs in place
- translateCommandLine(args, processArgs);
-
- //maybe the user specified a different classpath?!
- if (!jvmOpts.containsKey("-cp") &&
!jvmOpts.containsKey("--classpath")) {
- String cp = System.getProperty("java.class.path");
- jvmOpts.put("-cp", cp);
- }
-
- boolean hasLog4j = false;
- for (String k : jvmOpts.keySet()) {
- if (k.startsWith("-Dlog4j.configurationFile=")) {
- hasLog4j = true;
- break;
- }
- }
- //use the log4j config file inside the app
/resources/log4j2_batch_process.properties
- if (!hasLog4j) {
-
jvmOpts.put("-Dlog4j.configurationFile=log4j2_batch_process.properties", "");
- }
- //now build the full command line
- List<String> fullCommand = new ArrayList<>();
- fullCommand.add("java");
- boolean foundHeadlessOption = false;
- for (Map.Entry<String, String> e : jvmOpts.entrySet()) {
- fullCommand.add(e.getKey());
- if (e
- .getValue()
- .length() > 0) {
- fullCommand.add(commandLineSafe(e.getValue()));
- }
- if (e
- .getKey()
- .contains("java.awt.headless")) {
- foundHeadlessOption = true;
- }
- }
- //run in headless mode unless the user asks for something else
TIKA-2434
- if (!foundHeadlessOption) {
- fullCommand.add("-Djava.awt.headless=true");
- }
-
- fullCommand.add("org.apache.tika.batch.fs.FSBatchProcessCLI");
- //now add the process commands
- for (Map.Entry<String, String> e : processArgs.entrySet()) {
- fullCommand.add(e.getKey());
- if (e
- .getValue()
- .length() > 0) {
- fullCommand.add(commandLineSafe(e.getValue()));
- }
- }
- return fullCommand.toArray(new String[0]);
- }
-
- protected static String commandLineSafe(String arg) {
- if (arg == null) {
- return arg;
- }
- //need to test for " " on windows, can't just add double quotes
- //across platforms.
- if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) {
- arg = "\"" + arg + "\"";
- }
- return arg;
- }
-
-
- /**
- * Take the input args and separate them into args that belong on the
commandline
- * and those that belong as jvm args for the forked process.
- *
- * @param args -- literal args from TikaCLI commandline
- * @param commandLine args that should be part of the batch commandline
- * @param jvmArgs args that belong as jvm arguments for the forked
process
- */
- private static void mapifyArgs(final String[] args, final Map<String,
String> commandLine, final Map<String, String> jvmArgs) {
-
- if (args.length == 0) {
- return;
- }
-
- Matcher matcher = JVM_OPTS_PATTERN.matcher("");
- for (int i = 0; i < args.length; i++) {
- if (matcher
- .reset(args[i])
- .find()) {
- String jvmArg = matcher.group(1) + matcher.group(2);
- String v = "";
- if (i < args.length - 1 && !args[i + 1].startsWith("-")) {
- v = args[i + 1];
- i++;
- }
- jvmArgs.put(jvmArg, v);
- } else if (args[i].startsWith("-")) {
- String k = args[i];
- String v = "";
- if (i < args.length - 1 && !args[i + 1].startsWith("-")) {
- v = args[i + 1];
- i++;
- }
- commandLine.put(k, v);
- }
- }
- }
-
- private static void translateCommandLine(String[] args, Map<String,
String> map) throws IOException {
- //if there are only two args and they are both directories, treat the
first
- //as input and the second as output.
- if (args.length == 2 && !args[0].startsWith("-") &&
!args[1].startsWith("-")) {
- Path candInput = Paths.get(args[0]);
- Path candOutput = Paths.get(args[1]);
-
- if (Files.isRegularFile(candOutput)) {
- throw new IllegalArgumentException("Can't specify an existing
file as the " + "second argument for the output directory of a batch process");
- }
-
- if (Files.isDirectory(candInput)) {
- map.put("-inputDir", args[0]);
- map.put("-outputDir", args[1]);
- }
- }
- //look for tikaConfig
- for (String arg : args) {
- if (arg.startsWith("--config=")) {
- String configPath = arg.substring("--config=".length());
- map.put("-c", configPath);
- //now remove --config=x.config from the map :)
- map.remove(arg);
- break;
- }
- }
- //now translate output types
- if (map.containsKey("-h") || map.containsKey("--html")) {
- map.remove("-h");
- map.remove("--html");
- map.put("-basicHandlerType", "html");
- } else if (map.containsKey("-x") || map.containsKey("--xml")) {
- map.remove("-x");
- map.remove("--xml");
- map.put("-basicHandlerType", "xml");
- } else if (map.containsKey("-t") || map.containsKey("--text")) {
- map.remove("-t");
- map.remove("--text");
- map.put("-basicHandlerType", "text");
- } else if (map.containsKey("-m") || map.containsKey("--metadata")) {
- map.remove("-m");
- map.remove("--metadata");
- map.put("-basicHandlerType", "ignore");
- } else if (map.containsKey("-T") || map.containsKey("--text-main")) {
- map.remove("-T");
- map.remove("--text-main");
- map.put("-basicHandlerType", "body");
- }
-
- if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
- map.remove("-J");
- map.remove("--jsonRecursive");
- map.put("-recursiveParserWrapper", "true");
- }
-
- if (map.containsKey("--inputDir") || map.containsKey("-i")) {
- String v1 = map.remove("--inputDir");
- String v2 = map.remove("-i");
- String v = (v1 == null) ? v2 : v1;
- map.put("-inputDir", v);
- }
-
- if (map.containsKey("--outputDir") || map.containsKey("-o")) {
- String v1 = map.remove("--outputDir");
- String v2 = map.remove("-o");
- String v = (v1 == null) ? v2 : v1;
- map.put("-outputDir", v);
- }
-
- }
-}
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index aa087910a..f59d88743 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -67,7 +67,6 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.Tika;
import org.apache.tika.async.cli.TikaAsyncCLI;
-import org.apache.tika.batch.BatchProcessDriverCLI;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.TikaConfigSerializer;
import org.apache.tika.detect.CompositeDetector;
@@ -229,11 +228,6 @@ public class TikaCLI {
if (cli.testForHelp(args)) {
cli.usage();
return;
- } else if (cli.testForBatch(args)) {
- String[] batchArgs = BatchCommandLineBuilder.build(args);
- BatchProcessDriverCLI batchDriver = new
BatchProcessDriverCLI(batchArgs);
- batchDriver.execute();
- return;
} else if (cli.testForAsync(args)) {
async(args);
return;
@@ -322,10 +316,22 @@ public class TikaCLI {
}
private boolean testForAsync(String[] args) {
+ if (args.length == 2) {
+ if (Files.isDirectory(Paths.get(args[0]))) {
+ return true;
+ }
+ }
for (String arg : args) {
if (arg.equals("-a") || arg.equals("--async")) {
return true;
}
+ if (arg.equals("-i") || arg.startsWith("--input")) {
+ return true;
+ }
+ if (arg.equals("-o") || arg.startsWith("--output")) {
+ return true;
+ }
+
}
return false;
}
diff --git a/tika-app/src/main/resources/log4j2_batch_process.properties
b/tika-app/src/main/resources/log4j2_batch_process.properties
deleted file mode 100644
index 8715133e6..000000000
--- a/tika-app/src/main/resources/log4j2_batch_process.properties
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#info,debug, error,fatal ...
-status=info
-name=PropertiesConfig
-filters=threshold
-filter.threshold.type=ThresholdFilter
-filter.threshold.level=info
-appenders=console
-appender.console.type=Console
-appender.console.name=STDOUT
-appender.console.layout.type=PatternLayout
-appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
-rootLogger.level=info
-rootLogger.appenderRefs=stdout
-rootLogger.appenderRef.stdout.ref=STDOUT
diff --git a/tika-app/src/main/resources/tika-app-batch-config.xml
b/tika-app/src/main/resources/tika-app-batch-config.xml
deleted file mode 100644
index 12556c7f0..000000000
--- a/tika-app/src/main/resources/tika-app-batch-config.xml
+++ /dev/null
@@ -1,141 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<!-- NOTE: tika-batch is still an experimental feature.
- The configuration file will likely change and be backward incompatible
- with new versions of Tika. Please stay tuned.
- -->
-
-<tika-batch-config
- maxAliveTimeSeconds="-1"
- pauseOnEarlyTerminationMillis="10000"
- timeoutThresholdMillis="300000"
- timeoutCheckPulseMillis="1000"
- maxQueueSize="10000"
- numConsumers="default"> <!-- numConsumers = number of file consumers,
"default" = number of processors -1 -->
-
- <!-- options to allow on the commandline -->
- <commandline>
- <option opt="c" longOpt="tika-config" hasArg="true"
- description="TikaConfig file"/>
- <option opt="bc" longOpt="batch-config" hasArg="true"
- description="xml batch config file"/>
- <!-- We needed sorted for testing. We added random for performance.
- Where crawling a directory is slow, it might be beneficial to
- go randomly so that the parsers are triggered earlier. The
- default is operating system's choice ("os") which means whatever
order
- the os returns files in .listFiles(). -->
- <option opt="crawlOrder" hasArg="true"
- description="how does the crawler sort the directories and
files:
- (random|sorted|os)"/>
- <option opt="numConsumers" hasArg="true"
- description="number of fileConsumers threads"/>
- <option opt="maxFileSizeBytes" hasArg="true"
- description="maximum file size to process; do not process
files larger than this"/>
- <option opt="maxQueueSize" hasArg="true"
- description="maximum queue size for FileResources"/>
- <option opt="fileList" hasArg="true"
- description="file that contains a list of files (relative to
inputDir) to process"/>
- <option opt="fileListEncoding" hasArg="true"
- description="encoding for fileList"/>
- <option opt="inputDir" hasArg="true"
- description="root directory for the files to be processed"/>
- <option opt="startDir" hasArg="true"
- description="directory (under inputDir) at which to start
crawling"/>
- <option opt="outputDir" hasArg="true"
- description="output directory for output"/> <!-- do we want to
make this mandatory -->
- <option opt="recursiveParserWrapper"
- description="use the RecursiveParserWrapper or not (default =
false)"/>
- <option opt="streamOut" description="stream the output of the
RecursiveParserWrapper (default = false)"/>
- <option opt="handleExisting" hasArg="true"
- description="if an output file already exists, do you want to:
overwrite, rename or skip"/>
- <option opt="basicHandlerType" hasArg="true"
- description="what type of content handler: xml, text, html,
body"/>
- <option opt="outputSuffix" hasArg="true"
- description="suffix to add to the end of the output file
name"/>
- <option opt="timeoutThresholdMillis" hasArg="true"
- description="how long to wait before determining that a
consumer is stale"/>
- <option opt="includeFilePat" hasArg="true"
- description="regex that specifies which files to process"/>
- <option opt="excludeFilePat" hasArg="true"
- description="regex that specifies which files to avoid
processing"/>
- <option opt="reporterSleepMillis" hasArg="true"
- description="millisecond between reports by the reporter"/>
- <option opt="digest" hasArg="true"
- description="which digest(s) to use, e.g. 'md5,sha512'\"/>
- <option opt="digestMarkLimit" hasArg="true"
- description="max bytes to read for digest\"/>
- </commandline>
-
-
- <!-- can specify inputDir="input", but the default config should not
include this -->
- <!-- can also specify startDir="input/someDir" to specify which child
directory
- to start processing -->
- <crawler
builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
- crawlOrder="random"
- maxFilesToAdd="-1"
- maxFilesToConsider="-1"
- includeFilePat=""
- excludeFilePat=""
- maxFileSizeBytes="-1"
- />
-<!--
- This is an example of a crawler that reads a list of files to be processed
from a
- file. This assumes that the files in the list are relative to inputDir.
- <crawler class="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
- fileList="files.txt"
- fileListEncoding="UTF-8"
- maxFilesToAdd="-1"
- maxFilesToConsider="-1"
- includeFilePat="(?i).pdf$"
- excludeFilePat="(?i).msg$"
- maxFileSizeBytes="-1"
- inputDir="input"
- />
--->
- <!--
- To wrap parser in RecursiveParserWrapper (tika-app's -J or
tika-server's /rmeta),
- add attribute recursiveParserWrapper="true" to consumers element.
-
- To wrap parser with DigestingParser add attributes e.g.:
- digest="md5,sha256" digestMarkLimit="10000000"
- -->
- <consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
- recursiveParserWrapper="false"
consumersManagerMaxMillis="60000">
- <parser
builderClass="org.apache.tika.batch.builders.AppParserFactoryBuilder"
- class="org.apache.tika.batch.DigestingAutoDetectParserFactory"
- parseRecursively="true"
- digest="md5" digestMarkLimit="1000000"/>
- <contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
- basicHandlerType="xml" writeLimit="-1"/>
- <!-- can specify custom output file suffix with:
- suffix=".mysuffix"
- if no suffix is specified, BasicTikaFSConsumersBuilder does its
best to guess -->
- <!-- can specify compression with
- compression="bzip2|gzip|zip" -->
-
- <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder"
reporterSleepMillis="1000"
- reporterStaleThresholdMillis="60000"/>
- <interrupter
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
-</tika-batch-config>
\ No newline at end of file
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 4913f96cd..d0d7ce100 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -20,17 +20,30 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
+import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
-public class TikaCLIAsyncTest extends TikaCLITest {
+public class TikaCLIAsyncTest {
+
+
+ static final File TEST_DATA_FILE = new
File("src/test/resources/test-data");
+
+ /* Test members */
+ private ByteArrayOutputStream outContent = null;
+ private ByteArrayOutputStream errContent = null;
+ private PrintStream stdout = null;
+ private PrintStream stderr = null;
private static Path ASYNC_CONFIG;
@TempDir
@@ -48,6 +61,45 @@ public class TikaCLIAsyncTest extends TikaCLITest {
Files.write(ASYNC_CONFIG, xml.getBytes(UTF_8));
}
+ /**
+ * reset resourcePrefix
+ * save original System.out and System.err
+ * clear outContent and errContent if they are not empty
+ * set outContent and errContent as System.out and System.err
+ */
+ @BeforeEach
+ public void setUp() throws Exception {
+ stdout = System.out;
+ stderr = System.err;
+ resetContent();
+ }
+
+ /**
+ * Tears down the test. Returns the System.out and System.err
+ */
+ @AfterEach
+ public void tearDown() {
+ System.setOut(stdout);
+ System.setErr(stderr);
+ }
+
+ /**
+ * clear outContent and errContent if they are not empty by create a new
one.
+ * set outContent and errContent as System.out and System.err
+ */
+ private void resetContent() throws Exception {
+ if (outContent == null || outContent.size() > 0) {
+ outContent = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(outContent, true, UTF_8.name()));
+ }
+
+ if (errContent == null || errContent.size() > 0) {
+ errContent = new ByteArrayOutputStream();
+ System.setErr(new PrintStream(errContent, true, UTF_8.name()));
+ }
+ }
+
+
@Test
public void testAsync() throws Exception {
String content = getParamOutContent("-a", "--config=" +
ASYNC_CONFIG.toAbsolutePath());
@@ -83,5 +135,14 @@ public class TikaCLIAsyncTest extends TikaCLITest {
}
}
+ /**
+ * reset outContent and errContent if they are not empty
+ * run given params in TikaCLI and return outContent String with UTF-8
+ */
+ String getParamOutContent(String... params) throws Exception {
+ resetContent();
+ TikaCLI.main(params);
+ return outContent.toString("UTF-8");
+ }
}
diff --git
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
deleted file mode 100644
index d25b35cfd..000000000
---
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.cli;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.utils.ProcessUtils;
-
-public class TikaCLIBatchCommandLineTest {
-
- Path testInput = null;
- Path testFile = null;
-
- String testInputPathForCommandLine;
- String escapedInputPathForCommandLine;
-
- @BeforeEach
- public void init() {
- testInput = Paths.get("testInput");
- try {
- Files.createDirectories(testInput);
- } catch (IOException e) {
- throw new RuntimeException("Failed to open test input directory");
- }
- testFile = Paths.get("testFile.txt");
- try (OutputStream os = Files.newOutputStream(testFile)) {
- IOUtils.write("test output", os, UTF_8);
- } catch (IOException e) {
- throw new RuntimeException("Couldn't open testFile");
- }
- testInputPathForCommandLine = testInput
- .toAbsolutePath()
- .toString();
- escapedInputPathForCommandLine =
BatchCommandLineBuilder.commandLineSafe(testInputPathForCommandLine);
- }
-
- @AfterEach
- public void tearDown() {
- try {
- //TODO: refactor this to use our FileUtils.deleteDirectory(Path)
- //when that is ready
- FileUtils.deleteDirectory(testInput.toFile());
- } catch (IOException e) {
- throw new RuntimeException(e);
- } finally {
- try {
- Files.deleteIfExists(testFile);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- }
- }
-
- @Test
- public void testJVMOpts() throws Exception {
- String[] params = {"-JXmx1g",
"-JDlog4j.configuration=batch_process_log4j.xml", "-inputDir",
testInputPathForCommandLine, "-outputDir", "testout-output"};
-
-
- String[] commandLine = BatchCommandLineBuilder.build(params);
- StringBuilder sb = new StringBuilder();
-
- for (String s : commandLine) {
- sb
- .append(s)
- .append(" ");
- }
- String s = sb.toString();
- int classInd = s.indexOf("org.apache.tika.batch.fs.FSBatchProcessCLI");
- int xmx = s.indexOf("-Xmx1g");
- int inputDir = s.indexOf("-inputDir");
- int log = s.indexOf("-Dlog4j.configuration");
- assertTrue(classInd > -1);
- assertTrue(xmx > -1);
- assertTrue(inputDir > -1);
- assertTrue(log > -1);
- assertTrue(xmx < classInd);
- assertTrue(log < classInd);
- assertTrue(inputDir > classInd);
- }
-
- @Test
- public void testBasicMappingOfArgs() throws Exception {
- String[] params = {"-JXmx1g",
"-JDlog4j.configuration=batch_process_log4j.xml", "-bc", "batch-config.xml",
"-J", "-h", "-inputDir", testInputPathForCommandLine};
-
- String[] commandLine = BatchCommandLineBuilder.build(params);
- Map<String, String> attrs = mapify(commandLine);
- assertEquals("true", attrs.get("-recursiveParserWrapper"));
- assertEquals("html", attrs.get("-basicHandlerType"));
- assertEquals("batch-config.xml", attrs.get("-bc"));
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- }
-
- @Test
- public void testTwoDirsNoFlags() throws Exception {
- String outputRoot = "outputRoot";
-
- String[] params = {testInputPathForCommandLine, outputRoot};
-
- String[] commandLine = BatchCommandLineBuilder.build(params);
- Map<String, String> attrs = mapify(commandLine);
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- assertEquals(outputRoot, attrs.get("-outputDir"));
- }
-
- @Test
- public void testTwoDirsVarious() throws Exception {
- String outputRoot = "outputRoot";
- String[] params = {"-i", testInputPathForCommandLine, "-o",
outputRoot};
-
- String[] commandLine = BatchCommandLineBuilder.build(params);
- Map<String, String> attrs = mapify(commandLine);
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- assertEquals(outputRoot, attrs.get("-outputDir"));
-
- params = new String[]{"--inputDir", testInputPathForCommandLine,
"--outputDir", outputRoot};
-
- commandLine = BatchCommandLineBuilder.build(params);
- attrs = mapify(commandLine);
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- assertEquals(outputRoot, attrs.get("-outputDir"));
-
- params = new String[]{"-inputDir", testInputPathForCommandLine,
"-outputDir", outputRoot};
-
- commandLine = BatchCommandLineBuilder.build(params);
- attrs = mapify(commandLine);
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- assertEquals(outputRoot, attrs.get("-outputDir"));
- }
-
- @Test
- public void testConfig() throws Exception {
- String outputRoot = "outputRoot";
- String configPath = "c:/somewhere/someConfig.xml";
-
- String[] params = {"--inputDir", testInputPathForCommandLine,
"--outputDir", outputRoot, "--config=" + configPath};
- String[] commandLine = BatchCommandLineBuilder.build(params);
- Map<String, String> attrs = mapify(commandLine);
- assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
- assertEquals(outputRoot, attrs.get("-outputDir"));
- assertEquals(configPath, attrs.get("-c"));
-
- }
-
- @Test
- public void testOneDirOneFileException() throws Exception {
- boolean ex = false;
- try {
- String path = testFile
- .toAbsolutePath()
- .toString();
- path = ProcessUtils.escapeCommandLine(path);
- String[] params = {testInputPathForCommandLine, path};
-
- String[] commandLine = BatchCommandLineBuilder.build(params);
- fail("Not allowed to have one dir and one file");
- } catch (IllegalArgumentException e) {
- ex = true;
- }
- assertTrue(ex, "exception on <dir> <file>");
- }
-
- private Map<String, String> mapify(String[] args) {
- Map<String, String> map = new LinkedHashMap<>();
- for (int i = 0; i < args.length; i++) {
- if (args[i].startsWith("-")) {
- String k = args[i];
- String v = "";
- if (i < args.length - 1 && !args[i + 1].startsWith("-")) {
- v = args[i + 1];
- i++;
- }
- map.put(k, v);
- }
- }
- return map;
- }
-
-}
diff --git
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
deleted file mode 100644
index d42a9795f..000000000
---
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.cli;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import java.io.ByteArrayOutputStream;
-import java.io.OutputStream;
-import java.io.PrintStream;
-import java.io.Reader;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.util.List;
-
-import org.apache.commons.io.FileUtils;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.serialization.JsonMetadataList;
-
-public class TikaCLIBatchIntegrationTest {
-
- private final String propsFileName =
"log4j2_batch_process_test.properties";
- private Path testInputDir = Paths.get("src/test/resources/test-data");
- private String testInputDirForCommandLine;
- private Path tempOutputDir;
- private String tempOutputDirForCommandLine;
- private Path customBatchLogging;
- private OutputStream out = null;
- private OutputStream err = null;
- private ByteArrayOutputStream outBuffer = null;
- private ByteArrayOutputStream errBuffer = null;
- private Path configFile = null;
-
- @BeforeEach
- public void setup() throws Exception {
- tempOutputDir = Files.createTempDirectory("tika-cli-test-batch-");
- outBuffer = new ByteArrayOutputStream();
- PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name());
- errBuffer = new ByteArrayOutputStream();
- PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name());
- out = System.out;
- err = System.err;
- System.setOut(outWriter);
- System.setErr(errWriter);
- testInputDirForCommandLine = testInputDir
- .toAbsolutePath()
- .toString();
- tempOutputDirForCommandLine = tempOutputDir
- .toAbsolutePath()
- .toString();
- customBatchLogging = tempOutputDir.resolve(propsFileName);
- configFile = Files.createTempFile("tika-app-batch-", ".xml");
- Files.copy(this
- .getClass()
- .getResourceAsStream("/" + propsFileName), customBatchLogging);
- Files.copy(this
- .getClass()
- .getResourceAsStream("/test-data/tika-config1.xml"),
configFile, StandardCopyOption.REPLACE_EXISTING);
- }
-
- @AfterEach
- public void tearDown() throws Exception {
- System.setOut(new PrintStream(out, true, UTF_8.name()));
- System.setErr(new PrintStream(err, true, UTF_8.name()));
- //TODO: refactor to use our deleteDirectory with straight path
- FileUtils.deleteDirectory(tempOutputDir.toFile());
- Files.delete(configFile);
- }
-
- @Test
- public void testSimplestBatchIntegration() throws Exception {
- String[] params = {testInputDirForCommandLine,
tempOutputDirForCommandLine};
- TikaCLI.main(params);
- assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
- assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
- }
-
- @Test
- public void testTikaConfig() throws Exception {
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "--config=" + configFile
- .toAbsolutePath()
- .toString()};
- TikaCLI.main(params);
- assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
- assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
- }
-
- @Test
- public void testBasicBatchIntegration() throws Exception {
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "-numConsumers", "2"};
- TikaCLI.main(params);
-
- assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
- assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
- }
-
- @Test
- public void testJsonRecursiveBatchIntegration() throws Exception {
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "-numConsumers", "10", "-J", //recursive Json
- "-t" //plain text in content
- };
- TikaCLI.main(params);
-
- Path jsonFile =
tempOutputDir.resolve("test_recursive_embedded.docx.json");
- try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
- List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
- assertTrue(metadataList
- .get(6)
- .get(TikaCoreProperties.TIKA_CONTENT)
- .contains("human events"));
- }
- }
-
- @Test
- public void testStreamingJsonRecursiveBatchIntegration() throws Exception {
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "-numConsumers", "10", "-J", //recursive Json
- "-t", //plain text in content
- "-streamOut"};
- TikaCLI.main(params);
-
- Path jsonFile =
tempOutputDir.resolve("test_recursive_embedded.docx.json");
- try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
- List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
- assertTrue(metadataList
- .get(6)
- .get(TikaCoreProperties.TIKA_CONTENT)
- .contains("human events"));
- //test that the last written object has been bumped to the first
by JsonMetadataList.fromJson()
- assertNull(metadataList
- .get(0)
- .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
- }
- }
-
- @Test
- public void testProcessLogFileConfig() throws Exception {
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "-numConsumers", "2",
"-JDlog4j.configurationFile=" + customBatchLogging.toUri()};
- TikaCLI.main(params);
-
- assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
- assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
- String sysOutString = new String(outBuffer.toByteArray(), UTF_8);
- assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
- }
-
- @Test
- public void testDigester() throws Exception {
-/*
- try {
- String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
- "-o", escape(tempOutputDir.getAbsolutePath()),
- "-numConsumers", "10",
- "-J", //recursive Json
- "-t" //plain text in content
- };
- TikaCLI.main(params);
- reader = new InputStreamReader(
- new FileInputStream(new File(tempOutputDir,
"test_recursive_embedded.docx.json")), UTF_8);
- List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
- assertEquals("59f626e09a8c16ab6dbc2800c685f772",
metadataList.get(0).get("X-TIKA:digest:MD5"));
- assertEquals("22e6e91f408d018417cd452d6de3dede",
metadataList.get(5).get("X-TIKA:digest:MD5"));
- } finally {
- IOUtils.closeQuietly(reader);
- }
-*/
- String[] params = {"-i", testInputDirForCommandLine, "-o",
tempOutputDirForCommandLine, "-numConsumers", "10", "-J", //recursive Json
- "-t", //plain text in content
- "-digest", "sha512"};
- TikaCLI.main(params);
- Path jsonFile =
tempOutputDir.resolve("test_recursive_embedded.docx.json");
- try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
-
- List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
- assertNotNull(metadataList
- .get(0)
- .get("X-TIKA:digest:SHA512"));
- assertTrue(metadataList
- .get(0)
- .get("X-TIKA:digest:SHA512")
- .startsWith("ee46d973ee1852c01858"));
- }
- }
-
- private void assertFileExists(Path path) {
- assertTrue(Files.isRegularFile(path), "File doesn't exist: " +
path.toAbsolutePath());
- }
-
-}
diff --git
a/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
b/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
deleted file mode 100644
index bcd31dffa..000000000
---
a/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.extractor;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.batch.DigestingAutoDetectParserFactory;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-
-public class TestEmbeddedDocumentUtil {
- //TODO -- figure out how to mock this into tika-core
-
- @Test
- public void testSimple() {
- Parser p = new AutoDetectParser();
- ParseContext parseContext = new ParseContext();
- parseContext.set(Parser.class, p);
- Parser txtParser =
EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.csv.TextAndCSVParser.class,
parseContext);
- assertNotNull(txtParser);
- assertEquals(org.apache.tika.parser.csv.TextAndCSVParser.class,
txtParser.getClass());
-
- }
-
- @Test
- public void testDoublyDecorated() {
- Parser d = new
DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig());
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(d, true);
- ParseContext parseContext = new ParseContext();
- parseContext.set(Parser.class, wrapper);
- Parser txtParser =
EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.csv.TextAndCSVParser.class,
parseContext);
- assertNotNull(txtParser);
- assertEquals(org.apache.tika.parser.csv.TextAndCSVParser.class,
txtParser.getClass());
- }
-}
diff --git a/tika-pipes/tika-async-cli/pom.xml
b/tika-pipes/tika-async-cli/pom.xml
index 4151f528d..aab6604f4 100644
--- a/tika-pipes/tika-async-cli/pom.xml
+++ b/tika-pipes/tika-async-cli/pom.xml
@@ -38,11 +38,8 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
</dependency>
<!-- logging -->
<dependency>
@@ -53,6 +50,13 @@
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-emitter-fs</artifactId>
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
new file mode 100644
index 000000000..34458beb0
--- /dev/null
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
/**
 * Immutable holder for the commandline options of the async CLI.
 * <p>
 * Any field may be {@code null}, meaning the user did not specify that
 * option; consumers (e.g. the tika-config writer) apply their own defaults.
 */
class SimpleAsyncConfig {

    private final String inputDir;
    private final String outputDir;
    private final Integer numClients;
    private final Long timeoutMs;
    private final String xmx;
    private final String fileList;

    /**
     * @param inputDir directory to read files from; may be {@code null}
     * @param outputDir directory to emit results to; may be {@code null}
     * @param numClients number of forked client processes; may be {@code null}
     * @param timeoutMs per-parse timeout in milliseconds; may be {@code null}
     * @param xmx heap size for forked clients (e.g. "1g"); may be {@code null}
     * @param fileList path to a file listing inputs; may be {@code null}
     */
    public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList) {
        this.inputDir = inputDir;
        this.outputDir = outputDir;
        this.numClients = numClients;
        this.timeoutMs = timeoutMs;
        this.xmx = xmx;
        this.fileList = fileList;
    }

    public String getInputDir() {
        return inputDir;
    }

    public String getOutputDir() {
        return outputDir;
    }

    public Integer getNumClients() {
        return numClients;
    }

    public Long getTimeoutMs() {
        return timeoutMs;
    }

    public String getXmx() {
        return xmx;
    }

    public String getFileList() {
        return fileList;
    }
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 37b4d4262..93b83ddda 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -16,10 +16,17 @@
*/
package org.apache.tika.async.cli;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.TimeoutException;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -32,8 +39,91 @@ public class TikaAsyncCLI {
private static final long TIMEOUT_MS = 600_000;
private static final Logger LOG =
LoggerFactory.getLogger(TikaAsyncCLI.class);
+ private static Options getOptions() {
+ Options options = new Options();
+ options.addOption("i", "inputDir", true, "input directory");
+ options.addOption("o", "outputDir", true, "output directory");
+
+ options.addOption("n", "numClients", true, "number of forked clients");
+ options.addOption("x", "Xmx", true, "heap for the forked clients in
usual jvm heap amount, e.g. -x 1g");
+ options.addOption("?", "help", false, "this help message");
+ options.addOption("t", "timeoutMs", true, "timeout for each parse in
milliseconds");
+ options.addOption("l", "fileList", true, "file list");
+
+ return options;
+ }
+
public static void main(String[] args) throws Exception {
- Path tikaConfigPath = Paths.get(args[0]);
+ if (args.length == 0) {
+ usage(getOptions());
+ } else if (args.length == 1) {
+ processWithTikaConfig(Paths.get(args[0]));
+ } else {
+ processCommandLine(args);
+ }
+ }
+
+ private static void processCommandLine(String[] args) throws Exception {
+
+ SimpleAsyncConfig simpleAsyncConfig = parseCommandLine(args);
+
+ Path tikaConfig = null;
+ try {
+ tikaConfig = Files.createTempFile("tika-async-tmp-", ".xml");
+ TikaConfigAsyncWriter tikaConfigAsyncWriter = new
TikaConfigAsyncWriter(simpleAsyncConfig);
+ tikaConfigAsyncWriter.write(tikaConfig);
+ processWithTikaConfig(tikaConfig);
+ } finally {
+ if (tikaConfig != null) {
+ Files.delete(tikaConfig);
+ }
+ }
+ }
+
+ //not private for testing purposes
+ static SimpleAsyncConfig parseCommandLine(String[] args) throws
ParseException {
+ if (args.length == 2 && ! args[0].startsWith("-")) {
+ return new SimpleAsyncConfig(args[0], args[1], null, null, null,
null);
+ }
+
+ Options options = getOptions();
+
+ CommandLineParser cliParser = new DefaultParser();
+
+ CommandLine line = cliParser.parse(options, args);
+ if (line.hasOption("help")) {
+ usage(options);
+ }
+ String inputDir = null;
+ String outputDir = null;
+ String xmx = null;
+ Long timeoutMs = null;
+ Integer numClients = null;
+ String fileList = null;
+ if (line.hasOption("i")) {
+ inputDir = line.getOptionValue("i");
+ }
+ if (line.hasOption("o")) {
+ outputDir = line.getOptionValue("o");
+ }
+ if (line.hasOption("x")) {
+ xmx = line.getOptionValue("x");
+ }
+ if (line.hasOption("t")) {
+ timeoutMs = Long.parseLong(line.getOptionValue("t"));
+ }
+ if (line.hasOption("n")) {
+ numClients = Integer.parseInt(line.getOptionValue("n"));
+ }
+ if (line.hasOption("l")) {
+ fileList = line.getOptionValue("l");
+ }
+ return new SimpleAsyncConfig(inputDir, outputDir,
+ numClients, timeoutMs, xmx, fileList);
+ }
+
+
+ private static void processWithTikaConfig(Path tikaConfigPath) throws
Exception {
PipesIterator pipesIterator = PipesIterator.build(tikaConfigPath);
long start = System.currentTimeMillis();
try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath,
pipesIterator)) {
@@ -54,6 +144,16 @@ public class TikaAsyncCLI {
}
long elapsed = System.currentTimeMillis() - start;
LOG.info("Successfully finished processing {} files in {} ms",
processor.getTotalProcessed(), elapsed);
+
}
}
+
+ private static void usage(Options options) {
+ System.out.println("Two primary options:");
+ System.out.println("\t1. Specify a tika-config.xml on the commandline
that includes the definitions for async");
+ System.out.println("\t2. Commandline:");
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp("tikaAsynCli", options);
+ System.exit(1);
+ }
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
new file mode 100644
index 000000000..ddb5ecdb4
--- /dev/null
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+import org.apache.tika.utils.StringUtils;
+
+class TikaConfigAsyncWriter {
+
+ private static final String FETCHER_NAME = "fsf";
+ private static final String EMITTER_NAME = "fse";
+
+ private final SimpleAsyncConfig simpleAsyncConfig;
+
+ TikaConfigAsyncWriter(SimpleAsyncConfig simpleAsyncConfig) {
+ this.simpleAsyncConfig = simpleAsyncConfig;
+ }
+
+ void write(Path output) throws IOException {
+ try {
+ _write(output);
+ } catch (Exception e) {
+ throw new IOException(e);
+ }
+ }
+
+ void _write(Path output) throws ParserConfigurationException,
TransformerException, IOException {
+ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+ Document document = dbf.newDocumentBuilder().newDocument();
+ Element properties = document.createElement("properties");
+ document.appendChild(properties);
+ writePipesIterator(document, properties);
+ writeFetchers(document, properties);
+ writeEmitters(document, properties);
+ writeAsync(document, properties);
+ Transformer transformer = TransformerFactory
+ .newInstance().newTransformer();
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
+ try (Writer writer = Files.newBufferedWriter(output,
StandardCharsets.UTF_8)) {
+ StreamResult result = new StreamResult(writer);
+ DOMSource source = new DOMSource(document);
+ transformer.transform(source, result);
+ }
+
+ }
+
+ private void writePipesIterator(Document document, Element properties) {
+ if (! StringUtils.isBlank(simpleAsyncConfig.getFileList())) {
+ writeFileListIterator(document, properties);
+ } else {
+ writeFileSystemIterator(document, properties);
+ }
+ }
+
+ private void writeFileSystemIterator(Document document, Element
properties) {
+ Element pipesIterator = createAndGetElement(document, properties,
"pipesIterator",
+ "class",
"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator");
+ appendTextElement(document, pipesIterator, "basePath",
+
Paths.get(simpleAsyncConfig.getInputDir()).toAbsolutePath().toString());
+ appendTextElement(document, pipesIterator, "fetcherName",
FETCHER_NAME);
+ appendTextElement(document, pipesIterator, "emitterName",
EMITTER_NAME);
+ }
+
+ private void writeFileListIterator(Document document, Element properties) {
+ Element pipesIterator = createAndGetElement(document, properties,
"pipesIterator",
+ "class",
"org.apache.tika.pipes.pipesiterator.filelist.FileListPipesIterator");
+ appendTextElement(document, pipesIterator, "fetcherName",
FETCHER_NAME);
+ appendTextElement(document, pipesIterator, "emitterName",
EMITTER_NAME);
+ appendTextElement(document, pipesIterator, "fileList",
+
Paths.get(simpleAsyncConfig.getFileList()).toAbsolutePath().toString());
+ appendTextElement(document, pipesIterator, "hasHeader", "false");
+ }
+
+ private void writeEmitters(Document document, Element properties) {
+ Element emitters = createAndGetElement(document, properties,
"emitters");
+ Element emitter = createAndGetElement( document, emitters, "emitter",
+ "class", "org.apache.tika.pipes.emitter.fs.FileSystemEmitter");
+ appendTextElement(document, emitter, "name", EMITTER_NAME);
+ appendTextElement(document, emitter, "basePath",
+
Paths.get(simpleAsyncConfig.getOutputDir()).toAbsolutePath().toString());
+ }
+
+ private void writeFetchers(Document document, Element properties) {
+ Element fetchers = createAndGetElement(document, properties,
"fetchers");
+ Element fetcher = createAndGetElement(document, fetchers, "fetcher",
+ "class", "org.apache.tika.pipes.fetcher.fs.FileSystemFetcher");
+ appendTextElement(document, fetcher, "name", FETCHER_NAME);
+ if (!StringUtils.isBlank(simpleAsyncConfig.getInputDir())) {
+ appendTextElement(document, fetcher, "basePath", Paths
+ .get(simpleAsyncConfig.getInputDir())
+ .toAbsolutePath()
+ .toString());
+ } else {
+ appendTextElement(document, fetcher, "basePath", "");
+ }
+ }
+
+ private void writeAsync(Document document, Element properties) {
+ Element async = createAndGetElement(document, properties, "async");
+ properties.appendChild(async);
+ if (simpleAsyncConfig.getNumClients() != null) {
+ appendTextElement(document, async, "numClients",
Integer.toString(simpleAsyncConfig.getNumClients()));
+ }
+ if (simpleAsyncConfig.getXmx() != null) {
+ Element forkedJvmArgs = createAndGetElement(document, async,
"forkedJvmArgs");
+ appendTextElement(document, forkedJvmArgs, "arg", "-Xmx" +
simpleAsyncConfig.getXmx());
+ }
+ if (simpleAsyncConfig.getTimeoutMs() != null) {
+ appendTextElement(document, async, "timeoutMillis",
Long.toString(simpleAsyncConfig.getTimeoutMs()));
+ }
+ }
+
+ private static void appendTextElement(Document document, Element parent,
String itemName, String text, String... attrs) {
+ Element el = createAndGetElement(document, parent, itemName, attrs);
+ el.setTextContent(text);
+ }
+
+ private static Element createAndGetElement(Document document, Element
parent, String elementName, String... attrs) {
+ Element el = document.createElement(elementName);
+ parent.appendChild(el);
+ for (int i = 0; i < attrs.length; i += 2) {
+ el.setAttribute(attrs[i], attrs[i + 1]);
+ }
+ return el;
+ }
+
+}
diff --git
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
new file mode 100644
index 000000000..4e38aac9c
--- /dev/null
+++
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import org.junit.jupiter.api.Test;
+
+public class AsyncCliParserTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ SimpleAsyncConfig simpleAsyncConfig =
TikaAsyncCLI.parseCommandLine(new String[]{"input", "output"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertNull(simpleAsyncConfig.getNumClients());
+ assertNull(simpleAsyncConfig.getTimeoutMs());
+ assertNull(simpleAsyncConfig.getXmx());
+
+ simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new String[]{"-o",
"output", "-i", "input"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertNull(simpleAsyncConfig.getNumClients());
+ assertNull(simpleAsyncConfig.getTimeoutMs());
+ assertNull(simpleAsyncConfig.getXmx());
+
+ simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new
String[]{"-output", "output", "-input", "input"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertNull(simpleAsyncConfig.getNumClients());
+ assertNull(simpleAsyncConfig.getTimeoutMs());
+ assertNull(simpleAsyncConfig.getXmx());
+
+ simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new
String[]{"--output", "output", "--input", "input"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertNull(simpleAsyncConfig.getNumClients());
+ assertNull(simpleAsyncConfig.getTimeoutMs());
+ assertNull(simpleAsyncConfig.getXmx());
+
+ simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new
String[]{"--output=output", "--input=input"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertNull(simpleAsyncConfig.getNumClients());
+ assertNull(simpleAsyncConfig.getTimeoutMs());
+ assertNull(simpleAsyncConfig.getXmx());
+ }
+
+ @Test
+ public void testAll() throws Exception {
+ SimpleAsyncConfig simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(
+ new String[]{"-i", "input", "-o", "output", "-n", "5", "-t",
"30000", "-x", "1g"});
+ assertEquals("input", simpleAsyncConfig.getInputDir());
+ assertEquals("output", simpleAsyncConfig.getOutputDir());
+ assertNull(simpleAsyncConfig.getFileList());
+ assertEquals(5, simpleAsyncConfig.getNumClients());
+ assertEquals(30000L, simpleAsyncConfig.getTimeoutMs());
+ assertEquals("1g", simpleAsyncConfig.getXmx());
+ }
+
+ //TODO -- test for file list with and without inputDir
+}