This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4519
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4519 by this push:
new af7b25209 TIKA-4519 -- tika-app tests work
af7b25209 is described below
commit af7b25209b282e034abd7622168b40e68a8bbf18
Author: tallison <[email protected]>
AuthorDate: Mon Nov 3 14:39:32 2025 -0500
TIKA-4519 -- tika-app tests work
---
tika-app/pom.xml | 7 +++
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 26 ++++++----
.../test/java/org/apache/tika/cli/TikaCLITest.java | 14 +++---
tika-pipes/tika-async-cli/pom.xml | 6 +++
.../org/apache/tika/async/cli/PluginsWriter.java | 14 ++++--
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 1 +
.../tika/async/cli/TikaConfigAsyncWriter.java | 17 +------
.../apache/tika/async/cli/AsyncProcessorTest.java | 55 ++++++++++------------
.../tika/async/cli/TikaConfigAsyncWriterTest.java | 2 +-
.../test/resources/configs/TIKA-4207-emitter.xml | 29 ------------
.../test/resources/configs/tika-config-default.xml | 21 +++++++++
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 25 +++++++++-
.../apache/tika/pipes/core/async/AsyncConfig.java | 6 ++-
.../tika/pipes/core/async/AsyncProcessor.java | 2 +-
14 files changed, 126 insertions(+), 99 deletions(-)
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 97b0b6e16..28b558495 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -153,6 +153,13 @@
<type>jar</type>
<overWrite>true</overWrite>
</artifactItem>
+ <artifactItem>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-emitter-file-system</artifactId>
+ <version>${project.version}</version>
+ <type>jar</type>
+ <overWrite>true</overWrite>
+ </artifactItem>
</artifactItems>
</configuration>
</execution>
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index b0f879d80..7ea7982c5 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -43,13 +43,21 @@ public class TikaCLIAsyncTest {
final static String JSON_TEMPLATE = """
{
- "pipesPluginsConfig" : {
+ "plugins" : {
"fetchers": {
"file-system-fetcher": {
- "basePath": "BASE_PATH",
+ "basePath": "FETCHER_BASE_PATH",
"extractFileSystemMetadata": false
}
},
+ "emitters": {
+ "file-system-emitter": {
+ "basePath": "EMITTER_BASE_PATH",
+ "fileExtension": "jsn",
+ "onExists":"EXCEPTION",
+ "prettyPrint": true
+ }
+ },
"pf4j.pluginsDir": "PLUGINS_DIR"
}
}
@@ -73,15 +81,15 @@ public class TikaCLIAsyncTest {
public static void setUpClass() throws Exception {
ASYNC_CONFIG = Files.createTempFile(ASYNC_OUTPUT_DIR, "async-config-",
".xml");
String xml = "<properties>" + "<async>" + "<numClients>3</numClients>"
+ "<tikaConfig>" + ASYNC_CONFIG.toAbsolutePath() + "</tikaConfig>" + "</async>"
+
- "<emitters>" + "<emitter
class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
"<name>fse</name>" + "<basePath>" +
- ASYNC_OUTPUT_DIR.toAbsolutePath() + "</basePath>" +
"<prettyPrint>true</prettyPrint>" + "</emitter>" + "</emitters>" +
"<pipesIterator
class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
"<basePath>" + TEST_DATA_FILE.getAbsolutePath() + "</basePath>" +
- "<fetcherName>file-system-fetcher</fetcherName>" +
"<emitterName>fse</emitterName>" + "</pipesIterator>" + "</properties>";
+ "<fetcherPluginId>file-system-fetcher</fetcherPluginId>" +
"<emitterPluginId>file-system-emitter</emitterPluginId>" + "</pipesIterator>" +
"</properties>";
Files.write(ASYNC_CONFIG, xml.getBytes(UTF_8));
ASYNC_PLUGINS_CONFIG = Files.createTempFile(ASYNC_OUTPUT_DIR,
"plugins-", ".json");
Path pluginsDir = Paths.get("target/plugins");
- String json = JSON_TEMPLATE.replace("BASE_PATH",
TEST_DATA_FILE.getAbsolutePath().toString()).replace("PLUGINS_DIR",
pluginsDir.toAbsolutePath().toString());
+ String json = JSON_TEMPLATE.replace("FETCHER_BASE_PATH",
TEST_DATA_FILE.getAbsolutePath().toString())
+ .replace("EMITTER_BASE_PATH",
ASYNC_OUTPUT_DIR.toAbsolutePath().toString())
+ .replace("PLUGINS_DIR",
pluginsDir.toAbsolutePath().toString());
Files.writeString(ASYNC_PLUGINS_CONFIG, json, UTF_8);
}
@@ -126,6 +134,8 @@ public class TikaCLIAsyncTest {
@Test
public void testAsync() throws Exception {
+ //extension is "jsn" to avoid conflict with json config
+
String content = getParamOutContent("-c",
ASYNC_CONFIG.toAbsolutePath().toString(),
"-a", ASYNC_PLUGINS_CONFIG.toAbsolutePath().toString());
@@ -135,11 +145,11 @@ public class TikaCLIAsyncTest {
.listFiles()) {
if (f
.getName()
- .endsWith(".json")) {
+ .endsWith(".jsn")) {
//check one file for pretty print
if (f
.getName()
- .equals("coffee.xls.json")) {
+ .equals("coffee.xls.jsn")) {
checkForPrettyPrint(f);
}
json++;
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index b4f6301a7..cd10f4566 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -282,8 +282,10 @@ public class TikaCLITest {
@Test
public void testRUnpack() throws Exception {
+ //TODO -- rework this to use two separate emitters
+ //one for bytes and one for json
String[] expectedChildren = new String[]{
- "testPDFPackage.pdf.json",
+ "testPDFPackage.pdf.jsn",
//the first two test that the default single file config is
working
"testPDFPackage.pdf-embed/00000001-embedded-1",
"testPDFPackage.pdf-embed/00000002-image0.jpg",
@@ -294,7 +296,7 @@ public class TikaCLITest {
@Test
public void testPSTRUnpack() throws Exception {
- String[] expectedChildren = new String[]{"testPST.pst.json",
+ String[] expectedChildren = new String[]{"testPST.pst.jsn",
"testPST.pst-embed/00000007-First email.msg",
"testPST.pst-embed/00000001-Feature Generators.msg",
"testPST.pst-embed/00000008-First email.msg",
@@ -305,7 +307,7 @@ public class TikaCLITest {
"testPST.pst-embed/00000009-attachment.docx",
"testPST.pst-embed/00000006-[WEBINAR] - %22Introducing
Couchbase Server 2.5%22.msg"};
testRecursiveUnpack("testPST.pst", expectedChildren, 2);
- try (Reader reader =
Files.newBufferedReader(extractDir.resolve("testPST.pst.json"))) {
+ try (Reader reader =
Files.newBufferedReader(extractDir.resolve("testPST.pst.jsn"))) {
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
for (Metadata m : metadataList) {
String content = m.get(TikaCoreProperties.TIKA_CONTENT);
@@ -400,13 +402,13 @@ public class TikaCLITest {
Path asyncConfig = Files.createTempFile("async-config-", ".json");
Path pluginsDir = Paths.get("target/plugins");
- String json = JSON_TEMPLATE.replace("BASE_PATH",
TEST_DATA_FILE.getAbsolutePath().toString())
+ String json = JSON_TEMPLATE.replace("FETCHER_BASE_PATH",
TEST_DATA_FILE.getAbsolutePath().toString())
+ .replace("EMITTER_BASE_PATH",
extractDir.toAbsolutePath().toString())
.replace("PLUGINS_DIR",
pluginsDir.toAbsolutePath().toString());
Files.writeString(asyncConfig, json, UTF_8);
String[] params = {"-Z",
"-a", asyncConfig.toAbsolutePath().toString(),
-
ProcessUtils.escapeCommandLine(input.toAbsolutePath().toString()),
ProcessUtils.escapeCommandLine(extractDir
.toAbsolutePath()
@@ -424,7 +426,7 @@ public class TikaCLITest {
assertEquals(expectedLength, jsonFile.length);
for (String expectedChildName : expectedChildrenFileNames) {
- assertTrue(fileNames.contains(expectedChildName));
+ assertTrue(fileNames.contains(expectedChildName),
expectedChildName);
}
}
diff --git a/tika-pipes/tika-async-cli/pom.xml b/tika-pipes/tika-async-cli/pom.xml
index 4fe950200..e1c02d5f1 100644
--- a/tika-pipes/tika-async-cli/pom.xml
+++ b/tika-pipes/tika-async-cli/pom.xml
@@ -50,6 +50,12 @@
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-emitter-file-system</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-fetcher-file-system</artifactId>
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index 80d7d46f0..308ad4ae1 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -25,13 +25,19 @@ public class PluginsWriter {
final static String JSON_TEMPLATE = """
{
- "pipesPluginsConfig" : {
+ "plugins" : {
"fetchers": {
"file-system-fetcher": {
- "basePath": "BASE_PATH",
+ "basePath": "FETCHER_BASE_PATH",
"extractFileSystemMetadata": false
}
},
+ "emitters": {
+ "file-system-emitter": {
+ "basePath": "EMITTER_BASE_PATH",
+ "fileExtension": "json"
+ }
+ },
"pf4j.pluginsDir": "PLUGINS_DIR"
}
}
@@ -52,8 +58,8 @@ public class PluginsWriter {
}
}
try {
- String json = JSON_TEMPLATE.replace("BASE_PATH",
baseInput.toAbsolutePath().toString());
- System.out.println("PWD: " + Paths.get("").toAbsolutePath());
+ String json = JSON_TEMPLATE.replace("FETCHER_BASE_PATH",
baseInput.toAbsolutePath().toString());
+ json = json.replace("EMITTER_BASE_PATH",
baseOutput.toAbsolutePath().toString());
String pluginString = "plugins";
Path plugins = Paths.get(pluginString);
if (Files.isDirectory(plugins)) {
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 3eb29bb39..40ca2b2bd 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -103,6 +103,7 @@ public class TikaAsyncCLI {
try {
pipesIterator = PipesIterator.build(tikaConfig);
} catch (IOException | TikaException e) {
+ e.printStackTrace();
//swallow
}
}
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
index e8306c9da..4fa8af1fb 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
@@ -50,7 +50,7 @@ class TikaConfigAsyncWriter {
private static final Logger LOG =
LoggerFactory.getLogger(TikaAsyncCLI.class);
protected static final String FETCHER_NAME = "file-system-fetcher";
- protected static final String EMITTER_NAME = "fse";
+ protected static final String EMITTER_NAME = "file-system-emitter";
private final SimpleAsyncConfig simpleAsyncConfig;
@@ -92,7 +92,6 @@ class TikaConfigAsyncWriter {
}
writePipesIterator(document, properties, baseInput);
- writeEmitters(document, properties, baseOutput);
writeAsync(document, properties, output);
Transformer transformer = TransformerFactory
.newInstance().newTransformer();
@@ -135,20 +134,6 @@ class TikaConfigAsyncWriter {
appendTextElement(document, pipesIterator, "hasHeader", "false");
}
- private void writeEmitters(Document document, Element properties, Path
baseOutput) {
- Element emitters = findChild("emitters", properties);
- if (emitters != null) {
- LOG.info("emitters already exist in tika-config. Not overwriting
with commandline");
- return;
- }
-
- emitters = createAndGetElement(document, properties, "emitters");
- Element emitter = createAndGetElement( document, emitters, "emitter",
- "class", "org.apache.tika.pipes.emitter.fs.FileSystemEmitter");
- appendTextElement(document, emitter, "name", EMITTER_NAME);
- appendTextElement(document, emitter, "basePath",
baseOutput.toAbsolutePath().toString());
- }
-
private void writeAsync(Document document, Element properties, Path
thisTikaConfig) {
Element async = findChild("async", properties);
if (async != null) {
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 771c02e87..49e85ba64 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -52,12 +52,19 @@ public class AsyncProcessorTest extends TikaTest {
final static String JSON_TEMPLATE_TEST = """
{
- "pipesPluginsConfig" : {
+ "plugins" : {
"fetchers": {
"file-system-fetcher": {
- "basePath": "BASE_PATH",
+ "basePath": "FETCHER_BASE_PATH",
"extractFileSystemMetadata": false
}
+ },
+ "emitters": {
+ "file-system-emitter": {
+ "basePath": "EMITTER_BASE_PATH",
+ "fileExtension": "",
+ "onExists":"EXCEPTION"
+ }
}
}
}
@@ -68,44 +75,32 @@ public class AsyncProcessorTest extends TikaTest {
private Path basedir;
private Path inputDir;
- private Path bytesDir;
-
- private Path jsonDir;
+ private Path outputDir;
private Path configDir;
+ private Path tikaConfigPath;
+
@BeforeEach
public void setUp() throws IOException {
inputDir = basedir.resolve("input");
- bytesDir = basedir.resolve("bytes");
-
- jsonDir = basedir.resolve("json");
+ outputDir = basedir.resolve("output");
configDir = basedir.resolve("config");
- Path tikaConfig = configDir.resolve("tika-config.xml");
Files.createDirectories(basedir);
Files.createDirectories(configDir);
Files.createDirectories(inputDir);
- String xml =
IOUtils.toString(AsyncProcessorTest.class.getResourceAsStream("/configs/TIKA-4207-emitter.xml"),
StandardCharsets.UTF_8);
- //do stuff to xml
- xml = xml.replace("BASE_PATH", inputDir
- .toAbsolutePath()
- .toString());
- xml = xml.replace("JSON_PATH", jsonDir
- .toAbsolutePath()
- .toString());
- xml = xml.replace("BYTES_PATH", bytesDir
- .toAbsolutePath()
- .toString());
-
- Files.writeString(tikaConfig, xml, StandardCharsets.UTF_8);
-
+ tikaConfigPath = configDir.resolve("tika-config.xml");
+
Files.copy(AsyncProcessorTest.class.getResourceAsStream("/configs/tika-config-default.xml"),
tikaConfigPath);
Path pipesConfig = configDir.resolve("tika-pipes.json");
String jsonTemp = JSON_TEMPLATE_TEST
- .replace("BASE_PATH", inputDir.toAbsolutePath().toString());
+ .replace("FETCHER_BASE_PATH",
inputDir.toAbsolutePath().toString())
+ .replace("EMITTER_BASE_PATH",
outputDir.toAbsolutePath().toString());
+
+
Files.writeString(pipesConfig, jsonTemp, StandardCharsets.UTF_8);
Path mock = inputDir.resolve("mock.xml");
@@ -118,11 +113,11 @@ public class AsyncProcessorTest extends TikaTest {
public void testBasic() throws Exception {
// TikaAsyncCLI cli = new TikaAsyncCLI();
// cli.main(new String[]{
configDir.resolve("tika-config.xml").toAbsolutePath().toString()});
- AsyncProcessor processor = new
AsyncProcessor(configDir.resolve("tika-config.xml"),
configDir.resolve("tika-pipes.json"));
+ AsyncProcessor processor = new AsyncProcessor(tikaConfigPath,
configDir.resolve("tika-pipes.json"));
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new
EmbeddedDocumentBytesConfig(true);
embeddedDocumentBytesConfig.setIncludeOriginal(true);
- embeddedDocumentBytesConfig.setEmitter("bytes");
+ embeddedDocumentBytesConfig.setEmitter("file-system-emitter");
embeddedDocumentBytesConfig.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.NONE);
embeddedDocumentBytesConfig.setEmbeddedIdPrefix("-");
ParseContext parseContext = new ParseContext();
@@ -130,7 +125,7 @@ public class AsyncProcessorTest extends TikaTest {
parseContext.set(EmbeddedDocumentBytesConfig.class,
embeddedDocumentBytesConfig);
FetchEmitTuple t =
new FetchEmitTuple("myId-1", new
FetchKey("file-system-fetcher", "mock.xml"),
- new EmitKey("json", "emit-1"), new Metadata(),
parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
+ new EmitKey("file-system-emitter", "emit-1"), new
Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
processor.offer(t, 1000);
@@ -143,15 +138,15 @@ public class AsyncProcessorTest extends TikaTest {
}
processor.close();
- String container =
Files.readString(bytesDir.resolve("emit-1-embed/emit-1-0"));
+ String container =
Files.readString(outputDir.resolve("emit-1-embed/emit-1-0"));
assertContains("\"dc:creator\">Nikolai Lobachevsky", container);
- String xmlEmbedded =
Files.readString(bytesDir.resolve("emit-1-embed/emit-1-1"));
+ String xmlEmbedded =
Files.readString(outputDir.resolve("emit-1-embed/emit-1-1"));
assertContains("name=\"dc:creator\"", xmlEmbedded);
assertContains(">embeddedAuthor</metadata>", xmlEmbedded);
List<Metadata> metadataList;
- try (BufferedReader reader =
Files.newBufferedReader(jsonDir.resolve("emit-1.json"))) {
+ try (BufferedReader reader =
Files.newBufferedReader(outputDir.resolve("emit-1"))) {
metadataList = JsonMetadataList.fromJson(reader);
}
assertEquals(2, metadataList.size());
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
index 3dd105926..44608e1ee 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
@@ -49,7 +49,7 @@ public class TikaConfigAsyncWriterTest {
TikaConfigAsyncWriter writer = new
TikaConfigAsyncWriter(simpleAsyncConfig);
writer.write(target);
- Set<String> expected = Set.of("service-loader", "parsers",
"pipesIterator", "emitters", "async");
+ Set<String> expected = Set.of("service-loader", "parsers",
"pipesIterator", "async");
Set<String> properties = loadProperties(target);
assertEquals(expected, properties);
}
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml
deleted file mode 100644
index 1f5229480..000000000
--- a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <name>json</name>
- <basePath>JSON_PATH</basePath>
- </emitter>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <name>bytes</name>
- <basePath>BYTES_PATH</basePath>
- </emitter>
- </emitters>
-</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-default.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-default.xml
new file mode 100644
index 000000000..008a36dfd
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-default.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-emitters/tika-emitter-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index cb03a1e26..87aed84d2 100644
--- a/tika-pipes/tika-emitters/tika-emitter-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -29,6 +29,8 @@ import java.util.List;
import java.util.Optional;
import org.pf4j.Extension;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
@@ -52,6 +54,9 @@ import org.apache.tika.utils.StringUtils;
@Extension
public class FileSystemEmitter extends AbstractStreamEmitter {
+ private static final Logger LOG =
LoggerFactory.getLogger(FileSystemEmitter.class);
+
+
private FileSystemEmitterConfig fileSystemEmitterConfig;
public FileSystemEmitter() throws IOException {
@@ -62,12 +67,18 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
public void configure(PluginConfig pluginConfig) throws
TikaConfigException, IOException {
checkPluginId(pluginConfig.pluginId());
fileSystemEmitterConfig =
FileSystemEmitterConfig.load(pluginConfig.jsonConfig());
- //checkConfig(fileSystemEmitterConfig);
+ checkConfig(fileSystemEmitterConfig);
+ }
+
+ private void checkConfig(FileSystemEmitterConfig fileSystemEmitterConfig) {
+ if (fileSystemEmitterConfig.onExists() == null) {
+ throw new IllegalArgumentException("Must configure 'onExists' as
'skip', 'exception' or 'replace'");
+ }
}
@Override
public void emit(String emitKey, List<Metadata> metadataList, ParseContext
parseContext) throws IOException {
-
+ LOG.warn("about to emit: {}", emitKey);
if (metadataList == null || metadataList.isEmpty()) {
throw new IOException("metadata list must not be null or of size
0");
}
@@ -100,6 +111,8 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
@Override
public void emit(String emitKey, InputStream inputStream, Metadata
userMetadata, ParseContext parseContext) throws IOException {
+ LOG.warn("about to stream emit: {}", emitKey);
+
FileSystemEmitterConfig config = getConfig(parseContext);
Path output;
@@ -114,18 +127,25 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
}
if (!Files.isDirectory(output.getParent())) {
+ LOG.warn("creating parent directory: {}", output);
Files.createDirectories(output.getParent());
}
+ LOG.warn("on exists: {}", config.onExists());
if (config.onExists() == ON_EXISTS.REPLACE) {
+ LOG.warn("copying {}", output);
Files.copy(inputStream, output,
StandardCopyOption.REPLACE_EXISTING);
} else if (config.onExists() == ON_EXISTS.EXCEPTION) {
+ LOG.warn("copying 2 {}", output);
Files.copy(inputStream, output);
} else if (config.onExists() == ON_EXISTS.SKIP) {
if (!Files.isRegularFile(output)) {
try {
+ LOG.warn("copying 3 {}", output);
+
Files.copy(inputStream, output);
} catch (FileAlreadyExistsException e) {
//swallow
+ LOG.warn("file exists");
}
}
}
@@ -138,6 +158,7 @@ public class FileSystemEmitter extends AbstractStreamEmitter {
Optional<PluginConfig> pluginConfigOpt =
pluginConfigs.get(getPluginId());
if (pluginConfigOpt.isPresent()) {
config =
FileSystemEmitterConfig.load(pluginConfigOpt.get().jsonConfig());
+ checkConfig(config);
}
}
return config;
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncConfig.java
index b0808f7fa..2bde10515 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncConfig.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncConfig.java
@@ -39,8 +39,10 @@ public class AsyncConfig extends PipesConfigBase {
public static AsyncConfig load(Path tikaConfig, Path pipesPluginsConfig)
throws IOException, TikaConfigException {
AsyncConfig asyncConfig = new AsyncConfig();
- try (InputStream is = Files.newInputStream(tikaConfig)) {
- asyncConfig.configure("async", is);
+ if (tikaConfig != null) {
+ try (InputStream is = Files.newInputStream(tikaConfig)) {
+ asyncConfig.configure("async", is);
+ }
}
if (asyncConfig.getTikaConfig() == null) {
asyncConfig.setTikaConfig(tikaConfig);
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncProcessor.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncProcessor.java
index 2c15d05ef..6abce6093 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncProcessor.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/async/AsyncProcessor.java
@@ -83,7 +83,7 @@ public class AsyncProcessor implements Closeable {
this.executorCompletionService =
new ExecutorCompletionService<>(executorService);
try {
- if
(!tikaConfigPath.toAbsolutePath().equals(asyncConfig.getTikaConfig().toAbsolutePath()))
{
+ if (asyncConfig.getTikaConfig() != null &&
!tikaConfigPath.toAbsolutePath().equals(asyncConfig.getTikaConfig().toAbsolutePath()))
{
LOG.warn("TikaConfig for AsyncProcessor ({}) is different " +
"from TikaConfig for workers ({}). If this is
intended," +
" please ignore this warning.",
tikaConfigPath.toAbsolutePath(),