This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4b66205620 TIKA-4736 -- image extraction fails (#2828)
4b66205620 is described below
commit 4b66205620250bb59602dca11b0f44374152f0f2
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 21 21:35:07 2026 -0400
TIKA-4736 -- image extraction fails (#2828)
---
docs/modules/ROOT/pages/using-tika/cli/index.adoc | 3 +++
.../main/java/org/apache/tika/cli/AsyncHelper.java | 7 +++++
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 ++
.../java/org/apache/tika/cli/AsyncHelperTest.java | 18 +++++++++++++
.../org/apache/tika/async/cli/PluginsWriter.java | 10 +++++++
.../apache/tika/async/cli/SimpleAsyncConfig.java | 13 +++++++++
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 16 ++++++++++-
.../apache/tika/async/cli/AsyncCliParserTest.java | 25 +++++++++++++++++
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 18 ++++++++++++-
.../fs/FileSystemEmitterRuntimeConfigTest.java | 31 ++++++++++++++++++++++
10 files changed, 141 insertions(+), 2 deletions(-)
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index f5d15608cf..c9f9da8f03 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -200,6 +200,9 @@ to the output directory.
|`--content-only`
|Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate`
+|`--on-exists`
+|Behavior when an output file already exists: `exception` (default), `replace` or `skip`
+
|`-T` or `--timeoutMs`
|Timeout for each parse in milliseconds
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index 2320fa7df5..df99107871 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -44,6 +44,13 @@ public class AsyncHelper {
}
argList.add("-o");
argList.add(dir);
+ } else if ("--extract".equals(arg)) {
+ // tika-app documents --extract as the long form of -z. TikaAsyncCLI
+ // only knows -z/--unzipShallow (and -Z/--unzipRecursive), so without
+ // this translation --extract falls through as an unrecognized arg and
+ // trips the "unknown args" / "set inputDir once" errors (TIKA-4736).
+ // -z passes through untranslated and is already recognized.
+ argList.add("-z");
} else if ("-a".equals(arg)) {
//do nothing
} else if (arg.startsWith(UNPACK_FORMAT_KEY)) {
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index aeea5f9eda..d4a5628489 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -801,6 +801,8 @@ public class TikaCLI {
out.println(" -pX or --password=X Use document password X");
out.println(" -z or --extract Extract all attachements into
current directory");
out.println(" --extract-dir=<dir> Specify target directory for
-z");
+ out.println(" --on-exists=<mode> When an output file already
exists: exception");
+ out.println(" (default), replace or skip");
out.println(" --maxEmbeddedDepth=X Maximum depth for embedded
document extraction");
out.println(" --maxEmbeddedCount=X Maximum number of embedded
documents to extract");
out.println(" -r or --pretty-print For JSON, XML and XHTML
outputs, adds newlines and");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index f1a3b79864..bb668b7660 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -57,6 +57,24 @@ public class AsyncHelperTest {
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
+ @Test
+ public void testExtractLongFormTranslatedToZ() throws Exception {
+ // TIKA-4736: tika-app's --extract is the long form of -z. It must be
+ // translated to -z (which TikaAsyncCLI recognizes); otherwise it falls
+ // through as an unknown arg and the batch parse fails.
+ String[] args = new String[]{"--extract", "--extract-dir=ImageFiles",
"input.pdf"};
+ String[] expected = new String[]{"-z", "-o", "ImageFiles",
"input.pdf"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
+ @Test
+ public void testShortFormZUnchanged() throws Exception {
+ // -z is already recognized by TikaAsyncCLI and must pass through untranslated.
+ String[] args = new String[]{"-z", "--extract-dir=ImageFiles",
"input.pdf"};
+ String[] expected = new String[]{"-z", "-o", "ImageFiles",
"input.pdf"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
@Test
public void testJsonRecursiveSkipped() throws Exception {
// -J is the default in async mode, so it's just skipped
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
index ef04527d95..dbc3de3935 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/PluginsWriter.java
@@ -151,6 +151,16 @@ public class PluginsWriter {
}
}
+ // Override the emitter's onExists policy if set on the CLI (--on-exists)
+ if (!StringUtils.isBlank(simpleAsyncConfig.getOnExists())
+ && emitters != null && emitters.has("fse")) {
+ ObjectNode fse = (ObjectNode) emitters.get("fse");
+ if (fse != null && fse.has("file-system-emitter")) {
+ ObjectNode fsEmitter = (ObjectNode)
fse.get("file-system-emitter");
+ fsEmitter.put("onExists", simpleAsyncConfig.getOnExists());
+ }
+ }
+
// Write timeout limits to parse-context if configured on CLI
if (simpleAsyncConfig.getTimeoutMs() != null) {
ObjectNode parseContext = root.has("parse-context")
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 5ea5e764ba..f10788d89a 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -46,6 +46,10 @@ class SimpleAsyncConfig {
private final String unpackMode; // "ZIPPED" or "DIRECTORY"
private final boolean unpackIncludeMetadata;
+ // Emitter behavior when an output file already exists: "EXCEPTION", "REPLACE" or "SKIP".
+ // null leaves the emitter/config default (EXCEPTION) in place.
+ private String onExists;
+
//TODO -- switch to a builder
public SimpleAsyncConfig(String inputDir, String outputDir, Integer
numClients, Long timeoutMs, String xmx, String fileList,
String tikaConfig,
BasicContentHandlerFactory.HANDLER_TYPE handlerType,
@@ -136,6 +140,14 @@ class SimpleAsyncConfig {
return unpackIncludeMetadata;
}
+ public String getOnExists() {
+ return onExists;
+ }
+
+ public void setOnExists(String onExists) {
+ this.onExists = onExists;
+ }
+
@Override
public String toString() {
return "SimpleAsyncConfig{" +
@@ -154,6 +166,7 @@ class SimpleAsyncConfig {
", unpackFormat='" + unpackFormat + '\'' +
", unpackMode='" + unpackMode + '\'' +
", unpackIncludeMetadata=" + unpackIncludeMetadata +
+ ", onExists='" + onExists + '\'' +
'}';
}
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 3f02173a5e..845b5b1940 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -82,6 +82,8 @@ public class TikaAsyncCLI {
"output mode for unpacking: ZIPPED (default) or DIRECTORY");
options.addOption(null, "unpack-include-metadata", false,
"include metadata.json in Frictionless output");
+ options.addOption(null, "on-exists", true,
+ "behavior when an output file already exists: exception
(default), replace or skip");
return options;
}
@@ -235,6 +237,16 @@ public class TikaAsyncCLI {
unpackIncludeMetadata = true;
}
+ String onExists = null;
+ if (line.hasOption("on-exists")) {
+ String v =
line.getOptionValue("on-exists").toUpperCase(java.util.Locale.ROOT);
+ if (!v.equals("EXCEPTION") && !v.equals("REPLACE") &&
!v.equals("SKIP")) {
+ throw new TikaConfigException("Can't understand --on-exists=" +
+ line.getOptionValue("on-exists") + "; must be one of:
exception, replace, skip");
+ }
+ onExists = v;
+ }
+
if (line.getArgList().size() > 2) {
throw new TikaConfigException("Can't have more than 2 unknown
args: " + line.getArgList());
}
@@ -282,10 +294,12 @@ public class TikaAsyncCLI {
outputDir = Paths.get("output").toAbsolutePath().toString();
}
- return new SimpleAsyncConfig(inputDir, outputDir,
+ SimpleAsyncConfig config = new SimpleAsyncConfig(inputDir, outputDir,
numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType,
extractBytesMode, pluginsDir, concatenate, contentOnly,
unpackFormat, unpackMode, unpackIncludeMetadata);
+ config.setOnExists(onExists);
+ return config;
}
private static BasicContentHandlerFactory.HANDLER_TYPE
getHandlerType(String t) throws TikaConfigException {
diff --git
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index 8795549aab..b8960b7c08 100644
---
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.file.Files;
@@ -30,6 +31,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.sax.BasicContentHandlerFactory;
public class AsyncCliParserTest {
@@ -91,6 +93,29 @@ public class AsyncCliParserTest {
assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.XML,
simpleAsyncConfig.getHandlerType());
}
+ @Test
+ public void testOnExists() throws Exception {
+ // TIKA-4736: --on-exists is normalized to upper case and carried on the config.
+ SimpleAsyncConfig replace = TikaAsyncCLI.parseCommandLine(
+ new String[]{"-i", "input", "-o", "output", "--on-exists",
"replace"});
+ assertEquals("REPLACE", replace.getOnExists());
+
+ SimpleAsyncConfig skip = TikaAsyncCLI.parseCommandLine(
+ new String[]{"-i", "input", "-o", "output", "--on-exists",
"skip"});
+ assertEquals("SKIP", skip.getOnExists());
+
+ // Default (unset) leaves the emitter/config default (EXCEPTION) in place.
+ SimpleAsyncConfig dflt = TikaAsyncCLI.parseCommandLine(
+ new String[]{"-i", "input", "-o", "output"});
+ assertNull(dflt.getOnExists());
+ }
+
+ @Test
+ public void testOnExistsInvalid() {
+ assertThrows(TikaConfigException.class, () ->
TikaAsyncCLI.parseCommandLine(
+ new String[]{"-i", "input", "-o", "output", "--on-exists",
"bogus"}));
+ }
+
@Test
public void testFileListWithInputDir(@TempDir Path tmp) throws Exception {
Path fileList = tmp.resolve("files.txt");
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index e990652b30..564beff9af 100644
---
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -122,6 +122,8 @@ public class FileSystemEmitter extends
AbstractStreamEmitter {
try (Writer writer = Files.newBufferedWriter(output,
StandardCharsets.UTF_8,
StandardOpenOption.CREATE_NEW)) { //CREATE_NEW forces an
IOException if the file already exists
JsonMetadataList.toJson(metadataList, writer,
config.prettyPrint());
+ } catch (FileAlreadyExistsException e) {
+ throw alreadyExistsException(output);
}
} else {
try (Writer writer = Files.newBufferedWriter(output,
StandardCharsets.UTF_8)) {
@@ -157,7 +159,11 @@ public class FileSystemEmitter extends
AbstractStreamEmitter {
if (config.onExists() == FileSystemEmitterConfig.ON_EXISTS.REPLACE) {
Files.copy(inputStream, output,
StandardCopyOption.REPLACE_EXISTING);
} else if (config.onExists() ==
FileSystemEmitterConfig.ON_EXISTS.EXCEPTION) {
- Files.copy(inputStream, output);
+ try {
+ Files.copy(inputStream, output);
+ } catch (FileAlreadyExistsException e) {
+ throw alreadyExistsException(output);
+ }
} else if (config.onExists() ==
FileSystemEmitterConfig.ON_EXISTS.SKIP) {
if (!Files.isRegularFile(output)) {
try {
@@ -169,6 +175,16 @@ public class FileSystemEmitter extends
AbstractStreamEmitter {
}
}
+ /**
+ * Actionable error for the {@code onExists=EXCEPTION} case; the bare
+ * {@link FileAlreadyExistsException} reports only the path (TIKA-4736).
+ */
+ private static IOException alreadyExistsException(Path output) {
+ return new IOException("Output already exists (onExists=EXCEPTION, not
overwritten): "
+ + output.toAbsolutePath()
+ + ". Use an empty output dir, delete the file, or set onExists
to REPLACE or SKIP.");
+ }
+
private FileSystemEmitterConfig getConfig(ParseContext parseContext)
throws TikaConfigException, IOException {
FileSystemEmitterConfig config = fileSystemEmitterConfig;
String configKey = getExtensionConfig().id();
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
index c13bef3004..f3abd3dd67 100644
---
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitterRuntimeConfigTest.java
@@ -229,4 +229,35 @@ public class FileSystemEmitterRuntimeConfigTest {
assertFalse(Files.exists(otherDir.resolve("test.json")),
"File should not be created in other directory");
}
+
+ @Test
+ public void testExceptionMessageWhenOutputExists(@TempDir Path tempDir)
throws Exception {
+ // TIKA-4736: the onExists=EXCEPTION failure should carry an actionable message,
+ // not just the bare path from FileAlreadyExistsException.
+ String config = String.format(Locale.ROOT,
+ "{\"basePath\":\"%s\", \"onExists\":\"EXCEPTION\"}",
+ tempDir.toString().replace("\\", "\\\\"));
+ FileSystemEmitter emitter = FileSystemEmitter.build(
+ new ExtensionConfig("test-emitter", "test", config));
+ ParseContext context = new ParseContext();
+
+ // Bytes path (the --extract / image-extraction scenario)
+ emitter.emit("img", new
ByteArrayInputStream("a".getBytes(StandardCharsets.UTF_8)),
+ new Metadata(), context);
+ IOException bytesEx = assertThrows(IOException.class, () ->
+ emitter.emit("img", new
ByteArrayInputStream("b".getBytes(StandardCharsets.UTF_8)),
+ new Metadata(), context));
+ assertTrue(bytesEx.getMessage().contains("onExists=EXCEPTION"),
bytesEx.getMessage());
+ assertTrue(bytesEx.getMessage().contains("REPLACE or SKIP"),
bytesEx.getMessage());
+
+ // Metadata-JSON path
+ List<Metadata> metadataList = new ArrayList<>();
+ Metadata m = new Metadata();
+ m.set(TikaCoreProperties.TIKA_CONTENT, "x");
+ metadataList.add(m);
+ emitter.emit("meta.json", metadataList, context);
+ IOException metaEx = assertThrows(IOException.class, () ->
+ emitter.emit("meta.json", metadataList, context));
+ assertTrue(metaEx.getMessage().contains("onExists=EXCEPTION"),
metaEx.getMessage());
+ }
}