This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4515 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 38f3fe52124227e20a7d49d5c98576f68ba16608 Author: tallison <[email protected]> AuthorDate: Fri Oct 10 17:40:43 2025 -0400 TIKA-4515 -- add fully recursive extraction --- .../apache/tika/async/cli/SimpleAsyncConfig.java | 9 ++- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 39 ++++++++++--- .../tika/async/cli/TikaConfigAsyncWriter.java | 13 ++--- .../tika/async/cli/TikaConfigAsyncWriterTest.java | 4 +- .../AbstractEmbeddedDocumentBytesHandler.java | 65 +++++++++++++++++++--- .../extractor/EmbeddedDocumentBytesConfig.java | 57 +++++++++++++------ 6 files changed, 144 insertions(+), 43 deletions(-) diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java index 603f80e3d..0c3987165 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java @@ -25,9 +25,11 @@ class SimpleAsyncConfig { private String xmx; private String fileList; private String tikaConfig;//path to the tikaConfig file to be used in the forked process + private boolean extractBytes; //TODO -- switch to a builder - public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, String tikaConfig) { + public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, + String tikaConfig, boolean extractBytes) { this.inputDir = inputDir; this.outputDir = outputDir; this.numClients = numClients; @@ -35,6 +37,7 @@ class SimpleAsyncConfig { this.xmx = xmx; this.fileList = fileList; this.tikaConfig = tikaConfig; + this.extractBytes = extractBytes; } public String getInputDir() { @@ -64,4 +67,8 @@ class SimpleAsyncConfig { public String getTikaConfig() { return tikaConfig; } + + public boolean isExtractBytes() { + return extractBytes; + } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 2a87a4b1a..b4bbd5ed8 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -30,8 +30,10 @@ import org.apache.commons.cli.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.async.AsyncProcessor; +import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; public class TikaAsyncCLI { @@ -51,6 +53,7 @@ public class TikaAsyncCLI { options.addOption("l", "fileList", true, "file list"); options.addOption("c", "config", true, "tikaConfig to inherit from -- " + "commandline options will not overwrite existing iterators, emitters, fetchers and async"); + options.addOption("Z", "unzip", false, "extract raw bytes from attachments"); return options; } @@ -59,7 +62,7 @@ public class TikaAsyncCLI { if (args.length == 0) { usage(getOptions()); } else if (args.length == 1) { - processWithTikaConfig(Paths.get(args[0])); + processWithTikaConfig(Paths.get(args[0]), false); } else { processCommandLine(args); } @@ -74,7 +77,7 @@ public class TikaAsyncCLI { tikaConfig = Files.createTempFile("tika-async-tmp-", ".xml"); TikaConfigAsyncWriter tikaConfigAsyncWriter = new TikaConfigAsyncWriter(simpleAsyncConfig); tikaConfigAsyncWriter.write(tikaConfig); - processWithTikaConfig(tikaConfig); + processWithTikaConfig(tikaConfig, simpleAsyncConfig.isExtractBytes()); } finally { if (tikaConfig != null) { Files.delete(tikaConfig); @@ -85,7 +88,8 @@ public class TikaAsyncCLI { //not private for testing purposes static SimpleAsyncConfig parseCommandLine(String[] args) throws ParseException, IOException { if (args.length == 2 && ! args[0].startsWith("-")) { - return new SimpleAsyncConfig(args[0], args[1], null, null, null, null, null); + return new SimpleAsyncConfig(args[0], args[1], null, + null, null, null, null, false); } Options options = getOptions(); @@ -103,6 +107,7 @@ public class TikaAsyncCLI { Integer numClients = null; String fileList = null; String tikaConfig = null; + boolean extractBytes = false; if (line.hasOption("i")) { inputDir = line.getOptionValue("i"); } @@ -121,21 +126,25 @@ public class TikaAsyncCLI { if (line.hasOption("l")) { fileList = line.getOptionValue("l"); } - if (line.hasOption("c")) { tikaConfig = line.getOptionValue("c"); } + if (line.hasOption("Z")) { + extractBytes = true; + } + return new SimpleAsyncConfig(inputDir, outputDir, - numClients, timeoutMs, xmx, fileList, tikaConfig); + numClients, timeoutMs, xmx, fileList, tikaConfig, extractBytes); } - private static void processWithTikaConfig(Path tikaConfigPath) throws Exception { + private static void processWithTikaConfig(Path tikaConfigPath, boolean extractBytes) throws Exception { PipesIterator pipesIterator = PipesIterator.build(tikaConfigPath); long start = System.currentTimeMillis(); try (AsyncProcessor processor = new AsyncProcessor(tikaConfigPath, pipesIterator)) { for (FetchEmitTuple t : pipesIterator) { + configureExtractBytes(t, extractBytes); boolean offered = processor.offer(t, TIMEOUT_MS); if (!offered) { throw new TimeoutException("timed out waiting to add a fetch emit tuple"); @@ -155,12 +164,28 @@ public class TikaAsyncCLI { } } + private static void configureExtractBytes(FetchEmitTuple t, boolean extractBytes) { + if (! extractBytes) { + return; + } + ParseContext parseContext = t.getParseContext(); + EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(); + config.setExtractEmbeddedDocumentBytes(true); + config.setEmitter(TikaConfigAsyncWriter.EMITTER_NAME); + config.setIncludeOriginal(false); + config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.DETECTED); + config.setEmbeddedIdPrefix("-"); + config.setZeroPadName(8); + config.setKeyBaseStrategy(EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_AS_IS); + parseContext.set(EmbeddedDocumentBytesConfig.class, config); + } + private static void usage(Options options) throws IOException { System.out.println("Two primary options:"); System.out.println("\t1. Specify a tika-config.xml on the commandline that includes the definitions for async"); System.out.println("\t2. Commandline:"); org.apache.commons.cli.help.HelpFormatter helpFormatter = org.apache.commons.cli.help.HelpFormatter.builder().get(); - helpFormatter.printHelp("tikaAsynCli", null, options, null, true); + helpFormatter.printHelp("tikaAsyncCli", null, options, null, true); System.exit(1); } } diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java index 5ff8f5d46..d27329731 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java @@ -50,7 +50,7 @@ class TikaConfigAsyncWriter { private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class); private static final String FETCHER_NAME = "fsf"; - private static final String EMITTER_NAME = "fse"; + static final String EMITTER_NAME = "fse"; private final SimpleAsyncConfig simpleAsyncConfig; @@ -85,7 +85,7 @@ class TikaConfigAsyncWriter { writePipesIterator(document, properties); writeFetchers(document, properties); writeEmitters(document, properties); - writeAsync(document, properties); + writeAsync(document, properties, output); Transformer transformer = TransformerFactory .newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); @@ -166,7 +166,7 @@ class TikaConfigAsyncWriter { } } - private void writeAsync(Document document, Element properties) { + private void writeAsync(Document document, Element properties, Path thisTikaConfig) { Element async = findChild("async", properties); if (async != null) { LOG.info("async already exists in tika-config. Not overwriting with commandline"); @@ -190,10 +190,9 @@ class TikaConfigAsyncWriter { if (simpleAsyncConfig.getTimeoutMs() != null) { appendTextElement(document, async, "timeoutMillis", Long.toString(simpleAsyncConfig.getTimeoutMs())); } - if (simpleAsyncConfig.getTikaConfig() != null) { - Path p = Paths.get(simpleAsyncConfig.getTikaConfig()); - appendTextElement(document, async, "tikaConfig", p.toAbsolutePath().toString()); - } + appendTextElement(document, async, "tikaConfig", thisTikaConfig.toAbsolutePath().toString()); + + appendTextElement(document, async, "maxForEmitBatchBytes", "0"); } private static void appendTextElement(Document document, Element parent, String itemName, String text, String... attrs) { diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java index bd5457ee4..adafdafd6 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java @@ -42,7 +42,7 @@ public class TikaConfigAsyncWriterTest { public void testBasic(@TempDir Path dir) throws Exception { Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-parsers.xml").toURI()); SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4, - 10000L, "-Xmx1g", null, p.toAbsolutePath().toString()); + 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false); Path target = dir.resolve("combined.xml"); TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig); writer.write(target); @@ -56,7 +56,7 @@ public class TikaConfigAsyncWriterTest { public void testDontOverwriteEmitters(@TempDir Path dir) throws Exception { Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-emitters.xml").toURI()); SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4, - 10000L, "-Xmx1g", null, p.toAbsolutePath().toString()); + 10000L, "-Xmx1g", null, p.toAbsolutePath().toString(), false); Path target = dir.resolve("combined.xml"); TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig); writer.write(target); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java index 3348eb720..80ff66984 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java @@ -18,19 +18,26 @@ package org.apache.tika.pipes.core.extractor; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Locale; +import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; import org.apache.tika.io.FilenameUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.utils.StringUtils; public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler { + private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository(); + List<Integer> ids = new ArrayList<>(); public String getEmitKey(String containerEmitKey, int embeddedId, @@ -43,8 +50,24 @@ public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDo StringBuilder emitKey = new StringBuilder(); - if (StringUtils.isBlank(embeddedDocumentBytesConfig.getEmitKeyBase())) { + if (embeddedDocumentBytesConfig.getKeyBaseStrategy() == + EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_AS_IS) { + emitKey.append(containerEmitKey); + emitKey.append("-embed"); + emitKey.append("/"); + emitKey.append(embeddedIdString).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()); + Path p = Paths.get(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + String fName = p.getFileName().toString(); + emitKey.append(fName); + if (! fName.contains(".")) { + appendSuffix(emitKey, metadata, embeddedDocumentBytesConfig); + } + + return emitKey.toString(); + } else if (embeddedDocumentBytesConfig.getKeyBaseStrategy() == + EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_NUMBERED) { emitKey.append(containerEmitKey); + emitKey.append("-embed"); emitKey.append("/") .append(FilenameUtils.getName(containerEmitKey)); } else { @@ -55,14 +78,7 @@ public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDo //the file extension emitKey.append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()) .append(embeddedIdString); - - if (embeddedDocumentBytesConfig.getSuffixStrategy().equals( - EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) { - String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - String suffix = FilenameUtils.getSuffixFromPath(fName); - suffix = suffix.toLowerCase(Locale.US); - emitKey.append(suffix); - } + appendSuffix(emitKey, metadata, embeddedDocumentBytesConfig); return emitKey.toString(); } @@ -75,4 +91,35 @@ public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDo public List<Integer> getIds() { return ids; } + + private void appendSuffix(StringBuilder emitKey, Metadata metadata, EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig) { + if (embeddedDocumentBytesConfig.getSuffixStrategy().equals( + EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) { + String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + String suffix = FilenameUtils.getSuffixFromPath(fName); + suffix = suffix.toLowerCase(Locale.US); + emitKey.append(suffix); + } else if (embeddedDocumentBytesConfig.getSuffixStrategy() + .equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.DETECTED)) { + emitKey.append(getExtension(metadata)); + } + } + + private String getExtension(Metadata metadata) { + String mime = metadata.get(Metadata.CONTENT_TYPE); + try { + String ext = MIME_TYPES + .forName(mime) + .getExtension(); + if (ext == null) { + return ".bin"; + } else { + return ext; + } + } catch (MimeTypeException e) { + //swallow + } + return ".bin"; + + } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java index dca605da7..6a449b5bf 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java @@ -43,6 +43,23 @@ public class EmbeddedDocumentBytesConfig implements Serializable { throw new IllegalArgumentException("can't parse " + s); } } + + public enum KEY_BASE_STRATEGY { + CONTAINER_NAME_NUMBERED, + CONTAINER_NAME_AS_IS, + CUSTOM_BASE; + + public static KEY_BASE_STRATEGY parse(String s) { + if (s.equalsIgnoreCase(CONTAINER_NAME_NUMBERED.name())) { + return CONTAINER_NAME_NUMBERED; + } else if (s.equalsIgnoreCase(CONTAINER_NAME_AS_IS.name())) { + return CONTAINER_NAME_AS_IS; + } else if (s.equalsIgnoreCase(CUSTOM_BASE.name())) { + return CUSTOM_BASE; + } + throw new IllegalArgumentException("can't parse " + s); + } + } //for our current custom serialization, this can't be final. :( private boolean extractEmbeddedDocumentBytes; @@ -56,9 +73,10 @@ public class EmbeddedDocumentBytesConfig implements Serializable { private boolean includeOriginal = false; + private KEY_BASE_STRATEGY keyBaseStrategy = KEY_BASE_STRATEGY.CONTAINER_NAME_NUMBERED; //This should be set per file. This allows a custom //emit key base that bypasses the algorithmic generation of the emitKey - //from the primary json emitKey + //from the primary json emitKey when keyBase Strategy is CUSTOM_BASE private String emitKeyBase = ""; /** @@ -94,6 +112,10 @@ public class EmbeddedDocumentBytesConfig implements Serializable { return suffixStrategy; } + public KEY_BASE_STRATEGY getKeyBaseStrategy() { + return keyBaseStrategy; + } + public String getEmbeddedIdPrefix() { return embeddedIdPrefix; } @@ -118,6 +140,14 @@ public class EmbeddedDocumentBytesConfig implements Serializable { setSuffixStrategy(SUFFIX_STRATEGY.valueOf(suffixStrategy)); } + public void setKeyBaseStrategy(KEY_BASE_STRATEGY keyBaseStrategy) { + this.keyBaseStrategy = keyBaseStrategy; + } + + public void setKeyBaseStrategy(String keyBaseStrategy) { + setKeyBaseStrategy(KEY_BASE_STRATEGY.valueOf(keyBaseStrategy)); + } + public void setEmbeddedIdPrefix(String embeddedIdPrefix) { this.embeddedIdPrefix = embeddedIdPrefix; } @@ -140,28 +170,20 @@ public class EmbeddedDocumentBytesConfig implements Serializable { @Override public String toString() { - return "EmbeddedDocumentBytesConfig{" + "extractEmbeddedDocumentBytes=" + extractEmbeddedDocumentBytes + ", zeroPadName=" + - zeroPadName + ", suffixStrategy=" + - suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + '\'' + ", emitter='" + emitter + '\'' + - ", includeOriginal=" + includeOriginal + ", emitKeyBase='" + - emitKeyBase + '\'' + '}'; + return "EmbeddedDocumentBytesConfig{" + "extractEmbeddedDocumentBytes=" + extractEmbeddedDocumentBytes + ", zeroPadName=" + zeroPadName + ", suffixStrategy=" + + suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + '\'' + ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal + ", keyBaseStrategy=" + + keyBaseStrategy + ", emitKeyBase='" + emitKeyBase + '\'' + '}'; } @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { + public final boolean equals(Object o) { + if (!(o instanceof EmbeddedDocumentBytesConfig config)) { return false; } - EmbeddedDocumentBytesConfig that = (EmbeddedDocumentBytesConfig) o; - return extractEmbeddedDocumentBytes == that.extractEmbeddedDocumentBytes && zeroPadName == that.zeroPadName - && includeOriginal == that.includeOriginal && - suffixStrategy == that.suffixStrategy && Objects.equals(embeddedIdPrefix, that.embeddedIdPrefix) - && Objects.equals(emitter, that.emitter) && - Objects.equals(emitKeyBase, that.emitKeyBase); + return extractEmbeddedDocumentBytes == config.extractEmbeddedDocumentBytes && zeroPadName == config.zeroPadName && includeOriginal == config.includeOriginal && + suffixStrategy == config.suffixStrategy && Objects.equals(embeddedIdPrefix, config.embeddedIdPrefix) && Objects.equals(emitter, config.emitter) && + keyBaseStrategy == config.keyBaseStrategy && Objects.equals(emitKeyBase, config.emitKeyBase); } @Override @@ -172,6 +194,7 @@ public class EmbeddedDocumentBytesConfig implements Serializable { result = 31 * result + Objects.hashCode(embeddedIdPrefix); result = 31 * result + Objects.hashCode(emitter); result = 31 * result + Boolean.hashCode(includeOriginal); + result = 31 * result + Objects.hashCode(keyBaseStrategy); result = 31 * result + Objects.hashCode(emitKeyBase); return result; }
