This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4637 in repository https://gitbox.apache.org/repos/asf/tika.git
commit c1f1c9fdc6de72acdedce482ae828e9b98b2b30d Author: tallison <[email protected]> AuthorDate: Fri Jan 30 08:03:44 2026 -0500 TIKA-4637 -- UNPACK mode first steps --- .../EmbeddedDocumentByteStoreExtractorFactory.java | 5 +- .../apache/tika/async/cli/SimpleAsyncConfig.java | 20 +- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 32 +- .../apache/tika/async/cli/AsyncProcessorTest.java | 2 +- .../java/org/apache/tika/pipes/api/ParseMode.java | 22 +- ...ytesHandler.java => AbstractUnpackHandler.java} | 2 +- ...ytesHandler.java => EmittingUnpackHandler.java} | 9 +- .../core/extractor/TempFileUnpackHandler.java | 159 ++++++++++ .../tika/pipes/core/extractor/UnpackConfig.java | 131 +++------ ...{RUnpackExtractor.java => UnpackExtractor.java} | 23 +- ...torFactory.java => UnpackExtractorFactory.java} | 12 +- .../tika/pipes/core/extractor/UnpackSelector.java | 143 +++++++++ .../apache/tika/pipes/core/server/EmitHandler.java | 24 +- .../core/server/MetadataListAndEmbeddedBytes.java | 4 +- .../tika/pipes/core/server/ParseHandler.java | 4 +- .../apache/tika/pipes/core/server/PipesServer.java | 34 ++- .../apache/tika/pipes/core/server/PipesWorker.java | 247 ++++++++++++++-- ...igSelectorTest.java => UnpackSelectorTest.java} | 34 ++- .../pipes/core/DigestingOpenContainersTest.java | 4 +- .../org/apache/tika/pipes/core/UnpackModeTest.java | 327 +++++++++++++++++++++ .../test-documents/test-with-embedded-bytes.json | 200 ++++++------- .../tika/server/core/resource/AsyncResource.java | 3 +- .../apache/tika/server/standard/TikaPipesTest.java | 2 +- 23 files changed, 1146 insertions(+), 297 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java index f7237bd6ac..4da3edb2b9 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java +++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java @@ -22,9 +22,8 @@ package org.apache.tika.extractor; * {@link EmbeddedDocumentBytesHandler} in the * {@link org.apache.tika.parser.ParseContext} should extend this. * - * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} - * to use the {@link @RUnpackExtractor} if the user doesn't configure a custom - * EmbeddedDocumentExtractor. + * This is a shim interface to signal to PipesServer to use UnpackExtractor + * if the user doesn't configure a custom EmbeddedDocumentExtractor. * * TODO: Figure out how to simplify this and allow for emitting of the source document. */ diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java index 1382303037..f5233494a9 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java @@ -20,6 +20,12 @@ import org.apache.tika.sax.BasicContentHandlerFactory; class SimpleAsyncConfig { + enum ExtractBytesMode { + NONE, // no byte extraction + SHALLOW, // -z: depth=1, no throw on max depth + RECURSIVE // -Z: full recursion + } + private String inputDir; private String outputDir; private Integer numClients; @@ -27,13 +33,13 @@ class SimpleAsyncConfig { private String xmx; private String fileList; private String tikaConfig;//path to the tikaConfig file to be used in the forked process - private boolean extractBytes; + private ExtractBytesMode extractBytesMode; private final BasicContentHandlerFactory.HANDLER_TYPE handlerType; private final String pluginsDir; //TODO -- switch to a builder public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, - String tikaConfig, 
BasicContentHandlerFactory.HANDLER_TYPE handlerType, boolean extractBytes, - String pluginsDir) { + String tikaConfig, BasicContentHandlerFactory.HANDLER_TYPE handlerType, + ExtractBytesMode extractBytesMode, String pluginsDir) { this.inputDir = inputDir; this.outputDir = outputDir; this.numClients = numClients; @@ -42,7 +48,7 @@ class SimpleAsyncConfig { this.fileList = fileList; this.tikaConfig = tikaConfig; this.handlerType = handlerType; - this.extractBytes = extractBytes; + this.extractBytesMode = extractBytesMode; this.pluginsDir = pluginsDir; } @@ -74,8 +80,8 @@ class SimpleAsyncConfig { return tikaConfig; } - public boolean isExtractBytes() { - return extractBytes; + public ExtractBytesMode getExtractBytesMode() { + return extractBytesMode; } public BasicContentHandlerFactory.HANDLER_TYPE getHandlerType() { @@ -96,7 +102,7 @@ class SimpleAsyncConfig { ", xmx='" + xmx + '\'' + ", fileList='" + fileList + '\'' + ", tikaConfig='" + tikaConfig + '\'' + - ", extractBytes=" + extractBytes + + ", extractBytesMode=" + extractBytesMode + ", handlerType=" + handlerType + ", pluginsDir='" + pluginsDir + '\'' + '}'; diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java index 015917d51a..02a0966393 100644 --- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java +++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java @@ -33,10 +33,12 @@ import org.apache.commons.cli.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.config.loader.TikaJsonConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.emitter.EmitKey; 
import org.apache.tika.pipes.api.fetcher.FetchKey; import org.apache.tika.pipes.api.pipesiterator.PipesIterator; @@ -66,7 +68,8 @@ public class TikaAsyncCLI { options.addOption("p", "pluginsDir", true, "plugins directory"); //options.addOption("l", "fileList", true, "file list"); options.addOption("c", "config", true, "tikaConfig.json"); - options.addOption("Z", "unzip", false, "extract raw bytes from attachments"); + options.addOption("z", "unzipShallow", false, "extract raw bytes from direct attachments only (depth=1)"); + options.addOption("Z", "unzipRecursive", false, "extract raw bytes from all attachments recursively"); return options; } @@ -146,7 +149,8 @@ public class TikaAsyncCLI { if (args.length == 2 && ! args[0].startsWith("-")) { return new SimpleAsyncConfig(args[0], args[1], 1, 30000L, "-Xmx1g", null, null, - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, false, null); + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + SimpleAsyncConfig.ExtractBytesMode.NONE, null); } Options options = getOptions(); @@ -167,7 +171,7 @@ public class TikaAsyncCLI { String asyncConfig = null; String pluginsDir = null; BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT; - boolean extractBytes = false; + SimpleAsyncConfig.ExtractBytesMode extractBytesMode = SimpleAsyncConfig.ExtractBytesMode.NONE; if (line.hasOption("i")) { inputDir = line.getOptionValue("i"); } @@ -190,7 +194,9 @@ public class TikaAsyncCLI { tikaConfig = line.getOptionValue("c"); } if (line.hasOption("Z")) { - extractBytes = true; + extractBytesMode = SimpleAsyncConfig.ExtractBytesMode.RECURSIVE; + } else if (line.hasOption("z")) { + extractBytesMode = SimpleAsyncConfig.ExtractBytesMode.SHALLOW; } if (line.hasOption('h')) { handlerType = getHandlerType(line.getOptionValue('h')); @@ -242,7 +248,7 @@ public class TikaAsyncCLI { return new SimpleAsyncConfig(inputDir, outputDir, numClients, timeoutMs, xmx, fileList, tikaConfig, handlerType, - extractBytes, 
pluginsDir); + extractBytesMode, pluginsDir); } private static BasicContentHandlerFactory.HANDLER_TYPE getHandlerType(String t) throws TikaConfigException { @@ -298,12 +304,24 @@ public class TikaAsyncCLI { if (asyncConfig == null) { return; } - if (!asyncConfig.isExtractBytes()) { + SimpleAsyncConfig.ExtractBytesMode mode = asyncConfig.getExtractBytesMode(); + if (mode == SimpleAsyncConfig.ExtractBytesMode.NONE) { return; } ParseContext parseContext = t.getParseContext(); + // Use the new UNPACK ParseMode for embedded byte extraction + parseContext.set(ParseMode.class, ParseMode.UNPACK); + + // For SHALLOW mode (-z), set depth limit to 1 (direct children only) + if (mode == SimpleAsyncConfig.ExtractBytesMode.SHALLOW) { + EmbeddedLimits limits = new EmbeddedLimits(); + limits.setMaxDepth(1); + limits.setThrowOnMaxDepth(false); + parseContext.set(EmbeddedLimits.class, limits); + } + // For RECURSIVE mode (-Z), use default unlimited depth + UnpackConfig config = new UnpackConfig(); - config.setExtractEmbeddedDocumentBytes(true); config.setEmitter(TikaConfigAsyncWriter.EMITTER_NAME); config.setIncludeOriginal(false); config.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED); diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java index 782ea015b7..c794594816 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java @@ -112,7 +112,7 @@ public class AsyncProcessorTest extends TikaTest { public void testRecursiveUnpacking() throws Exception { AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json")); - UnpackConfig embeddedDocumentBytesConfig = new UnpackConfig(true); + UnpackConfig embeddedDocumentBytesConfig = new UnpackConfig(); 
embeddedDocumentBytesConfig.setIncludeOriginal(true); embeddedDocumentBytesConfig.setEmitter("fse-bytes"); embeddedDocumentBytesConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.NONE); diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java index 4a6887ca2c..3513de3b55 100644 --- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java +++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java @@ -51,7 +51,25 @@ public enum ParseMode { * is returned. Use this mode when you only need file identification * (mime type, hash) without text extraction. */ - NO_PARSE; + NO_PARSE, + + /** + * Extracts embedded document bytes and emits them, with full RMETA metadata. + * <p> + * This mode parses like RMETA (returning a metadata object per document) AND + * automatically extracts and emits embedded document bytes. An emitter is + * required for the byte extraction. + * <p> + * With PASSBACK_ALL emit strategy, embedded bytes are still emitted during + * parsing, but metadata is passed back to the client instead of being emitted. + * This is useful when you want bytes written to storage but need metadata + * returned for further processing (e.g., indexing to a database). + * <p> + * This mode simplifies byte extraction by handling all the internal setup + * (UnpackExtractor, EmittingUnpackHandler) automatically. + * Users just need to specify the emitter in UnpackConfig or FetchEmitTuple. + */ + UNPACK; /** * Parses a string to a ParseMode enum value. @@ -70,7 +88,7 @@ public enum ParseMode { } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Invalid parse mode: '" + modeString + "'. 
" + - "Must be one of: RMETA, CONCATENATE, NO_PARSE"); + "Must be one of: RMETA, CONCATENATE, NO_PARSE, UNPACK"); } } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractUnpackHandler.java similarity index 97% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractUnpackHandler.java index 798b80f625..d0df8cb613 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractUnpackHandler.java @@ -28,7 +28,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; -public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler { +public abstract class AbstractUnpackHandler implements EmbeddedDocumentBytesHandler { List<Integer> ids = new ArrayList<>(); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingEmbeddedDocumentBytesHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingUnpackHandler.java similarity index 90% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingEmbeddedDocumentBytesHandler.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingUnpackHandler.java index b7e8fd4a69..65cef5fac1 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingEmbeddedDocumentBytesHandler.java +++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmittingUnpackHandler.java @@ -31,7 +31,7 @@ import org.apache.tika.pipes.api.emitter.StreamEmitter; import org.apache.tika.pipes.core.emitter.EmitterManager; import org.apache.tika.pipes.core.emitter.TikaEmitterException; -public class EmittingEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler { +public class EmittingUnpackHandler extends AbstractUnpackHandler { private final EmitKey containerEmitKey; private final UnpackConfig unpackConfig; private final StreamEmitter emitter; @@ -39,11 +39,12 @@ public class EmittingEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocume private static final Metadata METADATA = new Metadata(); private static final ParseContext PARSE_CONTEXT = new ParseContext(); - public EmittingEmbeddedDocumentBytesHandler(FetchEmitTuple fetchEmitTuple, - EmitterManager emitterManager) throws TikaException, IOException { + public EmittingUnpackHandler(FetchEmitTuple fetchEmitTuple, + EmitterManager emitterManager, + ParseContext parseContext) throws TikaException, IOException { this.containerEmitKey = fetchEmitTuple.getEmitKey(); - this.unpackConfig = fetchEmitTuple.getParseContext().get(UnpackConfig.class); + this.unpackConfig = parseContext.get(UnpackConfig.class); if (this.unpackConfig == null) { throw new TikaConfigException("UnpackConfig must not be null!"); } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java new file mode 100644 index 0000000000..6fbf96c484 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/TempFileUnpackHandler.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.core.extractor; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.FileUtils; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.pipes.api.emitter.EmitKey; + +/** + * An EmbeddedDocumentBytesHandler that writes embedded bytes to a temporary directory + * for later zipping. Files are stored with their emit key names (flattened, with path + * separators replaced). + */ +public class TempFileUnpackHandler extends AbstractUnpackHandler + implements Closeable { + + private final Path tempDirectory; + private final EmitKey containerEmitKey; + private final UnpackConfig unpackConfig; + private final List<EmbeddedFileInfo> embeddedFiles = new ArrayList<>(); + private Path originalDocumentPath; + private String originalDocumentName; + private boolean closed = false; + + /** + * Information about an embedded file stored in the temp directory. 
+ */ + public record EmbeddedFileInfo(int id, String fileName, Path filePath, Metadata metadata) { + } + + public TempFileUnpackHandler(EmitKey containerEmitKey, + UnpackConfig unpackConfig) throws IOException { + this.containerEmitKey = containerEmitKey; + this.unpackConfig = unpackConfig; + this.tempDirectory = Files.createTempDirectory("tika-unpack-"); + } + + @Override + public void add(int id, Metadata metadata, InputStream inputStream) throws IOException { + super.add(id, metadata, inputStream); + + // Generate the file name based on emit key logic + String emitKey = getEmitKey(containerEmitKey.getEmitKey(), id, unpackConfig, metadata); + + // Flatten the path for zip entry name - use just the filename portion + String fileName = flattenFileName(emitKey, id); + + // Write to temp file + Path tempFile = tempDirectory.resolve(fileName); + try (OutputStream os = Files.newOutputStream(tempFile)) { + inputStream.transferTo(os); + } + + embeddedFiles.add(new EmbeddedFileInfo(id, fileName, tempFile, metadata)); + } + + /** + * Flattens an emit key path to a simple filename suitable for a zip entry. + * Replaces path separators and uses the last component plus id for uniqueness. + */ + private String flattenFileName(String emitKey, int id) { + // Get the last path component + int lastSlash = Math.max(emitKey.lastIndexOf('/'), emitKey.lastIndexOf('\\')); + if (lastSlash >= 0 && lastSlash < emitKey.length() - 1) { + return emitKey.substring(lastSlash + 1); + } + return emitKey; + } + + /** + * Returns the temporary directory where embedded files are stored. + */ + public Path getTempDirectory() { + return tempDirectory; + } + + /** + * Returns information about all embedded files stored. + */ + public List<EmbeddedFileInfo> getEmbeddedFiles() { + return embeddedFiles; + } + + /** + * Returns true if there are any embedded files stored. 
+ */ + public boolean hasEmbeddedFiles() { + return !embeddedFiles.isEmpty(); + } + + /** + * Stores the original container document for inclusion in the zip. + * Call this before parsing if includeOriginal is enabled. + * + * @param inputStream the original document input stream + * @param fileName the file name for the original document + */ + public void storeOriginalDocument(InputStream inputStream, String fileName) throws IOException { + this.originalDocumentName = fileName; + this.originalDocumentPath = tempDirectory.resolve("_original_" + fileName); + try (OutputStream os = Files.newOutputStream(originalDocumentPath)) { + inputStream.transferTo(os); + } + } + + /** + * Returns the path to the original document if stored. + */ + public Path getOriginalDocumentPath() { + return originalDocumentPath; + } + + /** + * Returns the name of the original document if stored. + */ + public String getOriginalDocumentName() { + return originalDocumentName; + } + + /** + * Returns true if the original document was stored. 
+ */ + public boolean hasOriginalDocument() { + return originalDocumentPath != null && Files.exists(originalDocumentPath); + } + + @Override + public void close() throws IOException { + if (!closed) { + closed = true; + // Clean up temp directory - caller should have already zipped if needed + FileUtils.deleteDirectory(tempDirectory.toFile()); + } + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackConfig.java index dde5298c71..c747208b1a 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackConfig.java @@ -17,15 +17,11 @@ package org.apache.tika.pipes.core.extractor; import java.io.Serializable; -import java.util.HashSet; import java.util.Objects; -import java.util.Set; import org.apache.tika.config.TikaComponent; -import org.apache.tika.extractor.BasicEmbeddedBytesSelector; -import org.apache.tika.extractor.EmbeddedBytesSelector; -@TikaComponent(name = "unpack-config") +@TikaComponent public class UnpackConfig implements Serializable { /** @@ -33,9 +29,6 @@ public class UnpackConfig implements Serializable { */ private static final long serialVersionUID = -3861669115439125268L; - - public static UnpackConfig SKIP = new UnpackConfig(false); - public enum SUFFIX_STRATEGY { NONE, EXISTING, DETECTED; @@ -70,9 +63,6 @@ public class UnpackConfig implements Serializable { throw new IllegalArgumentException("can't parse " + s); } } - //for our current custom serialization, this can't be final. 
:( - private boolean extractEmbeddedDocumentBytes; - private int zeroPadName = 0; private SUFFIX_STRATEGY suffixStrategy = SUFFIX_STRATEGY.NONE; @@ -89,35 +79,14 @@ public class UnpackConfig implements Serializable { //from the primary json emitKey when keyBase Strategy is CUSTOM private String emitKeyBase = ""; - // Filter parameters for embedded bytes selection - private Set<String> includeMimeTypes = new HashSet<>(); - private Set<String> excludeMimeTypes = new HashSet<>(); - private Set<String> includeEmbeddedResourceTypes = new HashSet<>(); - private Set<String> excludeEmbeddedResourceTypes = new HashSet<>(); + // Zipping options + private boolean zipEmbeddedFiles = false; + private boolean includeMetadataInZip = false; /** - * Create an UnpackConfig with - * {@link UnpackConfig#extractEmbeddedDocumentBytes} - * set to <code>true</code> + * Create an UnpackConfig with default settings. */ public UnpackConfig() { - this.extractEmbeddedDocumentBytes = true; - } - - public UnpackConfig(boolean extractEmbeddedDocumentBytes) { - this.extractEmbeddedDocumentBytes = extractEmbeddedDocumentBytes; - } - - public static UnpackConfig getSKIP() { - return SKIP; - } - - public boolean isExtractEmbeddedDocumentBytes() { - return extractEmbeddedDocumentBytes; - } - - public void setExtractEmbeddedDocumentBytes(boolean extractEmbeddedDocumentBytes) { - this.extractEmbeddedDocumentBytes = extractEmbeddedDocumentBytes; } public int getZeroPadName() { @@ -184,60 +153,38 @@ public class UnpackConfig implements Serializable { return emitKeyBase; } - public Set<String> getIncludeMimeTypes() { - return includeMimeTypes; - } - - public void setIncludeMimeTypes(Set<String> includeMimeTypes) { - this.includeMimeTypes = new HashSet<>(includeMimeTypes); - } - - public Set<String> getExcludeMimeTypes() { - return excludeMimeTypes; - } - - public void setExcludeMimeTypes(Set<String> excludeMimeTypes) { - this.excludeMimeTypes = new HashSet<>(excludeMimeTypes); - } - - public Set<String> 
getIncludeEmbeddedResourceTypes() { - return includeEmbeddedResourceTypes; - } - - public void setIncludeEmbeddedResourceTypes(Set<String> includeEmbeddedResourceTypes) { - this.includeEmbeddedResourceTypes = new HashSet<>(includeEmbeddedResourceTypes); - } - - public Set<String> getExcludeEmbeddedResourceTypes() { - return excludeEmbeddedResourceTypes; + /** + * Whether to zip all embedded files into a single archive before emitting. + * When true, embedded files are collected during parsing and then zipped + * and emitted as a single archive after parsing completes. + */ + public boolean isZipEmbeddedFiles() { + return zipEmbeddedFiles; } - public void setExcludeEmbeddedResourceTypes(Set<String> excludeEmbeddedResourceTypes) { - this.excludeEmbeddedResourceTypes = new HashSet<>(excludeEmbeddedResourceTypes); + public void setZipEmbeddedFiles(boolean zipEmbeddedFiles) { + this.zipEmbeddedFiles = zipEmbeddedFiles; } /** - * Creates an EmbeddedBytesSelector based on the configured filter parameters. - * - * @return an EmbeddedBytesSelector that will filter embedded documents based on - * configured mime types and resource types + * Whether to include the metadata JSON for each embedded document in the zip file. + * Only applicable when {@link #isZipEmbeddedFiles()} is true. 
*/ - public EmbeddedBytesSelector createEmbeddedBytesSelector() { - if (includeMimeTypes.isEmpty() && excludeMimeTypes.isEmpty() - && includeEmbeddedResourceTypes.isEmpty() && excludeEmbeddedResourceTypes.isEmpty()) { - return EmbeddedBytesSelector.ACCEPT_ALL; - } - return new BasicEmbeddedBytesSelector(includeMimeTypes, excludeMimeTypes, - includeEmbeddedResourceTypes, excludeEmbeddedResourceTypes); + public boolean isIncludeMetadataInZip() { + return includeMetadataInZip; + } + + public void setIncludeMetadataInZip(boolean includeMetadataInZip) { + this.includeMetadataInZip = includeMetadataInZip; } @Override public String toString() { - return "UnpackConfig{" + "extractEmbeddedDocumentBytes=" + extractEmbeddedDocumentBytes + ", zeroPadName=" + zeroPadName + ", suffixStrategy=" + - suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + '\'' + ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal + ", keyBaseStrategy=" + - keyBaseStrategy + ", emitKeyBase='" + emitKeyBase + '\'' + - ", includeMimeTypes=" + includeMimeTypes + ", excludeMimeTypes=" + excludeMimeTypes + - ", includeEmbeddedResourceTypes=" + includeEmbeddedResourceTypes + ", excludeEmbeddedResourceTypes=" + excludeEmbeddedResourceTypes + '}'; + return "UnpackConfig{" + "zeroPadName=" + zeroPadName + ", suffixStrategy=" + + suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + '\'' + + ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal + + ", keyBaseStrategy=" + keyBaseStrategy + ", emitKeyBase='" + emitKeyBase + '\'' + + ", zipEmbeddedFiles=" + zipEmbeddedFiles + ", includeMetadataInZip=" + includeMetadataInZip + '}'; } @Override @@ -246,29 +193,27 @@ public class UnpackConfig implements Serializable { return false; } - return extractEmbeddedDocumentBytes == config.extractEmbeddedDocumentBytes && zeroPadName == config.zeroPadName && includeOriginal == config.includeOriginal && - suffixStrategy == config.suffixStrategy && 
Objects.equals(embeddedIdPrefix, config.embeddedIdPrefix) && Objects.equals(emitter, config.emitter) && - keyBaseStrategy == config.keyBaseStrategy && Objects.equals(emitKeyBase, config.emitKeyBase) && - Objects.equals(includeMimeTypes, config.includeMimeTypes) && - Objects.equals(excludeMimeTypes, config.excludeMimeTypes) && - Objects.equals(includeEmbeddedResourceTypes, config.includeEmbeddedResourceTypes) && - Objects.equals(excludeEmbeddedResourceTypes, config.excludeEmbeddedResourceTypes); + return zeroPadName == config.zeroPadName && includeOriginal == config.includeOriginal && + suffixStrategy == config.suffixStrategy && + Objects.equals(embeddedIdPrefix, config.embeddedIdPrefix) && + Objects.equals(emitter, config.emitter) && + keyBaseStrategy == config.keyBaseStrategy && + Objects.equals(emitKeyBase, config.emitKeyBase) && + zipEmbeddedFiles == config.zipEmbeddedFiles && + includeMetadataInZip == config.includeMetadataInZip; } @Override public int hashCode() { - int result = Boolean.hashCode(extractEmbeddedDocumentBytes); - result = 31 * result + zeroPadName; + int result = zeroPadName; result = 31 * result + Objects.hashCode(suffixStrategy); result = 31 * result + Objects.hashCode(embeddedIdPrefix); result = 31 * result + Objects.hashCode(emitter); result = 31 * result + Boolean.hashCode(includeOriginal); result = 31 * result + Objects.hashCode(keyBaseStrategy); result = 31 * result + Objects.hashCode(emitKeyBase); - result = 31 * result + Objects.hashCode(includeMimeTypes); - result = 31 * result + Objects.hashCode(excludeMimeTypes); - result = 31 * result + Objects.hashCode(includeEmbeddedResourceTypes); - result = 31 * result + Objects.hashCode(excludeEmbeddedResourceTypes); + result = 31 * result + Boolean.hashCode(zipEmbeddedFiles); + result = 31 * result + Boolean.hashCode(includeMetadataInZip); return result; } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractor.java 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractor.java similarity index 91% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractor.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractor.java index 356411cf6c..9732e689de 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractor.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractor.java @@ -35,7 +35,6 @@ import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; -import org.apache.tika.extractor.EmbeddedBytesSelector; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; import org.apache.tika.extractor.EmbeddedStreamTranslator; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; @@ -48,24 +47,23 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; /** - * Recursive Unpacker and text and metadata extractor. + * Embedded document extractor that parses and unpacks embedded documents, + * extracting both text/metadata and raw bytes. 
* * @since Apache Tika 3.0.0 */ -public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { +public class UnpackExtractor extends ParsingEmbeddedDocumentExtractor { private static final Logger LOGGER = LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); private static final File ABSTRACT_PATH = new File(""); - private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; - private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); private long bytesExtracted = 0; private final long maxEmbeddedBytesForExtraction; - public RUnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { + public UnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { super(context); this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; } @@ -158,7 +156,10 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { return; } - if (! embeddedBytesSelector.select(metadata)) { + // Get UnpackSelector from ParseContext - if configured, use it to filter + // If no selector configured, accept all embedded documents + UnpackSelector selector = context.get(UnpackSelector.class); + if (selector != null && !selector.select(metadata)) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("skipping embedded bytes {} <-> {}", metadata.get(Metadata.CONTENT_TYPE), @@ -189,12 +190,4 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { LOGGER.warn("problem writing out embedded bytes", e); } } - - public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) { - this.embeddedBytesSelector = embeddedBytesSelector; - } - - public EmbeddedBytesSelector getEmbeddedBytesSelector() { - return embeddedBytesSelector; - } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractorFactory.java 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractorFactory.java similarity index 72% rename from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractorFactory.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractorFactory.java index 1e77c2fb94..7bcec5f902 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/RUnpackExtractorFactory.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackExtractorFactory.java @@ -22,17 +22,11 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -@TikaComponent(name = "runpack-extractor-factory") -public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtractorFactory { +@TikaComponent(name = "unpack-extractor-factory") +public class UnpackExtractorFactory implements EmbeddedDocumentByteStoreExtractorFactory { @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - UnpackConfig config = parseContext.get(UnpackConfig.class); - if (config == null) { - config = UnpackConfig.SKIP; - } - RUnpackExtractor ex = new RUnpackExtractor(parseContext, Long.MAX_VALUE); - ex.setEmbeddedBytesSelector(config.createEmbeddedBytesSelector()); - return ex; + return new UnpackExtractor(parseContext, Long.MAX_VALUE); } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackSelector.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackSelector.java new file mode 100644 index 0000000000..048321fe58 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/UnpackSelector.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.core.extractor; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.extractor.EmbeddedBytesSelector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.StringUtils; + +/** + * Selector for filtering which embedded documents should have their bytes extracted + * during UNPACK mode. Configure via ParseContext in tika-config.json. 
+ * <p> + * Example configuration: + * <pre> + * { + * "parse-context": { + * "unpack-selector": { + * "includeMimeTypes": ["image/jpeg", "image/png"], + * "excludeMimeTypes": ["application/pdf"], + * "includeEmbeddedResourceTypes": ["ATTACHMENT"], + * "excludeEmbeddedResourceTypes": ["INLINE"] + * } + * } + * } + * </pre> + */ +@TikaComponent(name = "unpack-selector") +public class UnpackSelector implements EmbeddedBytesSelector { + + private Set<String> includeMimeTypes = new HashSet<>(); + private Set<String> excludeMimeTypes = new HashSet<>(); + private Set<String> includeEmbeddedResourceTypes = new HashSet<>(); + private Set<String> excludeEmbeddedResourceTypes = new HashSet<>(); + + public UnpackSelector() { + } + + @Override + public boolean select(Metadata metadata) { + // If no filters configured, accept all + if (includeMimeTypes.isEmpty() && excludeMimeTypes.isEmpty() + && includeEmbeddedResourceTypes.isEmpty() && excludeEmbeddedResourceTypes.isEmpty()) { + return true; + } + + String mime = metadata.get(Metadata.CONTENT_TYPE); + if (mime == null) { + mime = ""; + } else { + // If mime matters at all, make sure to get the mime without parameters + if (!includeMimeTypes.isEmpty() || !excludeMimeTypes.isEmpty()) { + MediaType mt = MediaType.parse(mime); + if (mt != null) { + mime = mt.getType() + "/" + mt.getSubtype(); + } + } + } + + if (excludeMimeTypes.contains(mime)) { + return false; + } + if (!includeMimeTypes.isEmpty() && !includeMimeTypes.contains(mime)) { + return false; + } + + String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + // If a parser doesn't specify the type, treat it as ATTACHMENT + embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? 
"ATTACHMENT" : + embeddedResourceType; + + if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) { + return false; + } + if (!includeEmbeddedResourceTypes.isEmpty() + && !includeEmbeddedResourceTypes.contains(embeddedResourceType)) { + return false; + } + + return true; + } + + public Set<String> getIncludeMimeTypes() { + return includeMimeTypes; + } + + public void setIncludeMimeTypes(Set<String> includeMimeTypes) { + this.includeMimeTypes = new HashSet<>(includeMimeTypes); + } + + public Set<String> getExcludeMimeTypes() { + return excludeMimeTypes; + } + + public void setExcludeMimeTypes(Set<String> excludeMimeTypes) { + this.excludeMimeTypes = new HashSet<>(excludeMimeTypes); + } + + public Set<String> getIncludeEmbeddedResourceTypes() { + return includeEmbeddedResourceTypes; + } + + public void setIncludeEmbeddedResourceTypes(Set<String> includeEmbeddedResourceTypes) { + this.includeEmbeddedResourceTypes = new HashSet<>(includeEmbeddedResourceTypes); + } + + public Set<String> getExcludeEmbeddedResourceTypes() { + return excludeEmbeddedResourceTypes; + } + + public void setExcludeEmbeddedResourceTypes(Set<String> excludeEmbeddedResourceTypes) { + this.excludeEmbeddedResourceTypes = new HashSet<>(excludeEmbeddedResourceTypes); + } + + @Override + public String toString() { + return "UnpackSelector{" + + "includeMimeTypes=" + includeMimeTypes + + ", excludeMimeTypes=" + excludeMimeTypes + + ", includeEmbeddedResourceTypes=" + includeEmbeddedResourceTypes + + ", excludeEmbeddedResourceTypes=" + excludeEmbeddedResourceTypes + + '}'; + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java index dddf11c502..cea86da760 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java +++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/EmitHandler.java @@ -32,6 +32,7 @@ import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.api.emitter.EmitKey; import org.apache.tika.pipes.api.emitter.Emitter; @@ -41,7 +42,6 @@ import org.apache.tika.pipes.core.EmitStrategyConfig; import org.apache.tika.pipes.core.PassbackFilter; import org.apache.tika.pipes.core.emitter.EmitDataImpl; import org.apache.tika.pipes.core.emitter.EmitterManager; -import org.apache.tika.pipes.core.extractor.UnpackConfig; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.StringUtils; @@ -68,7 +68,6 @@ class EmitHandler { //we need to apply the metadata filter after we pull out the stacktrace filterMetadata(parseData, parseContext); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); - UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); if (StringUtils.isBlank(stack) || onParseException == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) { injectUserMetadata(t.getMetadata(), parseData.getMetadataList()); @@ -78,8 +77,9 @@ class EmitHandler { t.setEmitKey(emitKey); } EmitDataImpl emitDataTuple = new EmitDataImpl(t.getEmitKey().getEmitKey(), parseData.getMetadataList(), stack); - if (shouldEmit(unpackConfig, parseData, emitDataTuple, parseContext)) { - return emit(t.getId(), emitKey, unpackConfig.isExtractEmbeddedDocumentBytes(), + ParseMode parseMode = parseContext.get(ParseMode.class); + if (shouldEmit(parseMode, parseData, emitDataTuple, parseContext)) { + return emit(t.getId(), emitKey, parseMode == ParseMode.UNPACK, parseData, stack, parseContext); } else { if (StringUtils.isBlank(stack)) { @@ -153,7 +153,7 @@ class EmitHandler { } - private 
boolean shouldEmit(UnpackConfig unpackConfig, MetadataListAndEmbeddedBytes parseData, + private boolean shouldEmit(ParseMode parseMode, MetadataListAndEmbeddedBytes parseData, EmitDataImpl emitDataTuple, ParseContext parseContext) { EmitStrategy strategy = emitStrategy; long thresholdBytes = directEmitThresholdBytes; @@ -166,10 +166,18 @@ class EmitHandler { } } - if (strategy == EmitStrategy.EMIT_ALL) { + // UNPACK mode: bytes are already emitted during parsing + // For PASSBACK_ALL, don't emit metadata - pass it back to client instead + // For other strategies, also emit metadata + if (parseMode == ParseMode.UNPACK) { + if (strategy == EmitStrategy.PASSBACK_ALL) { + // Bytes were emitted during parsing, metadata will be passed back + return false; + } return true; - } else if (unpackConfig.isExtractEmbeddedDocumentBytes() && - parseData.toBePackagedForStreamEmitter()) { + } + + if (strategy == EmitStrategy.EMIT_ALL) { return true; } else if (strategy == EmitStrategy.PASSBACK_ALL) { return false; diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/MetadataListAndEmbeddedBytes.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/MetadataListAndEmbeddedBytes.java index 1c9ea38ce2..dc52f5a59e 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/MetadataListAndEmbeddedBytes.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/MetadataListAndEmbeddedBytes.java @@ -22,7 +22,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.pipes.core.extractor.EmittingEmbeddedDocumentBytesHandler; +import org.apache.tika.pipes.core.extractor.EmittingUnpackHandler; class MetadataListAndEmbeddedBytes { @@ -66,7 +66,7 @@ class MetadataListAndEmbeddedBytes { * @return */ 
public boolean toBePackagedForStreamEmitter() { - return !(embeddedDocumentBytesHandler instanceof EmittingEmbeddedDocumentBytesHandler); + return !(embeddedDocumentBytesHandler instanceof EmittingUnpackHandler); } @Override diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 6e86502d2b..7ddf05ccba 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -88,7 +88,9 @@ class ParseHandler { ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(parseContext); if (parseMode == ParseMode.NO_PARSE) { metadataList = detectOnly(fetchEmitTuple, stream, metadata, parseContext); - } else if (parseMode == ParseMode.RMETA) { + } else if (parseMode == ParseMode.RMETA || parseMode == ParseMode.UNPACK) { + // UNPACK uses the same recursive parsing as RMETA + // The difference is in setup (PipesWorker) - UNPACK has mandatory byte extraction metadataList = parseRecursive(fetchEmitTuple, contentHandlerFactory, stream, metadata, parseContext); } else { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index d5a6c72497..b1054a73c9 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@ -63,6 +63,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import 
org.apache.tika.pipes.api.PipesResult; import org.apache.tika.pipes.core.EmitStrategy; import org.apache.tika.pipes.core.EmitStrategyConfig; @@ -71,7 +72,8 @@ import org.apache.tika.pipes.core.PipesConfig; import org.apache.tika.pipes.core.config.ConfigStore; import org.apache.tika.pipes.core.config.ConfigStoreFactory; import org.apache.tika.pipes.core.emitter.EmitterManager; -import org.apache.tika.pipes.core.extractor.RUnpackExtractorFactory; +import org.apache.tika.pipes.core.extractor.UnpackConfig; +import org.apache.tika.pipes.core.extractor.UnpackExtractorFactory; import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.core.serialization.JsonPipesIpc; import org.apache.tika.plugins.ExtensionConfig; @@ -79,6 +81,7 @@ import org.apache.tika.plugins.TikaPluginManager; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.serialization.ParseContextUtils; import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.StringUtils; /** * This server is forked from the PipesClient. This class isolates @@ -300,6 +303,8 @@ public class PipesServer implements AutoCloseable { CountDownLatch countDownLatch = new CountDownLatch(1); FetchEmitTuple fetchEmitTuple = readFetchEmitTuple(); + // Validate before merging with global config + validateFetchEmitTuple(fetchEmitTuple); // Create merged ParseContext: defaults from tika-config + request overrides ParseContext mergedContext = createMergedParseContext(fetchEmitTuple.getParseContext()); // Resolve friendly-named configs in ParseContext to actual objects @@ -460,6 +465,29 @@ public class PipesServer implements AutoCloseable { return null; } + /** + * Validates the FetchEmitTuple before merging with global config. + * If the tuple explicitly sets UnpackConfig with an emitter but ParseMode is not UNPACK, + * that's a configuration error. 
+ */ + private void validateFetchEmitTuple(FetchEmitTuple fetchEmitTuple) throws TikaConfigException { + ParseContext requestContext = fetchEmitTuple.getParseContext(); + if (requestContext == null) { + return; + } + UnpackConfig unpackConfig = requestContext.get(UnpackConfig.class); + ParseMode parseMode = requestContext.get(ParseMode.class); + + // If tuple explicitly has UnpackConfig with emitter but not UNPACK mode, that's an error + if (unpackConfig != null && !StringUtils.isBlank(unpackConfig.getEmitter()) + && parseMode != ParseMode.UNPACK) { + throw new TikaConfigException( + "FetchEmitTuple has UnpackConfig with emitter '" + unpackConfig.getEmitter() + + "' but ParseMode is " + parseMode + ". " + + "To extract embedded bytes, set ParseMode.UNPACK in the ParseContext."); + } + } + protected void initializeResources() throws TikaException, IOException, SAXException { TikaJsonConfig tikaJsonConfig = tikaLoader.getConfig(); @@ -489,10 +517,10 @@ public class PipesServer implements AutoCloseable { private ParseContext createMergedParseContext(ParseContext requestContext) throws TikaConfigException { // Create fresh context with defaults from tika-config (e.g., DigesterFactory) ParseContext mergedContext = tikaLoader.loadParseContext(); - // If no embedded document extractor factory is configured, use RUnpackExtractorFactory + // If no embedded document extractor factory is configured, use UnpackExtractorFactory // as the default for pipes scenarios (supports embedded byte extraction) if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null) { - mergedContext.set(EmbeddedDocumentExtractorFactory.class, new RUnpackExtractorFactory()); + mergedContext.set(EmbeddedDocumentExtractorFactory.class, new UnpackExtractorFactory()); } // Overlay request's values (request takes precedence) mergedContext.copyFrom(requestContext); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index d2f4d2cff6..3b4967d765 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@ -18,17 +18,22 @@ package org.apache.tika.pipes.core.server; import java.io.Closeable; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.concurrent.Callable; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; +import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentByteStoreExtractorFactory; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -36,12 +41,17 @@ import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; +import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.api.PipesResult; +import org.apache.tika.pipes.api.emitter.EmitKey; +import org.apache.tika.pipes.api.emitter.Emitter; +import org.apache.tika.pipes.api.emitter.StreamEmitter; import org.apache.tika.pipes.core.PipesResults; import org.apache.tika.pipes.core.emitter.EmitterManager; -import org.apache.tika.pipes.core.extractor.EmittingEmbeddedDocumentBytesHandler; -import org.apache.tika.pipes.core.extractor.RUnpackExtractor; +import 
org.apache.tika.pipes.core.extractor.EmittingUnpackHandler; +import org.apache.tika.pipes.core.extractor.TempFileUnpackHandler; import org.apache.tika.pipes.core.extractor.UnpackConfig; +import org.apache.tika.pipes.core.extractor.UnpackExtractorFactory; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.StringUtils; @@ -74,6 +84,7 @@ class PipesWorker implements Callable<PipesResult> { @Override public PipesResult call() throws Exception { MetadataListAndEmbeddedBytes parseData = null; + TempFileUnpackHandler tempHandler = null; try { //this can be null if there is a fetch exception ParseDataOrPipesResult parseDataResult = parseFromTuple(); @@ -87,9 +98,28 @@ class PipesWorker implements Callable<PipesResult> { if (parseData == null || metadataIsEmpty(parseData.getMetadataList())) { return PipesResults.EMPTY_OUTPUT; } + + // Check if we need to zip and emit embedded files + EmbeddedDocumentBytesHandler handler = parseContext.get(EmbeddedDocumentBytesHandler.class); + if (handler instanceof TempFileUnpackHandler) { + tempHandler = (TempFileUnpackHandler) handler; + PipesResult zipResult = zipAndEmitEmbeddedFiles(tempHandler); + if (zipResult != null) { + // Zipping/emitting failed - return the error + return zipResult; + } + } + return emitHandler.emitParseData(fetchEmitTuple, parseData, parseContext); } finally { - if (parseData != null && parseData.hasEmbeddedDocumentByteStore() && + // Clean up temp handler if used + if (tempHandler != null) { + try { + tempHandler.close(); + } catch (IOException e) { + LOG.warn("problem closing temp file handler", e); + } + } else if (parseData != null && parseData.hasEmbeddedDocumentByteStore() && parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) { try { ((Closeable) parseData.getEmbeddedDocumentBytesHandler()).close(); @@ -105,6 +135,147 @@ class PipesWorker implements Callable<PipesResult> { return metadataList == null || metadataList.isEmpty(); } + /** + * Zips all embedded files from 
the temp handler and emits the zip to the user's emitter. + * + * @param tempHandler the handler containing embedded files in temp directory + * @return PipesResult if there was an error, null if successful + */ + private PipesResult zipAndEmitEmbeddedFiles(TempFileUnpackHandler tempHandler) { + // Skip if no embedded files + if (!tempHandler.hasEmbeddedFiles()) { + LOG.debug("No embedded files to zip"); + return null; + } + + UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); + String emitterName = unpackConfig.getEmitter(); + Emitter emitter; + try { + emitter = emitterManager.getEmitter(emitterName); + } catch (Exception e) { + LOG.warn("Failed to get emitter for zip: {}", emitterName, e); + return new PipesResult(PipesResult.RESULT_STATUS.EMITTER_NOT_FOUND, + "Emitter not found for zipping: " + emitterName); + } + + if (!(emitter instanceof StreamEmitter)) { + LOG.warn("Emitter {} is not a StreamEmitter, cannot emit zip", emitterName); + return new PipesResult(PipesResult.RESULT_STATUS.EMIT_EXCEPTION, + "Emitter must be a StreamEmitter to emit zipped embedded files. 
Found: " + + emitter.getClass().getName()); + } + + StreamEmitter streamEmitter = (StreamEmitter) emitter; + EmitKey containerEmitKey = fetchEmitTuple.getEmitKey(); + + // Create zip file in temp directory + Path zipFile = tempHandler.getTempDirectory().resolve("embedded-files.zip"); + try { + createZipFile(zipFile, tempHandler, unpackConfig); + } catch (IOException e) { + LOG.warn("Failed to create zip file", e); + return new PipesResult(PipesResult.RESULT_STATUS.EMIT_EXCEPTION, + "Failed to create zip file: " + ExceptionUtils.getStackTrace(e)); + } + + // Emit the zip file + String zipEmitKey = containerEmitKey.getEmitKey() + "-embedded.zip"; + try (InputStream zipStream = Files.newInputStream(zipFile)) { + streamEmitter.emit(zipEmitKey, zipStream, new Metadata(), parseContext); + } catch (IOException e) { + LOG.warn("Failed to emit zip file", e); + return new PipesResult(PipesResult.RESULT_STATUS.EMIT_EXCEPTION, + "Failed to emit zip file: " + ExceptionUtils.getStackTrace(e)); + } + + LOG.debug("Successfully zipped and emitted {} embedded files to {}", + tempHandler.getEmbeddedFiles().size(), zipEmitKey); + return null; + } + + /** + * Creates a zip file containing all embedded files. 
+ */ + private void createZipFile(Path zipFile, TempFileUnpackHandler tempHandler, + UnpackConfig unpackConfig) throws IOException { + try (ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile))) { + // Include original document if requested + if (unpackConfig.isIncludeOriginal() && tempHandler.hasOriginalDocument()) { + ZipEntry originalEntry = new ZipEntry(tempHandler.getOriginalDocumentName()); + zos.putNextEntry(originalEntry); + Files.copy(tempHandler.getOriginalDocumentPath(), zos); + zos.closeEntry(); + } + + for (TempFileUnpackHandler.EmbeddedFileInfo fileInfo : tempHandler.getEmbeddedFiles()) { + // Add the embedded file + ZipEntry fileEntry = new ZipEntry(fileInfo.fileName()); + zos.putNextEntry(fileEntry); + Files.copy(fileInfo.filePath(), zos); + zos.closeEntry(); + + // Add metadata JSON if requested + if (unpackConfig.isIncludeMetadataInZip()) { + String metadataFileName = fileInfo.fileName() + ".metadata.json"; + ZipEntry metadataEntry = new ZipEntry(metadataFileName); + zos.putNextEntry(metadataEntry); + writeMetadataAsJson(zos, fileInfo.metadata()); + zos.closeEntry(); + } + } + } + } + + /** + * Writes metadata as JSON to the output stream. + */ + private void writeMetadataAsJson(OutputStream os, Metadata metadata) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + // Convert metadata to a map for JSON serialization + java.util.Map<String, Object> metadataMap = new java.util.LinkedHashMap<>(); + for (String name : metadata.names()) { + String[] values = metadata.getValues(name); + if (values.length == 1) { + metadataMap.put(name, values[0]); + } else { + metadataMap.put(name, values); + } + } + mapper.writeValue(os, metadataMap); + } + + /** + * Stores the original document to the temp handler for inclusion in the zip. + * Uses TikaInputStream's internal file caching to avoid consuming the stream. 
+ */ + private void storeOriginalDocument(TikaInputStream tis, TempFileUnpackHandler tempHandler) + throws IOException { + // Get the file name from fetch key + String fetchKey = fetchEmitTuple.getFetchKey().getFetchKey(); + String fileName = fetchKey; + int lastSlash = Math.max(fetchKey.lastIndexOf('/'), fetchKey.lastIndexOf('\\')); + if (lastSlash >= 0 && lastSlash < fetchKey.length() - 1) { + fileName = fetchKey.substring(lastSlash + 1); + } + + // TikaInputStream caches to a temp file internally - get that file + Path originalPath = tis.getPath(); + if (originalPath != null && Files.exists(originalPath)) { + // Copy from the cached file + try (InputStream is = Files.newInputStream(originalPath)) { + tempHandler.storeOriginalDocument(is, fileName); + } + } else { + // Stream hasn't been cached yet - we need to read and reset + tis.mark(Integer.MAX_VALUE); + try { + tempHandler.storeOriginalDocument(tis, fileName); + } finally { + tis.reset(); + } + } + } protected ParseDataOrPipesResult parseFromTuple() throws TikaException, InterruptedException { //start a new metadata object to gather info from the fetch process @@ -126,6 +297,15 @@ class PipesWorker implements Callable<PipesResult> { } try (TikaInputStream tis = tisOrResult.tis()) { + // Store original document for zipping if requested + EmbeddedDocumentBytesHandler handler = localContext.get(EmbeddedDocumentBytesHandler.class); + if (handler instanceof TempFileUnpackHandler) { + TempFileUnpackHandler tempHandler = (TempFileUnpackHandler) handler; + UnpackConfig uc = localContext.get(UnpackConfig.class); + if (uc != null && uc.isIncludeOriginal()) { + storeOriginalDocument(tis, tempHandler); + } + } return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, localContext); } catch (SecurityException e) { LOG.error("security exception id={}", fetchEmitTuple.getId(), e); @@ -150,31 +330,46 @@ class PipesWorker implements Callable<PipesResult> { parseContext.set(MetadataWriteLimiterFactory.class, 
defaultMetadataWriteLimiterFactory); } + ParseMode parseMode = parseContext.get(ParseMode.class); UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); - if (unpackConfig == null) { - //make sure there's one here -- or do we make this default in fetchemit tuple? - parseContext.set(UnpackConfig.class, UnpackConfig.SKIP); - return parseContext; - } - EmbeddedDocumentExtractorFactory factory = parseContext.get(EmbeddedDocumentExtractorFactory.class); - if (factory == null) { - parseContext.set(EmbeddedDocumentExtractor.class, - new RUnpackExtractor(parseContext, Long.MAX_VALUE)); - } else { - if (!(factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) { - throw new TikaConfigException("EmbeddedDocumentExtractorFactory must be an " + - "instance of EmbeddedDocumentByteStoreExtractorFactory if you want " + - "to extract embedded bytes! I see this embedded doc factory: " + - factory.getClass() + " and a request: " + - unpackConfig); + + // For UNPACK mode, automatically set up byte extraction + if (parseMode == ParseMode.UNPACK) { + if (unpackConfig == null) { + unpackConfig = new UnpackConfig(); + parseContext.set(UnpackConfig.class, unpackConfig); } - } - // Only set up embedded document bytes handler if an emitter is configured - if (!StringUtils.isBlank(unpackConfig.getEmitter())) { - parseContext.set(EmbeddedDocumentBytesHandler.class, - new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple, emitterManager)); + + // Determine emitter: prefer UnpackConfig, fall back to FetchEmitTuple + String emitterName = unpackConfig.getEmitter(); + if (StringUtils.isBlank(emitterName)) { + emitterName = fetchEmitTuple.getEmitKey().getEmitterId(); + if (StringUtils.isBlank(emitterName)) { + throw new TikaConfigException( + "UNPACK parse mode requires an emitter. 
Set emitter in UnpackConfig " + + "or specify an emitterId in FetchEmitTuple.emitKey."); + } + unpackConfig.setEmitter(emitterName); + } + + // Set up the extractor factory - the extractor will be created during parsing + // with the correct context (after RecursiveParserWrapper sets up EmbeddedParserDecorator) + parseContext.set(EmbeddedDocumentExtractorFactory.class, new UnpackExtractorFactory()); + + // Set up the bytes handler - use temp file handler if zipping requested + if (unpackConfig.isZipEmbeddedFiles()) { + parseContext.set(EmbeddedDocumentBytesHandler.class, + new TempFileUnpackHandler(fetchEmitTuple.getEmitKey(), unpackConfig)); + } else { + parseContext.set(EmbeddedDocumentBytesHandler.class, + new EmittingUnpackHandler(fetchEmitTuple, emitterManager, parseContext)); + } + + return parseContext; } + // For non-UNPACK modes, no byte extraction setup needed + // UnpackConfig may be present from config file but is only used when ParseMode.UNPACK is set return parseContext; } diff --git a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackConfigSelectorTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackSelectorTest.java similarity index 69% rename from tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackConfigSelectorTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackSelectorTest.java index 685d8d0715..8a131797d8 100644 --- a/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackConfigSelectorTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/core/extractor/UnpackSelectorTest.java @@ -24,20 +24,17 @@ import java.util.Set; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; -import org.apache.tika.extractor.EmbeddedBytesSelector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; 
import org.apache.tika.utils.StringUtils; -public class UnpackConfigSelectorTest extends TikaTest { +public class UnpackSelectorTest extends TikaTest { @Test - public void testEmbeddedBytesSelector() throws Exception { - UnpackConfig config = new UnpackConfig(); - config.setIncludeMimeTypes(Set.of("application/pdf", "application/rtf", "text/plain")); - config.setIncludeEmbeddedResourceTypes(Set.of("ATTACHMENT", "INLINE")); - - EmbeddedBytesSelector selector = config.createEmbeddedBytesSelector(); + public void testUnpackSelector() throws Exception { + UnpackSelector selector = new UnpackSelector(); + selector.setIncludeMimeTypes(Set.of("application/pdf", "application/rtf", "text/plain")); + selector.setIncludeEmbeddedResourceTypes(Set.of("ATTACHMENT", "INLINE")); assertFalse(selector.select(getMetadata("", ""))); assertTrue(selector.select(getMetadata("application/pdf", ""))); @@ -51,8 +48,7 @@ public class UnpackConfigSelectorTest extends TikaTest { @Test public void testAcceptAllWhenNoFilters() { - UnpackConfig config = new UnpackConfig(); - EmbeddedBytesSelector selector = config.createEmbeddedBytesSelector(); + UnpackSelector selector = new UnpackSelector(); // With no filters, should accept all assertTrue(selector.select(getMetadata("application/pdf", ""))); @@ -60,6 +56,24 @@ public class UnpackConfigSelectorTest extends TikaTest { assertTrue(selector.select(getMetadata("", ""))); } + @Test + public void testExcludeMimeTypes() { + UnpackSelector selector = new UnpackSelector(); + selector.setExcludeMimeTypes(Set.of("application/pdf")); + + assertTrue(selector.select(getMetadata("application/docx", ""))); + assertFalse(selector.select(getMetadata("application/pdf", ""))); + } + + @Test + public void testExcludeEmbeddedResourceTypes() { + UnpackSelector selector = new UnpackSelector(); + selector.setExcludeEmbeddedResourceTypes(Set.of("MACRO")); + + assertTrue(selector.select(getMetadata("application/pdf", "ATTACHMENT"))); + 
assertFalse(selector.select(getMetadata("application/pdf", "MACRO"))); + } + private Metadata getMetadata(String mime, String embeddedResourceType) { Metadata m = new Metadata(); if (!StringUtils.isBlank(mime)) { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/DigestingOpenContainersTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/DigestingOpenContainersTest.java index 1beb9fba75..bf02f11697 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/DigestingOpenContainersTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/DigestingOpenContainersTest.java @@ -31,7 +31,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.pipes.core.extractor.RUnpackExtractorFactory; +import org.apache.tika.pipes.core.extractor.UnpackExtractorFactory; public class DigestingOpenContainersTest extends TikaTest { @@ -45,7 +45,7 @@ public class DigestingOpenContainersTest extends TikaTest { ParseContext parseContext = loader.loadParseContext(); //this models what happens in tika-pipes if (parseContext.get(EmbeddedDocumentExtractorFactory.class) == null) { - parseContext.set(EmbeddedDocumentExtractorFactory.class, new RUnpackExtractorFactory()); + parseContext.set(EmbeddedDocumentExtractorFactory.class, new UnpackExtractorFactory()); } List<Metadata> metadataList = getRecursiveMetadata("testLargeOLEDoc.doc", autoDetectParser, parseContext); diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UnpackModeTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/UnpackModeTest.java new file mode 100644 index 0000000000..80024bb622 --- /dev/null +++ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.pipes.core;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Stream;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
import org.apache.tika.pipes.core.extractor.UnpackConfig;

/**
 * Tests for the UNPACK ParseMode functionality.
 * <p>
 * Every test builds a {@link PipesClient} against a file-system fetcher/emitter
 * rooted in a JUnit temp directory ({@code input} and {@code output} sub-directories)
 * and submits a single {@link FetchEmitTuple} whose {@link ParseContext} selects the
 * parse mode under test.
 */
public class UnpackModeTest {

    // Fetcher/emitter ids passed to FetchKey/EmitKey; presumably they match the ids in
    // the config written by PluginsTestHelper -- TODO confirm against that helper.
    private final String fetcherName = "fsf";
    private final String emitterName = "fse";
    private final String testDocWithEmbedded = "mock-embedded.xml";

    /**
     * Writes the file-system fetcher/emitter config under {@code tmp}, copies the test
     * file into the temp input directory and builds a client from that config.
     * <p>
     * NOTE(review): none of the tests close the returned client; confirm whether
     * PipesClient needs an explicit shutdown to release its resources.
     *
     * @param tmp          temp directory that holds config, input and output
     * @param testFileName name of the test document to copy into the input directory
     * @return a client configured from the generated tika config
     * @throws Exception if config generation, file copy or config loading fails
     */
    private PipesClient init(Path tmp, String testFileName) throws Exception {
        Path tikaConfigPath = PluginsTestHelper.getFileSystemFetcherConfig(tmp, tmp.resolve("input"),
                tmp.resolve("output"));
        PluginsTestHelper.copyTestFilesToTmpInput(tmp, testFileName);

        TikaJsonConfig tikaJsonConfig = TikaJsonConfig.load(tikaConfigPath);
        PipesConfig pipesConfig = PipesConfig.load(tikaJsonConfig);
        return new PipesClient(pipesConfig, tikaConfigPath);
    }

    /**
     * Creates a fresh {@link ParseContext} carrying only the requested parse mode.
     */
    private static ParseContext newContext(ParseMode parseMode) {
        ParseContext parseContext = new ParseContext();
        parseContext.set(ParseMode.class, parseMode);
        return parseContext;
    }

    /**
     * Builds a tuple that fetches {@code fetchPath} through the test fetcher, emits to
     * {@code emitKey} and uses the EMIT on-parse-exception policy.
     */
    private FetchEmitTuple newTuple(String id, String fetchPath, EmitKey emitKey, ParseContext parseContext) {
        return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchPath), emitKey, new Metadata(),
                parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
    }

    /**
     * Convenience overload: id, fetch path and emit path are all {@code docName} and the
     * emit key targets the test emitter.
     */
    private FetchEmitTuple newTuple(String docName, ParseContext parseContext) {
        return newTuple(docName, docName, new EmitKey(emitterName, docName), parseContext);
    }

    /**
     * Reports whether {@code dir} has at least one entry. The directory stream is
     * closed before returning so no directory handle is leaked.
     */
    private static boolean containsAnyEntry(Path dir) throws IOException {
        try (Stream<Path> entries = Files.list(dir)) {
            return entries.findAny().isPresent();
        }
    }

    @Test
    public void testUnpackModeBasic(@TempDir Path tmp) throws Exception {
        // Test that UNPACK mode works and returns metadata like RMETA
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        PipesResult pipesResult =
                pipesClient.process(newTuple(testDocWithEmbedded, newContext(ParseMode.UNPACK)));

        assertTrue(pipesResult.isSuccess(), "UNPACK mode should succeed. Status: " + pipesResult.status() +
                ", Message: " + pipesResult.message());

        // UNPACK mode may return EMIT_SUCCESS (without emitData) if passback filter is not used
        // Check if we have emitData, otherwise just verify success
        if (pipesResult.emitData() != null && pipesResult.emitData().getMetadataList() != null) {
            // With RMETA-like behavior, we should get metadata for container + embedded docs
            // mock-embedded.xml has 4 embedded documents, so we expect 5 metadata objects
            List<Metadata> metadataList = pipesResult.emitData().getMetadataList();
            assertEquals(5, metadataList.size(),
                    "UNPACK should return RMETA-style metadata list (container + 4 embedded docs)");

            // Verify container metadata
            assertEquals("Nikolai Lobachevsky", metadataList.get(0).get("author"));

            // Verify embedded metadata
            for (int i = 1; i < metadataList.size(); i++) {
                assertEquals("embeddedAuthor", metadataList.get(i).get("author"),
                        "Embedded document " + i + " should have embedded author");
            }
        }
        // Even without emitData passback, the fact that isSuccess() is true means UNPACK worked
    }

    @Test
    public void testUnpackModeAutoSetup(@TempDir Path tmp) throws Exception {
        // Test that UNPACK mode works without explicit UnpackConfig
        // It should automatically set up UnpackExtractor and EmittingUnpackHandler
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        // No UnpackConfig set - should be created automatically
        PipesResult pipesResult =
                pipesClient.process(newTuple(testDocWithEmbedded, newContext(ParseMode.UNPACK)));

        assertTrue(pipesResult.isSuccess(),
                "UNPACK should work without explicit UnpackConfig. Status: " + pipesResult.status() +
                        ", Message: " + pipesResult.message());
    }

    @Test
    public void testUnpackModeRequiresEmitter(@TempDir Path tmp) throws Exception {
        // Test that UNPACK mode fails gracefully when no emitter is specified
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        // Create EmitKey with no emitterId to trigger the error
        PipesResult pipesResult = pipesClient.process(
                newTuple(testDocWithEmbedded, testDocWithEmbedded, new EmitKey("", ""),
                        newContext(ParseMode.UNPACK)));

        // Should fail because no emitter is configured
        // The error could be a crash (TikaConfigException thrown), initialization failure, or task exception
        assertFalse(pipesResult.isSuccess(),
                "UNPACK without emitter should fail. Status: " + pipesResult.status());
        assertNotNull(pipesResult.message());
        assertTrue(pipesResult.message().contains("emitter") || pipesResult.message().contains("UNPACK") ||
                        pipesResult.message().contains("TikaConfigException"),
                "Error message should mention emitter requirement: " + pipesResult.message());
    }

    @Test
    public void testUnpackModeReturnsMetadata(@TempDir Path tmp) throws Exception {
        // Test that UNPACK mode returns full metadata list like RMETA
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        PipesResult pipesResult =
                pipesClient.process(newTuple(testDocWithEmbedded, newContext(ParseMode.UNPACK)));

        assertTrue(pipesResult.isSuccess(), "Processing should succeed. Status: " + pipesResult.status() +
                ", Message: " + pipesResult.message());

        // Check if emitData is available (depends on emit strategy)
        if (pipesResult.emitData() != null && pipesResult.emitData().getMetadataList() != null) {
            List<Metadata> metadataList = pipesResult.emitData().getMetadataList();
            assertTrue(metadataList.size() > 1,
                    "UNPACK should return multiple metadata objects for documents with embedded content");

            // Each metadata object should have content type
            for (Metadata m : metadataList) {
                assertNotNull(m.get("Content-Type"), "Each document should have Content-Type");
            }
        }
    }

    @Test
    public void testUnpackModeWithCustomUnpackConfig(@TempDir Path tmp) throws Exception {
        // Test that UNPACK mode respects custom UnpackConfig settings
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        ParseContext parseContext = newContext(ParseMode.UNPACK);

        // Create custom UnpackConfig with specific settings
        UnpackConfig unpackConfig = new UnpackConfig();
        unpackConfig.setEmitter(emitterName);
        unpackConfig.setZeroPadName(8);
        unpackConfig.setSuffixStrategy(UnpackConfig.SUFFIX_STRATEGY.DETECTED);
        parseContext.set(UnpackConfig.class, unpackConfig);

        PipesResult pipesResult = pipesClient.process(newTuple(testDocWithEmbedded, parseContext));

        assertTrue(pipesResult.isSuccess(),
                "UNPACK with custom UnpackConfig should succeed. Status: " + pipesResult.status());
    }

    @Test
    public void testUnpackModeWithIncludeOriginal(@TempDir Path tmp) throws Exception {
        // Test that includeOriginal=true works with UNPACK mode
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        ParseContext parseContext = newContext(ParseMode.UNPACK);

        UnpackConfig unpackConfig = new UnpackConfig();
        unpackConfig.setEmitter(emitterName);
        unpackConfig.setIncludeOriginal(true);
        parseContext.set(UnpackConfig.class, unpackConfig);

        PipesResult pipesResult = pipesClient.process(newTuple(testDocWithEmbedded, parseContext));

        assertTrue(pipesResult.isSuccess(),
                "UNPACK with includeOriginal should succeed. Status: " + pipesResult.status());
    }

    @Test
    public void testUnpackModeVsRmetaMode(@TempDir Path tmp) throws Exception {
        // Compare UNPACK mode output with RMETA mode to verify metadata consistency
        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        // Process with RMETA; same input file, distinct id and emit path
        String rmetaId = testDocWithEmbedded + "-rmeta";
        PipesResult rmetaResult = pipesClient.process(
                newTuple(rmetaId, testDocWithEmbedded, new EmitKey(emitterName, rmetaId),
                        newContext(ParseMode.RMETA)));

        // Process with UNPACK; same input file, distinct id and emit path
        String unpackId = testDocWithEmbedded + "-unpack";
        PipesResult unpackResult = pipesClient.process(
                newTuple(unpackId, testDocWithEmbedded, new EmitKey(emitterName, unpackId),
                        newContext(ParseMode.UNPACK)));

        // Both should succeed
        assertTrue(rmetaResult.isSuccess(), "RMETA processing should succeed. Status: " + rmetaResult.status());
        assertTrue(unpackResult.isSuccess(), "UNPACK processing should succeed. Status: " + unpackResult.status() +
                ", Message: " + unpackResult.message());

        // If emitData is available for both, compare them
        if (rmetaResult.emitData() != null && rmetaResult.emitData().getMetadataList() != null &&
                unpackResult.emitData() != null && unpackResult.emitData().getMetadataList() != null) {
            List<Metadata> rmetaList = rmetaResult.emitData().getMetadataList();
            List<Metadata> unpackList = unpackResult.emitData().getMetadataList();

            assertEquals(rmetaList.size(), unpackList.size(),
                    "UNPACK should return same number of metadata objects as RMETA");

            // Compare key metadata values
            for (int i = 0; i < rmetaList.size(); i++) {
                assertEquals(rmetaList.get(i).get("author"), unpackList.get(i).get("author"),
                        "Author metadata should match at index " + i);
                assertEquals(rmetaList.get(i).get("Content-Type"), unpackList.get(i).get("Content-Type"),
                        "Content-Type should match at index " + i);
            }
        }
    }

    @Test
    public void testUnpackModeWithSimpleDocument(@TempDir Path tmp) throws Exception {
        // Test UNPACK mode with a simple document (no embedded files)
        String simpleDoc = "mock_times.xml";
        PipesClient pipesClient = init(tmp, simpleDoc);

        PipesResult pipesResult = pipesClient.process(newTuple(simpleDoc, newContext(ParseMode.UNPACK)));

        assertTrue(pipesResult.isSuccess(),
                "UNPACK should work with simple documents. Status: " + pipesResult.status() +
                        ", Message: " + pipesResult.message());

        // Check emitData if available
        if (pipesResult.emitData() != null && pipesResult.emitData().getMetadataList() != null) {
            assertEquals(1, pipesResult.emitData().getMetadataList().size(),
                    "Simple document should have exactly one metadata object");
        }
    }

    @Test
    public void testParseModeParseMethod() {
        // Test the parse() method includes UNPACK in error message.
        // assertThrows makes the test fail if parse() accepts the invalid value; a bare
        // try/catch would pass silently in that case.
        IllegalArgumentException e =
                assertThrows(IllegalArgumentException.class, () -> ParseMode.parse("INVALID_MODE"));
        assertTrue(e.getMessage().contains("UNPACK"),
                "Error message should include UNPACK as a valid option: " + e.getMessage());

        // Test that UNPACK can be parsed
        assertEquals(ParseMode.UNPACK, ParseMode.parse("UNPACK"));
        assertEquals(ParseMode.UNPACK, ParseMode.parse("unpack"));
        assertEquals(ParseMode.UNPACK, ParseMode.parse("Unpack"));
    }

    @Test
    public void testUnpackModeBytesEmittedToOutputDir(@TempDir Path tmp) throws Exception {
        // Test that embedded bytes are actually emitted to the output directory
        Path outputDir = tmp.resolve("output");
        Files.createDirectories(outputDir);

        PipesClient pipesClient = init(tmp, testDocWithEmbedded);

        PipesResult pipesResult =
                pipesClient.process(newTuple(testDocWithEmbedded, newContext(ParseMode.UNPACK)));

        assertTrue(pipesResult.isSuccess(), "UNPACK should succeed");

        // Check that output files were created for the embedded documents
        // The exact naming depends on the EmittingUnpackHandler configuration
        // At minimum, we verify the metadata JSON was written
        assertTrue(Files.exists(outputDir.resolve(testDocWithEmbedded + ".json")) ||
                        containsAnyEntry(outputDir),
                "Output directory should contain emitted files");
    }
}
"zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -95,8 +95,8 @@ "emitter": "fs", "emitKey": "6.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -110,8 +110,8 @@ "emitter": "fs", "emitKey": "7.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -125,8 +125,8 @@ "emitter": "fs", "emitKey": "8.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -140,8 +140,8 @@ "emitter": "fs", "emitKey": "9.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -155,8 +155,8 @@ "emitter": "fs", "emitKey": "10.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -170,8 +170,8 @@ "emitter": "fs", "emitKey": "11.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -185,8 +185,8 @@ "emitter": "fs", "emitKey": "12.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -200,8 +200,8 @@ "emitter": "fs", "emitKey": "13.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", 
"embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -215,8 +215,8 @@ "emitter": "fs", "emitKey": "14.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -230,8 +230,8 @@ "emitter": "fs", "emitKey": "15.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -245,8 +245,8 @@ "emitter": "fs", "emitKey": "16.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -260,8 +260,8 @@ "emitter": "fs", "emitKey": "17.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -275,8 +275,8 @@ "emitter": "fs", "emitKey": "18.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -290,8 +290,8 @@ "emitter": "fs", "emitKey": "19.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -305,8 +305,8 @@ "emitter": "fs", "emitKey": "20.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -320,8 +320,8 @@ "emitter": "fs", "emitKey": 
"21.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -335,8 +335,8 @@ "emitter": "fs", "emitKey": "22.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -350,8 +350,8 @@ "emitter": "fs", "emitKey": "23.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -365,8 +365,8 @@ "emitter": "fs", "emitKey": "24.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -380,8 +380,8 @@ "emitter": "fs", "emitKey": "25.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -395,8 +395,8 @@ "emitter": "fs", "emitKey": "26.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -410,8 +410,8 @@ "emitter": "fs", "emitKey": "27.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -425,8 +425,8 @@ "emitter": "fs", "emitKey": "28.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", 
"embeddedIdPrefix": "-", @@ -440,8 +440,8 @@ "emitter": "fs", "emitKey": "29.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -455,8 +455,8 @@ "emitter": "fs", "emitKey": "30.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -470,8 +470,8 @@ "emitter": "fs", "emitKey": "31.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -485,8 +485,8 @@ "emitter": "fs", "emitKey": "32.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -500,8 +500,8 @@ "emitter": "fs", "emitKey": "33.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -515,8 +515,8 @@ "emitter": "fs", "emitKey": "34.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -530,8 +530,8 @@ "emitter": "fs", "emitKey": "35.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -545,8 +545,8 @@ "emitter": "fs", "emitKey": "36.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - 
"extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -560,8 +560,8 @@ "emitter": "fs", "emitKey": "37.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -575,8 +575,8 @@ "emitter": "fs", "emitKey": "38.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -590,8 +590,8 @@ "emitter": "fs", "emitKey": "39.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -605,8 +605,8 @@ "emitter": "fs", "emitKey": "40.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -620,8 +620,8 @@ "emitter": "fs", "emitKey": "41.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -635,8 +635,8 @@ "emitter": "fs", "emitKey": "42.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -650,8 +650,8 @@ "emitter": "fs", "emitKey": "43.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -665,8 +665,8 @@ "emitter": "fs", "emitKey": "44.xml.json", "onParseException": 
"emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -680,8 +680,8 @@ "emitter": "fs", "emitKey": "45.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -695,8 +695,8 @@ "emitter": "fs", "emitKey": "46.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -710,8 +710,8 @@ "emitter": "fs", "emitKey": "47.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -725,8 +725,8 @@ "emitter": "fs", "emitKey": "48.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -740,8 +740,8 @@ "emitter": "fs", "emitKey": "49.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -755,8 +755,8 @@ "emitter": "fs", "emitKey": "50.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -770,8 +770,8 @@ "emitter": "fs", "emitKey": "51.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -785,8 +785,8 
@@ "emitter": "fs", "emitKey": "52.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -800,8 +800,8 @@ "emitter": "fs", "emitKey": "53.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -815,8 +815,8 @@ "emitter": "fs", "emitKey": "54.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -830,8 +830,8 @@ "emitter": "fs", "emitKey": "55.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -845,8 +845,8 @@ "emitter": "fs", "emitKey": "56.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -860,8 +860,8 @@ "emitter": "fs", "emitKey": "57.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -875,8 +875,8 @@ "emitter": "fs", "emitKey": "58.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -890,8 +890,8 @@ "emitter": "fs", "emitKey": "59.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, 
"suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -905,8 +905,8 @@ "emitter": "fs", "emitKey": "60.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -920,8 +920,8 @@ "emitter": "fs", "emitKey": "61.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -935,8 +935,8 @@ "emitter": "fs", "emitKey": "62.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -950,8 +950,8 @@ "emitter": "fs", "emitKey": "63.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -965,8 +965,8 @@ "emitter": "fs", "emitKey": "64.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -980,8 +980,8 @@ "emitter": "fs", "emitKey": "65.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -995,8 +995,8 @@ "emitter": "fs", "emitKey": "66.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1010,8 +1010,8 @@ "emitter": "fs", "emitKey": "67.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", 
"embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1025,8 +1025,8 @@ "emitter": "fs", "emitKey": "68.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1040,8 +1040,8 @@ "emitter": "fs", "emitKey": "69.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1055,8 +1055,8 @@ "emitter": "fs", "emitKey": "70.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1070,8 +1070,8 @@ "emitter": "fs", "emitKey": "71.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1085,8 +1085,8 @@ "emitter": "fs", "emitKey": "72.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1100,8 +1100,8 @@ "emitter": "fs", "emitKey": "73.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1115,8 +1115,8 @@ "emitter": "fs", "emitKey": "74.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1130,8 +1130,8 @@ "emitter": 
"fs", "emitKey": "75.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1145,8 +1145,8 @@ "emitter": "fs", "emitKey": "76.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1160,8 +1160,8 @@ "emitter": "fs", "emitKey": "77.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1175,8 +1175,8 @@ "emitter": "fs", "emitKey": "78.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1190,8 +1190,8 @@ "emitter": "fs", "emitKey": "79.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1205,8 +1205,8 @@ "emitter": "fs", "emitKey": "80.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1220,8 +1220,8 @@ "emitter": "fs", "emitKey": "81.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1235,8 +1235,8 @@ "emitter": "fs", "emitKey": "82.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, 
"suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1250,8 +1250,8 @@ "emitter": "fs", "emitKey": "83.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1265,8 +1265,8 @@ "emitter": "fs", "emitKey": "84.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1280,8 +1280,8 @@ "emitter": "fs", "emitKey": "85.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1295,8 +1295,8 @@ "emitter": "fs", "emitKey": "86.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1310,8 +1310,8 @@ "emitter": "fs", "emitKey": "87.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1325,8 +1325,8 @@ "emitter": "fs", "emitKey": "88.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1340,8 +1340,8 @@ "emitter": "fs", "emitKey": "89.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1355,8 +1355,8 @@ "emitter": "fs", "emitKey": "90.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", 
"embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1370,8 +1370,8 @@ "emitter": "fs", "emitKey": "91.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1385,8 +1385,8 @@ "emitter": "fs", "emitKey": "92.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1400,8 +1400,8 @@ "emitter": "fs", "emitKey": "93.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1415,8 +1415,8 @@ "emitter": "fs", "emitKey": "94.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1430,8 +1430,8 @@ "emitter": "fs", "emitKey": "95.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1445,8 +1445,8 @@ "emitter": "fs", "emitKey": "96.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1460,8 +1460,8 @@ "emitter": "fs", "emitKey": "97.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1475,8 +1475,8 @@ "emitter": 
"fs", "emitKey": "98.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", @@ -1490,8 +1490,8 @@ "emitter": "fs", "emitKey": "99.xml.json", "onParseException": "emit", + "parseMode": "UNPACK", "embeddedDocumentBytesConfig": { - "extractEmbeddedDocumentBytes": true, "zeroPadName": 0, "suffixStrategy": "NONE", "embeddedIdPrefix": "-", diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index ef764c404b..60b675b879 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -114,8 +114,7 @@ public class AsyncResource { } ParseContext parseContext = t.getParseContext(); UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class); - if (unpackConfig != null && unpackConfig.isExtractEmbeddedDocumentBytes() && - !StringUtils.isAllBlank(unpackConfig.getEmitter())) { + if (unpackConfig != null && !StringUtils.isAllBlank(unpackConfig.getEmitter())) { String bytesEmitter = unpackConfig.getEmitter(); if (!emitterManager .getSupported() diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index 8e69634ff4..e1374d889f 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -251,7 +251,7 @@ public class TikaPipesTest extends CXFTestBase { @Test public void testBytes() throws Exception { - 
UnpackConfig config = new UnpackConfig(true); + UnpackConfig config = new UnpackConfig(); config.setEmitter(EMITTER_BYTES_ID); config.setIncludeOriginal(true); config.setKeyBaseStrategy(UnpackConfig.KEY_BASE_STRATEGY.CUSTOM);
