This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4207 by this push: new 8cdaff4b3 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor 8cdaff4b3 is described below commit 8cdaff4b3e2a4a477f753f3bfca751d804721a9d Author: tallison <talli...@apache.org> AuthorDate: Thu Mar 28 07:11:46 2024 -0400 TIKA-4207 -- further refactorings to simplify class structure and bring back the default ParsingEmbeddedDocumentExtractor --- ...a => AbstractEmbeddedDocumentBytesHandler.java} | 2 +- ...java => BasicEmbeddedDocumentBytesHandler.java} | 12 ++- ...EmbeddedDocumentByteStoreExtractorFactory.java} | 24 +++--- ...tore.java => EmbeddedDocumentBytesHandler.java} | 4 +- .../tika/extractor/EmbeddedDocumentUtil.java | 2 +- .../ParsingEmbeddedDocumentExtractor.java | 93 +--------------------- .../ParsingEmbeddedDocumentExtractorFactory.java | 74 +---------------- ...ocumentExtractor.java => RUnpackExtractor.java} | 19 +++-- ...orFactory.java => RUnpackExtractorFactory.java} | 11 ++- .../org/apache/tika/parser/AutoDetectParser.java | 11 ++- .../apache/tika/parser/AutoDetectParserConfig.java | 4 +- .../java/org/apache/tika/pipes/PipesServer.java | 67 +++++++++++----- .../extractor/EmbeddedDocumentBytesConfig.java | 9 +++ ...a => EmittingEmbeddedDocumentBytesHandler.java} | 15 ++-- .../tika/parser/AutoDetectParserConfigTest.java | 10 +-- .../org/apache/tika/pipes/PipesServerTest.java | 17 +++- .../config/TIKA-4207-embedded-bytes-config.xml | 2 +- .../apache/tika/pipes/TIKA-4207-limit-bytes.xml | 2 +- .../apache/tika/example/ExtractEmbeddedFiles.java | 2 +- .../parser/microsoft/pst/OutlookPSTParserTest.java | 2 +- .../apache/tika/parser/pdf/PDFRenderingTest.java | 2 +- .../resources/configs/tika-config-no-names.xml | 2 +- .../resources/configs/tika-config-with-names.xml | 2 +- 23 files changed, 142 insertions(+), 246 deletions(-) diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java similarity index 96% rename from tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java rename to tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java index 15b26451a..3f2f38f94 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java @@ -28,7 +28,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.utils.StringUtils; -public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocumentByteStore { +public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler { List<Integer> ids = new ArrayList<>(); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java similarity index 80% rename from tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java rename to tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java index d3aeb4507..cf6441b4f 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java @@ -27,9 +27,16 @@ import org.apache.commons.io.input.UnsynchronizedBufferedInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; -public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByteStore { +/** + * For now, this is an 
in-memory EmbeddedDocumentBytesHandler that stores + * all the bytes in memory. Users can retrieve the documents with {@link #getDocument(int)}. + * + * We'll need to make this cache to disk at some point if there are many bytes of + * embedded documents. + */ +public class BasicEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler { private final EmbeddedDocumentBytesConfig config; - public BasicEmbeddedDocumentByteStore(EmbeddedDocumentBytesConfig config) { + public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig config) { this.config = config; } //this won't scale, but let's start fully in memory for now; @@ -40,7 +47,6 @@ public class BasicEmbeddedDocumentByteStore extends AbstractEmbeddedDocumentByte docBytes.put(id, IOUtils.toByteArray(is)); } - @Override public InputStream getDocument(int id) throws IOException { return new UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get(); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java similarity index 59% copy from tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java copy to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java index 8e1e8e325..f7237bd6a 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java @@ -16,18 +16,18 @@ */ package org.apache.tika.extractor; -import java.io.Closeable; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import org.apache.tika.metadata.Metadata; - -public interface EmbeddedDocumentByteStore extends Closeable { - //we need metadata for the emitter store...can we get away without it? 
- void add(int id, Metadata metadata, InputStream inputStream) throws IOException; - - InputStream getDocument(int id) throws IOException; +/** + * Factories that create EmbeddedDocumentExtractors that require an + * {@link EmbeddedDocumentBytesHandler} in the + * {@link org.apache.tika.parser.ParseContext} should extend this. + * + * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} + * to use the {@link RUnpackExtractor} if the user doesn't configure a custom + * EmbeddedDocumentExtractor. + * + * TODO: Figure out how to simplify this and allow for emitting of the source document. + */ +public interface EmbeddedDocumentByteStoreExtractorFactory extends EmbeddedDocumentExtractorFactory { - List<Integer> getIds(); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java similarity index 90% rename from tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java rename to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java index 8e1e8e325..12357a718 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java @@ -23,11 +23,9 @@ import java.util.List; import org.apache.tika.metadata.Metadata; -public interface EmbeddedDocumentByteStore extends Closeable { +public interface EmbeddedDocumentBytesHandler extends Closeable { //we need metadata for the emitter store...can we get away without it? 
void add(int id, Metadata metadata, InputStream inputStream) throws IOException; - InputStream getDocument(int id) throws IOException; - List<Integer> getIds(); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index 99a3f3921..d6e2c28a8 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable { context.set(Parser.class, new AutoDetectParser(tikaConfig)); } } - EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context, 0); + EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context); context.set(EmbeddedDocumentExtractor.class, ex); return ex; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 97cf5b57f..8391624a3 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -22,12 +22,8 @@ import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; import org.apache.commons.io.input.CloseShieldInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -35,7 +31,6 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.BoundedInputStream; 
import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -55,9 +50,6 @@ import org.apache.tika.sax.EmbeddedContentHandler; */ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { - private static final Logger LOGGER = - LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); - private static final File ABSTRACT_PATH = new File(""); private static final Parser DELEGATING_PARSER = new DelegatingParser(); @@ -66,14 +58,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private final ParseContext context; - private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; - - private long bytesExtracted = 0; - private final long maxEmbeddedBytesForExtraction; - - public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { + public ParsingEmbeddedDocumentExtractor(ParseContext context) { this.context = context; - this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; } public boolean shouldParseEmbedded(Metadata metadata) { @@ -113,19 +99,15 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract // Use the delegate parser to parse this entry try (TemporaryResources tmp = new TemporaryResources()) { final TikaInputStream newStream = - TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata); + TikaInputStream.get(new CloseShieldInputStream(stream), tmp, metadata); if (stream instanceof TikaInputStream) { final Object container = ((TikaInputStream) stream).getOpenContainer(); if (container != null) { newStream.setOpenContainer(container); } } - EmbeddedDocumentByteStore store = context.get(EmbeddedDocumentByteStore.class); - if (store != null) { - parseWithBytes(newStream, handler, metadata); - } else { - parse(newStream, handler, metadata); - } + DELEGATING_PARSER.parse(newStream, new 
EmbeddedContentHandler(new BodyContentHandler(handler)), + metadata, context); } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { @@ -141,65 +123,6 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract } } - private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata) - throws TikaException, IOException, SAXException { - //TODO -- improve the efficiency of this so that we're not - //literally writing out a file per request - Path p = stream.getPath(); - try { - parse(stream, handler, metadata); - } finally { - storeEmbeddedBytes(p, metadata); - } - } - - private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata) - throws TikaException, IOException, SAXException { - DELEGATING_PARSER.parse(stream, - new EmbeddedContentHandler(new BodyContentHandler(handler)), - metadata, context); - } - - private void storeEmbeddedBytes(Path p, Metadata metadata) { - if (! 
embeddedBytesSelector.select(metadata)) { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("skipping embedded bytes {} <-> {}", - metadata.get(Metadata.CONTENT_TYPE), - metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); - } - return; - } - EmbeddedDocumentByteStore embeddedDocumentByteStore = - context.get(EmbeddedDocumentByteStore.class); - int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); - try (InputStream is = Files.newInputStream(p)) { - if (bytesExtracted >= maxEmbeddedBytesForExtraction) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")"); - } - long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted; - - try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) { - embeddedDocumentByteStore.add(id, metadata, boundedIs); - bytesExtracted += boundedIs.getPos(); - if (boundedIs.hasHitBound()) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " + - "bytes"); - } - } - } catch (IOException e) { - LOGGER.warn("problem writing out embedded bytes", e); - //info in metadata doesn't actually make it back to the metadata list - //because we're filtering and cloning the metadata at the end of the parse - //which happens before we try to copy out the files. 
- //TODO fix this - //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, - // ExceptionUtils.getStackTrace(e)); - } - } - - private void recordException(Exception e, ParseContext context) { ParseRecord record = context.get(ParseRecord.class); if (record == null) { @@ -215,12 +138,4 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; } - - public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) { - this.embeddedBytesSelector = embeddedBytesSelector; - } - - public EmbeddedBytesSelector getEmbeddedBytesSelector() { - return embeddedBytesSelector; - } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index fd8cf54b1..9136228c4 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -16,93 +16,25 @@ */ package org.apache.tika.extractor; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { +public class ParsingEmbeddedDocumentExtractorFactory + implements EmbeddedDocumentExtractorFactory { private boolean writeFileNameToContent = true; - private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET; - private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET; - private Set<String> embeddedBytesIncludeEmbeddedResourceTypes 
= Collections.EMPTY_SET; - private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; - private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 1024l;//10GB @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; } - @Field - public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) { - embeddedBytesIncludeMimeTypes = new HashSet<>(); - embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes); - } - - @Field - public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) { - embeddedBytesExcludeMimeTypes = new HashSet<>(); - embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes); - - } - - @Field - public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) { - embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); - embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes); - - } - - @Field - public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) { - embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); - embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes); - - } - - /** - * Total number of bytes to write out. A good zip bomb may contain petabytes - * compressed into a few kb. Make sure that you can't fill up a disk! - * - * This does not include the container file in the count of bytes written out. - * This only counts the lengths of the embedded files. 
- * - * @param maxEmbeddedBytesForExtraction - */ - @Field - public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException { - if (maxEmbeddedBytesForExtraction < 0) { - throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0"); - } - this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; - } - @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction); + new ParsingEmbeddedDocumentExtractor(parseContext); ex.setWriteFileNameToContent(writeFileNameToContent); - ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; } - - - private EmbeddedBytesSelector createEmbeddedBytesSelector() { - if (embeddedBytesIncludeMimeTypes.size() == 0 && - embeddedBytesExcludeMimeTypes.size() == 0 && - embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 && - embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { - return EmbeddedBytesSelector.ACCEPT_ALL; - } - return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes, - embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes, - embeddedBytesExcludeEmbeddedResourceTypes); - } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java similarity index 92% copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java copy to tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 97cf5b57f..4c69d0997 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -48,12 +48,11 @@ import org.apache.tika.sax.BodyContentHandler; import 
org.apache.tika.sax.EmbeddedContentHandler; /** - * Helper class for parsers of package archives or other compound document - * formats that support embedded or attached component documents. + * Recursive Unpacker and text and metadata extractor. * - * @since Apache Tika 0.8 + * @since Apache Tika 3.0.0 */ -public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { +public class RUnpackExtractor implements EmbeddedDocumentExtractor { private static final Logger LOGGER = LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); @@ -71,7 +70,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private long bytesExtracted = 0; private final long maxEmbeddedBytesForExtraction; - public ParsingEmbeddedDocumentExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { + public RUnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction) { this.context = context; this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; } @@ -120,8 +119,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract newStream.setOpenContainer(container); } } - EmbeddedDocumentByteStore store = context.get(EmbeddedDocumentByteStore.class); - if (store != null) { + EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class); + if (bytesHandler != null) { parseWithBytes(newStream, handler, metadata); } else { parse(newStream, handler, metadata); @@ -169,8 +168,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract } return; } - EmbeddedDocumentByteStore embeddedDocumentByteStore = - context.get(EmbeddedDocumentByteStore.class); + EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler = + context.get(EmbeddedDocumentBytesHandler.class); int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); try (InputStream is = Files.newInputStream(p)) { if (bytesExtracted >= maxEmbeddedBytesForExtraction) { @@ -180,7 
+179,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted; try (BoundedInputStream boundedIs = new BoundedInputStream(maxToRead, is)) { - embeddedDocumentByteStore.add(id, metadata, boundedIs); + embeddedDocumentBytesHandler.add(id, metadata, boundedIs); bytesExtracted += boundedIs.getPos(); if (boundedIs.hasHitBound()) { throw new IOException("Bytes extracted (" + bytesExtracted + diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java similarity index 91% copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java copy to tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java index fd8cf54b1..a715ed25f 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java @@ -26,7 +26,9 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { +public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtractorFactory { + + public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l * 1024l * 1024l; private boolean writeFileNameToContent = true; private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET; @@ -34,7 +36,7 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET; private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; - private long maxEmbeddedBytesForExtraction = 10l * 
1024l * 1024l * 1024l;//10GB + private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION; @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; @@ -86,8 +88,9 @@ public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocument @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext, maxEmbeddedBytesForExtraction); + RUnpackExtractor ex = + new RUnpackExtractor(parseContext, + maxEmbeddedBytesForExtraction); ex.setWriteFileNameToContent(writeFileNameToContent); ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index d333c2e9a..86eae692a 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -28,6 +28,8 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.HttpHeaders; @@ -197,7 +199,6 @@ public class AutoDetectParser extends CompositeParser { createSecureContentHandler(handler, tis, autoDetectParserConfig) : null; initializeEmbeddedDocumentExtractor(metadata, context); - try { // Parse the document super.parse(tis, sch, metadata, context); @@ -267,8 +268,12 @@ public class AutoDetectParser extends CompositeParser { if (p == 
null) { context.set(Parser.class, this); } - EmbeddedDocumentExtractor edx = autoDetectParserConfig.getEmbeddedDocumentExtractorFactory() - .newInstance(metadata, context); + EmbeddedDocumentExtractorFactory edxf = + autoDetectParserConfig.getEmbeddedDocumentExtractorFactory(); + if (edxf == null) { + edxf = new ParsingEmbeddedDocumentExtractorFactory(); + } + EmbeddedDocumentExtractor edx = edxf.newInstance(metadata, context); context.set(EmbeddedDocumentExtractor.class, edx); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index bc4904367..afe65b07e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -25,7 +25,6 @@ import org.xml.sax.ContentHandler; import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.sax.ContentHandlerDecoratorFactory; @@ -87,8 +86,7 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { private MetadataWriteFilterFactory metadataWriteFilterFactory = null; - private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory = - new ParsingEmbeddedDocumentExtractorFactory(); + private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory = null; private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory = NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 5cc22d378..d8957368d 100644 --- 
a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -45,9 +45,14 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.BasicEmbeddedDocumentByteStore; +import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.extractor.DocumentSelector; -import org.apache.tika.extractor.EmbeddedDocumentByteStore; +import org.apache.tika.extractor.EmbeddedDocumentByteStoreExtractorFactory; +import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; +import org.apache.tika.extractor.RUnpackExtractor; +import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -64,7 +69,7 @@ import org.apache.tika.pipes.emitter.Emitter; import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.emitter.StreamEmitter; import org.apache.tika.pipes.emitter.TikaEmitterException; -import org.apache.tika.pipes.extractor.EmbeddedDocumentEmitterStore; +import org.apache.tika.pipes.extractor.EmittingEmbeddedDocumentBytesHandler; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; @@ -381,9 +386,9 @@ public class PipesServer implements Runnable { emitParseData(t, parseData); } finally { if (parseData != null && parseData.hasEmbeddedDocumentByteStore() && - parseData.getEmbeddedDocumentByteStore() instanceof Closeable) { + parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) { try { - ((Closeable) 
parseData.getEmbeddedDocumentByteStore()).close(); + ((Closeable) parseData.getEmbeddedDocumentBytesHandler()).close(); } catch (IOException e) { LOG.warn("problem closing embedded document byte store", e); } @@ -536,7 +541,7 @@ public class PipesServer implements Runnable { } return new MetadataListAndEmbeddedBytes(metadataList, - parseContext.get(EmbeddedDocumentByteStore.class)); + parseContext.get(EmbeddedDocumentBytesHandler.class)); } private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple) @@ -545,14 +550,28 @@ public class PipesServer implements Runnable { if (! fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) { return parseContext; } - - //TODO: clean this up. + EmbeddedDocumentExtractorFactory factory = ((AutoDetectParser)autoDetectParser) + .getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory(); + if (factory == null) { + parseContext.set(EmbeddedDocumentExtractor.class, new RUnpackExtractor(parseContext, + RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION)); + } else { + if (! (factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) { + throw new TikaConfigException("EmbeddedDocumentExtractorFactory must be an " + + "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" + + "to extract embedded bytes! I see this embedded doc factory: " + + factory.getClass() + "and a request: " + + fetchEmitTuple.getEmbeddedDocumentBytesConfig()); + } + } + //TODO: especially clean this up. 
if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) { - parseContext.set(EmbeddedDocumentByteStore.class, - new EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(), + parseContext.set(EmbeddedDocumentBytesHandler.class, + new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(), fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager)); } else { - parseContext.set(EmbeddedDocumentByteStore.class, new BasicEmbeddedDocumentByteStore( + parseContext.set(EmbeddedDocumentBytesHandler.class, + new BasicEmbeddedDocumentBytesHandler( fetchEmitTuple.getEmbeddedDocumentBytesConfig())); } return parseContext; @@ -677,8 +696,8 @@ public class PipesServer implements Runnable { if (t.getEmbeddedDocumentBytesConfig() != null && t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) { - EmbeddedDocumentByteStore embeddedDocumentByteStore = - parseContext.get(EmbeddedDocumentByteStore.class); + EmbeddedDocumentBytesHandler embeddedDocumentByteStore = + parseContext.get(EmbeddedDocumentBytesHandler.class); try (InputStream is = Files.newInputStream(tis.getPath())) { embeddedDocumentByteStore.add(0, metadata, is); } catch (IOException e) { @@ -747,6 +766,14 @@ public class PipesServer implements Runnable { //override this value because we'll be digesting before parse ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory() .setSkipContainerDocument(true); + //if the user hasn't configured an embedded document extractor, set up the + // RUnpackExtractorFactory + if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig() + .getEmbeddedDocumentExtractorFactory() == null) { + ((AutoDetectParser) autoDetectParser) + .getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory( + new RUnpackExtractorFactory()); + } } this.detector = ((AutoDetectParser) this.autoDetectParser).getDetector(); this.rMetaParser = new RecursiveParserWrapper(autoDetectParser); @@ -809,20 +836,20 @@ public 
class PipesServer implements Runnable { class MetadataListAndEmbeddedBytes { final List<Metadata> metadataList; - final Optional<EmbeddedDocumentByteStore> embeddedDocumentByteStore; + final Optional<EmbeddedDocumentBytesHandler> embeddedDocumentBytesHandler; public MetadataListAndEmbeddedBytes(List<Metadata> metadataList, - EmbeddedDocumentByteStore embeddedDocumentByteStore) { + EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) { this.metadataList = metadataList; - this.embeddedDocumentByteStore = Optional.ofNullable(embeddedDocumentByteStore); + this.embeddedDocumentBytesHandler = Optional.ofNullable(embeddedDocumentBytesHandler); } public List<Metadata> getMetadataList() { return metadataList; } - public EmbeddedDocumentByteStore getEmbeddedDocumentByteStore() { - return embeddedDocumentByteStore.get(); + public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() { + return embeddedDocumentBytesHandler.get(); } /** @@ -832,7 +859,7 @@ public class PipesServer implements Runnable { * @return */ public boolean hasEmbeddedDocumentByteStore() { - return embeddedDocumentByteStore.isPresent(); + return embeddedDocumentBytesHandler.isPresent(); } /** @@ -844,7 +871,7 @@ public class PipesServer implements Runnable { * @return */ public boolean toBePackagedForStreamEmitter() { - return !(embeddedDocumentByteStore.get() instanceof EmbeddedDocumentEmitterStore); + return !(embeddedDocumentBytesHandler.get() instanceof EmittingEmbeddedDocumentBytesHandler); } } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java index 66b7321ac..071de05c4 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java @@ -55,6 +55,15 @@ public class EmbeddedDocumentBytesConfig implements 
Serializable { private boolean includeOriginal = false; + /** + * Create an EmbeddedDocumentBytesConfig with + * {@link EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes} + * set to <code>true</code> + */ + public EmbeddedDocumentBytesConfig() { + this.extractEmbeddedDocumentBytes = true; + } + public EmbeddedDocumentBytesConfig(boolean extractEmbeddedDocumentBytes) { this.extractEmbeddedDocumentBytes = extractEmbeddedDocumentBytes; } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java similarity index 83% rename from tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java rename to tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java index 5d09cfe18..1132a4bc6 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java @@ -23,7 +23,7 @@ import java.io.InputStream; import org.apache.commons.io.IOExceptionWithCause; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore; +import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.emitter.Emitter; @@ -31,15 +31,15 @@ import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.emitter.StreamEmitter; import org.apache.tika.pipes.emitter.TikaEmitterException; -public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteStore { +public class EmittingEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler { private final EmitKey containerEmitKey; private final EmbeddedDocumentBytesConfig 
embeddedDocumentBytesConfig; private final StreamEmitter emitter; private static final Metadata METADATA = new Metadata(); - public EmbeddedDocumentEmitterStore(EmitKey containerEmitKey, - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, - EmitterManager emitterManager) throws TikaConfigException { + public EmittingEmbeddedDocumentBytesHandler(EmitKey containerEmitKey, + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, + EmitterManager emitterManager) throws TikaConfigException { this.containerEmitKey = containerEmitKey; this.embeddedDocumentBytesConfig = embeddedDocumentBytesConfig; Emitter tmpEmitter = @@ -64,11 +64,6 @@ public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteSt } } - @Override - public InputStream getDocument(int id) { - throw new UnsupportedOperationException("this is emit only."); - } - @Override public void close() throws IOException { if (emitter instanceof Closeable) { diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index a0d5d4896..62b061d98 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -25,8 +25,8 @@ import org.junit.jupiter.api.Test; import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedBytesSelector; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; +import org.apache.tika.extractor.RUnpackExtractor; +import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; @@ -41,12 +41,12 @@ public class AutoDetectParserConfigTest { config = new TikaConfig(is); } AutoDetectParserConfig c = 
config.getAutoDetectParserConfig(); - ParsingEmbeddedDocumentExtractorFactory f = - (ParsingEmbeddedDocumentExtractorFactory) c.getEmbeddedDocumentExtractorFactory(); + RUnpackExtractorFactory f = + (RUnpackExtractorFactory) c.getEmbeddedDocumentExtractorFactory(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); - ParsingEmbeddedDocumentExtractor ex = (ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext); + RUnpackExtractor ex = (RUnpackExtractor) f.newInstance(metadata, parseContext); EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector(); assertFalse(selector.select(getMetadata("", ""))); assertTrue(selector.select(getMetadata("application/pdf", ""))); diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java index 6f55e5d11..6794f1a8f 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java @@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.apache.tika.TikaTest; +import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; @@ -117,9 +118,13 @@ public class PipesServerTest extends TikaTest { assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = - IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0)); + IOUtils.toByteArray( + ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + .getDocument(0)); byte[] bytes1 = - IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1)); + IOUtils.toByteArray( + ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + .getDocument(1)); assertContains("is to trigger 
mock on the embedded", new String(bytes0, StandardCharsets.UTF_8)); @@ -170,9 +175,13 @@ public class PipesServerTest extends TikaTest { assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = - IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0)); + IOUtils.toByteArray( + ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + .getDocument(0)); byte[] bytes1 = - IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1)); + IOUtils.toByteArray( + ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + .getDocument(1)); assertContains("is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8)); diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml index d60c6b1ca..5e1339a40 100644 --- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml @@ -22,7 +22,7 @@ <autoDetectParserConfig> <spoolToDisk>123450</spoolToDisk> <outputThreshold>678900</outputThreshold> - <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory"> <writeFileNameToContent>false</writeFileNameToContent> <embeddedBytesIncludeMimeTypes> <mime>application/pdf</mime> diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml index 610bad77b..5e46a09e9 100644 --- a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml +++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml @@ -20,7 +20,7 @@ <digesterFactory 
class="org.apache.tika.pipes.async.MockDigesterFactory"> <skipContainerDocument>false</skipContainerDocument> </digesterFactory> - <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory"> <writeFileNameToContent>false</writeFileNameToContent> <maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction> </embeddedDocumentExtractorFactory> diff --git a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java index 43c0b1d3a..091facc21 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java +++ b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java @@ -64,7 +64,7 @@ public class ExtractEmbeddedFiles { private int fileCount = 0; private MyEmbeddedDocumentExtractor(Path outputDir, ParseContext context) { - super(context, 1000000l); + super(context); this.outputDir = outputDir; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index bcd45460c..c95547aee 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest { List<Metadata> trackingMetadata = new ArrayList<>(); public 
EmbeddedTrackingExtrator(ParseContext context) { - super(context, 0); + super(context); } @Override diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java index 8503e8bd8..08d18b6c1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java @@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest { Map<Integer, byte[]> embedded = new HashMap<>(); public RenderCaptureExtractor(ParseContext context) { - super(context, 0); + super(context); } public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml index 0e2f26bd2..9cedc9ed4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml @@ -22,7 +22,7 @@ <autoDetectParserConfig> <spoolToDisk>123450</spoolToDisk> <outputThreshold>678900</outputThreshold> - <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory"> <writeFileNameToContent>false</writeFileNameToContent> 
</embeddedDocumentExtractorFactory> </autoDetectParserConfig> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml index f54eb9a0a..369acafc9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml @@ -22,7 +22,7 @@ <autoDetectParserConfig> <spoolToDisk>123450</spoolToDisk> <outputThreshold>678900</outputThreshold> - <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.RUnpackExtractorFactory"> <writeFileNameToContent>true</writeFileNameToContent> </embeddedDocumentExtractorFactory> </autoDetectParserConfig>