This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4207 by this push:
     new 8cdaff4b3 TIKA-4207 -- further refactorings to simplify class 
structure and bring back the default ParsingEmbeddedDocumentExtractor
8cdaff4b3 is described below

commit 8cdaff4b3e2a4a477f753f3bfca751d804721a9d
Author: tallison <talli...@apache.org>
AuthorDate: Thu Mar 28 07:11:46 2024 -0400

    TIKA-4207 -- further refactorings to simplify class structure and bring 
back the default ParsingEmbeddedDocumentExtractor
---
 ...a => AbstractEmbeddedDocumentBytesHandler.java} |  2 +-
 ...java => BasicEmbeddedDocumentBytesHandler.java} | 12 ++-
 ...EmbeddedDocumentByteStoreExtractorFactory.java} | 24 +++---
 ...tore.java => EmbeddedDocumentBytesHandler.java} |  4 +-
 .../tika/extractor/EmbeddedDocumentUtil.java       |  2 +-
 .../ParsingEmbeddedDocumentExtractor.java          | 93 +---------------------
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 74 +----------------
 ...ocumentExtractor.java => RUnpackExtractor.java} | 19 +++--
 ...orFactory.java => RUnpackExtractorFactory.java} | 11 ++-
 .../org/apache/tika/parser/AutoDetectParser.java   | 11 ++-
 .../apache/tika/parser/AutoDetectParserConfig.java |  4 +-
 .../java/org/apache/tika/pipes/PipesServer.java    | 67 +++++++++++-----
 .../extractor/EmbeddedDocumentBytesConfig.java     |  9 +++
 ...a => EmittingEmbeddedDocumentBytesHandler.java} | 15 ++--
 .../tika/parser/AutoDetectParserConfigTest.java    | 10 +--
 .../org/apache/tika/pipes/PipesServerTest.java     | 17 +++-
 .../config/TIKA-4207-embedded-bytes-config.xml     |  2 +-
 .../apache/tika/pipes/TIKA-4207-limit-bytes.xml    |  2 +-
 .../apache/tika/example/ExtractEmbeddedFiles.java  |  2 +-
 .../parser/microsoft/pst/OutlookPSTParserTest.java |  2 +-
 .../apache/tika/parser/pdf/PDFRenderingTest.java   |  2 +-
 .../resources/configs/tika-config-no-names.xml     |  2 +-
 .../resources/configs/tika-config-with-names.xml   |  2 +-
 23 files changed, 142 insertions(+), 246 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
similarity index 96%
rename from 
tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
rename to 
tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 15b26451a..3f2f38f94 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -28,7 +28,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
 import org.apache.tika.utils.StringUtils;
 
-public abstract class AbstractEmbeddedDocumentByteStore implements 
EmbeddedDocumentByteStore {
+public abstract class AbstractEmbeddedDocumentBytesHandler implements 
EmbeddedDocumentBytesHandler {
 
     List<Integer> ids = new ArrayList<>();
 
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
similarity index 80%
rename from 
tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
rename to 
tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
index d3aeb4507..cf6441b4f 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
@@ -27,9 +27,16 @@ import 
org.apache.commons.io.input.UnsynchronizedBufferedInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
 
-public class BasicEmbeddedDocumentByteStore extends 
AbstractEmbeddedDocumentByteStore {
+/**
+ * For now, this EmbeddedDocumentBytesHandler stores
+ * all the bytes in memory. Users can retrieve the documents with {@link 
#getDocument(int)}.
+ *
+ * We'll need to make this cache to disk at some point if there are many bytes 
of
+ * embedded documents.
+ */
+public class BasicEmbeddedDocumentBytesHandler extends 
AbstractEmbeddedDocumentBytesHandler {
     private final EmbeddedDocumentBytesConfig config;
-    public BasicEmbeddedDocumentByteStore(EmbeddedDocumentBytesConfig config) {
+    public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig 
config) {
         this.config = config;
     }
     //this won't scale, but let's start fully in memory for now;
@@ -40,7 +47,6 @@ public class BasicEmbeddedDocumentByteStore extends 
AbstractEmbeddedDocumentByte
         docBytes.put(id, IOUtils.toByteArray(is));
     }
 
-    @Override
     public InputStream getDocument(int id) throws IOException {
         return new 
UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get();
     }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
similarity index 59%
copy from 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
copy to 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
index 8e1e8e325..f7237bd6a 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
@@ -16,18 +16,18 @@
  */
 package org.apache.tika.extractor;
 
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
 
-import org.apache.tika.metadata.Metadata;
-
-public interface EmbeddedDocumentByteStore extends Closeable {
-    //we need metadata for the emitter store...can we get away without it?
-    void add(int id, Metadata metadata, InputStream inputStream) throws 
IOException;
-
-    InputStream getDocument(int id) throws IOException;
+/**
+ * Factories that create EmbeddedDocumentExtractors that require an
+ * {@link EmbeddedDocumentBytesHandler} in the
+ * {@link org.apache.tika.parser.ParseContext} should extend this.
+ *
+ * This is a shim interface to signal to {@link 
org.apache.tika.pipes.PipesServer}
+ * to use the {@link RUnpackExtractor} if the user doesn't configure a custom
+ * EmbeddedDocumentExtractor.
+ *
+ * TODO: Figure out how to simplify this and allow for emitting of the source 
document.
+ */
+public interface EmbeddedDocumentByteStoreExtractorFactory extends 
EmbeddedDocumentExtractorFactory {
 
-    List<Integer> getIds();
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
similarity index 90%
rename from 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
rename to 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
index 8e1e8e325..12357a718 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
@@ -23,11 +23,9 @@ import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
 
-public interface EmbeddedDocumentByteStore extends Closeable {
+public interface EmbeddedDocumentBytesHandler extends Closeable {
     //we need metadata for the emitter store...can we get away without it?
     void add(int id, Metadata metadata, InputStream inputStream) throws 
IOException;
 
-    InputStream getDocument(int id) throws IOException;
-
     List<Integer> getIds();
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 99a3f3921..d6e2c28a8 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable {
                 context.set(Parser.class, new AutoDetectParser(tikaConfig));
             }
         }
-        EmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(context, 0);
+        EmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(context);
         context.set(EmbeddedDocumentExtractor.class, ex);
         return ex;
     }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 97cf5b57f..8391624a3 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -22,12 +22,8 @@ import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -35,7 +31,6 @@ import org.xml.sax.helpers.AttributesImpl;
 import org.apache.tika.exception.CorruptedFileException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -55,9 +50,6 @@ import org.apache.tika.sax.EmbeddedContentHandler;
  */
 public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {
 
-    private static final Logger LOGGER =
-            LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
-
     private static final File ABSTRACT_PATH = new File("");
 
     private static final Parser DELEGATING_PARSER = new DelegatingParser();
@@ -66,14 +58,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
 
     private final ParseContext context;
 
-    private EmbeddedBytesSelector embeddedBytesSelector = 
EmbeddedBytesSelector.ACCEPT_ALL;
-
-    private long bytesExtracted = 0;
-    private final long maxEmbeddedBytesForExtraction;
-
-    public ParsingEmbeddedDocumentExtractor(ParseContext context, long 
maxEmbeddedBytesForExtraction) {
+    public ParsingEmbeddedDocumentExtractor(ParseContext context) {
         this.context = context;
-        this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
     }
 
     public boolean shouldParseEmbedded(Metadata metadata) {
@@ -113,19 +99,15 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
         // Use the delegate parser to parse this entry
         try (TemporaryResources tmp = new TemporaryResources()) {
             final TikaInputStream newStream =
-                    TikaInputStream.get(CloseShieldInputStream.wrap(stream), 
tmp, metadata);
+                    TikaInputStream.get(new CloseShieldInputStream(stream), 
tmp, metadata);
             if (stream instanceof TikaInputStream) {
                 final Object container = ((TikaInputStream) 
stream).getOpenContainer();
                 if (container != null) {
                     newStream.setOpenContainer(container);
                 }
             }
-            EmbeddedDocumentByteStore store = 
context.get(EmbeddedDocumentByteStore.class);
-            if (store != null) {
-                parseWithBytes(newStream, handler, metadata);
-            } else {
-                parse(newStream, handler, metadata);
-            }
+            DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new 
BodyContentHandler(handler)),
+                    metadata, context);
         } catch (EncryptedDocumentException ede) {
             recordException(ede, context);
         } catch (CorruptedFileException e) {
@@ -141,65 +123,6 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
         }
     }
 
-    private void parseWithBytes(TikaInputStream stream, ContentHandler 
handler, Metadata metadata)
-            throws TikaException, IOException, SAXException {
-        //TODO -- improve the efficiency of this so that we're not
-        //literally writing out a file per request
-        Path p = stream.getPath();
-        try {
-            parse(stream, handler, metadata);
-        } finally {
-            storeEmbeddedBytes(p, metadata);
-        }
-    }
-
-    private void parse(TikaInputStream stream, ContentHandler handler, 
Metadata metadata)
-            throws TikaException, IOException, SAXException {
-        DELEGATING_PARSER.parse(stream,
-                new EmbeddedContentHandler(new BodyContentHandler(handler)),
-                metadata, context);
-    }
-
-    private void storeEmbeddedBytes(Path p, Metadata metadata) {
-        if (! embeddedBytesSelector.select(metadata)) {
-            if (LOGGER.isDebugEnabled()) {
-                LOGGER.debug("skipping embedded bytes {} <-> {}",
-                        metadata.get(Metadata.CONTENT_TYPE),
-                        
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
-            }
-            return;
-        }
-        EmbeddedDocumentByteStore embeddedDocumentByteStore =
-                context.get(EmbeddedDocumentByteStore.class);
-        int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
-        try (InputStream is = Files.newInputStream(p)) {
-            if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
-                throw new IOException("Bytes extracted (" + bytesExtracted +
-                        ") >= max allowed (" + maxEmbeddedBytesForExtraction + 
")");
-            }
-            long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
-
-            try (BoundedInputStream boundedIs = new 
BoundedInputStream(maxToRead, is)) {
-                embeddedDocumentByteStore.add(id, metadata, boundedIs);
-                bytesExtracted += boundedIs.getPos();
-                if (boundedIs.hasHitBound()) {
-                    throw new IOException("Bytes extracted (" + bytesExtracted 
+
-                            ") >= max allowed (" + 
maxEmbeddedBytesForExtraction + "). Truncated " +
-                            "bytes");
-                }
-            }
-        } catch (IOException e) {
-            LOGGER.warn("problem writing out embedded bytes", e);
-            //info in metadata doesn't actually make it back to the metadata 
list
-            //because we're filtering and cloning the metadata at the end of 
the parse
-            //which happens before we try to copy out the files.
-            //TODO fix this
-            //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
-              //      ExceptionUtils.getStackTrace(e));
-        }
-    }
-
-
     private void recordException(Exception e, ParseContext context) {
         ParseRecord record = context.get(ParseRecord.class);
         if (record == null) {
@@ -215,12 +138,4 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
-
-    public void setEmbeddedBytesSelector(EmbeddedBytesSelector 
embeddedBytesSelector) {
-        this.embeddedBytesSelector = embeddedBytesSelector;
-    }
-
-    public EmbeddedBytesSelector getEmbeddedBytesSelector() {
-        return embeddedBytesSelector;
-    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index fd8cf54b1..9136228c4 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -16,93 +16,25 @@
  */
 package org.apache.tika.extractor;
 
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
 import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory implements 
EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory
+        implements EmbeddedDocumentExtractorFactory {
 
     private boolean writeFileNameToContent = true;
-    private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET;
-    private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET;
-    private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
-    private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
 
-    private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 
1024l;//10GB
     @Field
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
 
-    @Field
-    public void setEmbeddedBytesIncludeMimeTypes(List<String> 
includeMimeTypes) {
-        embeddedBytesIncludeMimeTypes = new HashSet<>();
-        embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes);
-    }
-
-    @Field
-    public void setEmbeddedBytesExcludeMimeTypes(List<String> 
excludeMimeTypes) {
-        embeddedBytesExcludeMimeTypes = new HashSet<>();
-        embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes);
-
-    }
-
-    @Field
-    public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> 
includeAttachmentTypes) {
-        embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>();
-        
embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes);
-
-    }
-
-    @Field
-    public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> 
excludeAttachmentTypes) {
-        embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>();
-        
embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes);
-
-    }
-
-    /**
-     * Total number of bytes to write out. A good zip bomb may contain 
petabytes
-     * compressed into a few kb. Make sure that you can't fill up a disk!
-     *
-     * This does not include the container file in the count of bytes written 
out.
-     * This only counts the lengths of the embedded files.
-     *
-     * @param maxEmbeddedBytesForExtraction
-     */
-    @Field
-    public void setMaxEmbeddedBytesForExtraction(long 
maxEmbeddedBytesForExtraction) throws TikaConfigException {
-        if (maxEmbeddedBytesForExtraction < 0) {
-            throw new TikaConfigException("maxEmbeddedBytesForExtraction must 
be >= 0");
-        }
-        this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
-    }
-
     @Override
     public EmbeddedDocumentExtractor newInstance(Metadata metadata, 
ParseContext parseContext) {
         ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext, 
maxEmbeddedBytesForExtraction);
+                new ParsingEmbeddedDocumentExtractor(parseContext);
         ex.setWriteFileNameToContent(writeFileNameToContent);
-        ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
         return ex;
     }
-
-
-    private EmbeddedBytesSelector createEmbeddedBytesSelector() {
-        if (embeddedBytesIncludeMimeTypes.size() == 0 &&
-                embeddedBytesExcludeMimeTypes.size() == 0 &&
-                embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 &&
-                embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) {
-            return EmbeddedBytesSelector.ACCEPT_ALL;
-        }
-        return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
-                embeddedBytesExcludeMimeTypes, 
embeddedBytesIncludeEmbeddedResourceTypes,
-                embeddedBytesExcludeEmbeddedResourceTypes);
-    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
 b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
similarity index 92%
copy from 
tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
copy to tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 97cf5b57f..4c69d0997 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -48,12 +48,11 @@ import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 
 /**
- * Helper class for parsers of package archives or other compound document
- * formats that support embedded or attached component documents.
+ * Recursive unpacker that also extracts text and metadata.
  *
- * @since Apache Tika 0.8
+ * @since Apache Tika 3.0.0
  */
-public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {
+public class RUnpackExtractor implements EmbeddedDocumentExtractor {
 
     private static final Logger LOGGER =
             LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
@@ -71,7 +70,7 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
     private long bytesExtracted = 0;
     private final long maxEmbeddedBytesForExtraction;
 
-    public ParsingEmbeddedDocumentExtractor(ParseContext context, long 
maxEmbeddedBytesForExtraction) {
+    public RUnpackExtractor(ParseContext context, long 
maxEmbeddedBytesForExtraction) {
         this.context = context;
         this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
     }
@@ -120,8 +119,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
                     newStream.setOpenContainer(container);
                 }
             }
-            EmbeddedDocumentByteStore store = 
context.get(EmbeddedDocumentByteStore.class);
-            if (store != null) {
+            EmbeddedDocumentBytesHandler bytesHandler = 
context.get(EmbeddedDocumentBytesHandler.class);
+            if (bytesHandler != null) {
                 parseWithBytes(newStream, handler, metadata);
             } else {
                 parse(newStream, handler, metadata);
@@ -169,8 +168,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
             }
             return;
         }
-        EmbeddedDocumentByteStore embeddedDocumentByteStore =
-                context.get(EmbeddedDocumentByteStore.class);
+        EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler =
+                context.get(EmbeddedDocumentBytesHandler.class);
         int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
         try (InputStream is = Files.newInputStream(p)) {
             if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
@@ -180,7 +179,7 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
             long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
 
             try (BoundedInputStream boundedIs = new 
BoundedInputStream(maxToRead, is)) {
-                embeddedDocumentByteStore.add(id, metadata, boundedIs);
+                embeddedDocumentBytesHandler.add(id, metadata, boundedIs);
                 bytesExtracted += boundedIs.getPos();
                 if (boundedIs.hasHitBound()) {
                     throw new IOException("Bytes extracted (" + bytesExtracted 
+
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
similarity index 91%
copy from 
tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
copy to 
tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
index fd8cf54b1..a715ed25f 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
@@ -26,7 +26,9 @@ import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory implements 
EmbeddedDocumentExtractorFactory {
+public class RUnpackExtractorFactory implements 
EmbeddedDocumentByteStoreExtractorFactory {
+
+    public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l 
* 1024l * 1024l;
 
     private boolean writeFileNameToContent = true;
     private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET;
@@ -34,7 +36,7 @@ public class ParsingEmbeddedDocumentExtractorFactory 
implements EmbeddedDocument
     private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
     private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
 
-    private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 
1024l;//10GB
+    private long maxEmbeddedBytesForExtraction = 
DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION;
     @Field
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
@@ -86,8 +88,9 @@ public class ParsingEmbeddedDocumentExtractorFactory 
implements EmbeddedDocument
 
     @Override
     public EmbeddedDocumentExtractor newInstance(Metadata metadata, 
ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext, 
maxEmbeddedBytesForExtraction);
+        RUnpackExtractor ex =
+                new RUnpackExtractor(parseContext,
+                        maxEmbeddedBytesForExtraction);
         ex.setWriteFileNameToContent(writeFileNameToContent);
         ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
         return ex;
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index d333c2e9a..86eae692a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -28,6 +28,8 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.HttpHeaders;
@@ -197,7 +199,6 @@ public class AutoDetectParser extends CompositeParser {
                     createSecureContentHandler(handler, tis, 
autoDetectParserConfig) : null;
 
             initializeEmbeddedDocumentExtractor(metadata, context);
-
             try {
                 // Parse the document
                 super.parse(tis, sch, metadata, context);
@@ -267,8 +268,12 @@ public class AutoDetectParser extends CompositeParser {
         if (p == null) {
             context.set(Parser.class, this);
         }
-        EmbeddedDocumentExtractor edx = 
autoDetectParserConfig.getEmbeddedDocumentExtractorFactory()
-                .newInstance(metadata, context);
+        EmbeddedDocumentExtractorFactory edxf =
+                autoDetectParserConfig.getEmbeddedDocumentExtractorFactory();
+        if (edxf == null) {
+            edxf = new ParsingEmbeddedDocumentExtractorFactory();
+        }
+        EmbeddedDocumentExtractor edx = edxf.newInstance(metadata, context);
         context.set(EmbeddedDocumentExtractor.class, edx);
     }
 
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java 
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index bc4904367..afe65b07e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -25,7 +25,6 @@ import org.xml.sax.ContentHandler;
 import org.apache.tika.config.ConfigBase;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
 import org.apache.tika.sax.ContentHandlerDecoratorFactory;
@@ -87,8 +86,7 @@ public class AutoDetectParserConfig extends ConfigBase 
implements Serializable {
 
     private MetadataWriteFilterFactory metadataWriteFilterFactory = null;
 
-    private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory =
-            new ParsingEmbeddedDocumentExtractorFactory();
+    private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory 
= null;
 
     private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory =
             NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java 
b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 5cc22d378..d8957368d 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -45,9 +45,14 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.BasicEmbeddedDocumentByteStore;
+import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler;
 import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.extractor.EmbeddedDocumentByteStore;
+import org.apache.tika.extractor.EmbeddedDocumentByteStoreExtractorFactory;
+import org.apache.tika.extractor.EmbeddedDocumentBytesHandler;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.RUnpackExtractor;
+import org.apache.tika.extractor.RUnpackExtractorFactory;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -64,7 +69,7 @@ import org.apache.tika.pipes.emitter.Emitter;
 import org.apache.tika.pipes.emitter.EmitterManager;
 import org.apache.tika.pipes.emitter.StreamEmitter;
 import org.apache.tika.pipes.emitter.TikaEmitterException;
-import org.apache.tika.pipes.extractor.EmbeddedDocumentEmitterStore;
+import org.apache.tika.pipes.extractor.EmittingEmbeddedDocumentBytesHandler;
 import org.apache.tika.pipes.fetcher.FetchKey;
 import org.apache.tika.pipes.fetcher.Fetcher;
 import org.apache.tika.pipes.fetcher.FetcherManager;
@@ -381,9 +386,9 @@ public class PipesServer implements Runnable {
             emitParseData(t, parseData);
         } finally {
             if (parseData != null && parseData.hasEmbeddedDocumentByteStore() 
&&
-                    parseData.getEmbeddedDocumentByteStore() instanceof 
Closeable) {
+                    parseData.getEmbeddedDocumentBytesHandler() instanceof 
Closeable) {
                 try {
-                    ((Closeable) 
parseData.getEmbeddedDocumentByteStore()).close();
+                    ((Closeable) 
parseData.getEmbeddedDocumentBytesHandler()).close();
                 } catch (IOException e) {
                     LOG.warn("problem closing embedded document byte store", 
e);
                 }
@@ -536,7 +541,7 @@ public class PipesServer implements Runnable {
         }
 
         return new MetadataListAndEmbeddedBytes(metadataList,
-                parseContext.get(EmbeddedDocumentByteStore.class));
+                parseContext.get(EmbeddedDocumentBytesHandler.class));
     }
 
     private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple)
@@ -545,14 +550,28 @@ public class PipesServer implements Runnable {
         if (! 
fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes())
 {
             return parseContext;
         }
-
-        //TODO: clean this up.
+        EmbeddedDocumentExtractorFactory factory = 
((AutoDetectParser)autoDetectParser)
+                
.getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory();
+        if (factory == null) {
+            parseContext.set(EmbeddedDocumentExtractor.class, new 
RUnpackExtractor(parseContext,
+                    
RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION));
+        } else {
+            if (! (factory instanceof 
EmbeddedDocumentByteStoreExtractorFactory)) {
+                throw new 
TikaConfigException("EmbeddedDocumentExtractorFactory must be an " +
+                        "instance of EmbeddedDocumentByteStoreExtractorFactory 
if you want " +
+                        "to extract embedded bytes! I see this embedded doc 
factory: " +
+                        factory.getClass() + " and a request: " +
+                        fetchEmitTuple.getEmbeddedDocumentBytesConfig());
+            }
+        }
+        //TODO: especially clean this up.
         if 
(!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter()))
 {
-            parseContext.set(EmbeddedDocumentByteStore.class,
-                    new 
EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(),
+            parseContext.set(EmbeddedDocumentBytesHandler.class,
+                    new 
EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(),
                             fetchEmitTuple.getEmbeddedDocumentBytesConfig(), 
emitterManager));
         } else {
-            parseContext.set(EmbeddedDocumentByteStore.class, new 
BasicEmbeddedDocumentByteStore(
+            parseContext.set(EmbeddedDocumentBytesHandler.class,
+                    new BasicEmbeddedDocumentBytesHandler(
                     fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
         }
         return parseContext;
@@ -677,8 +696,8 @@ public class PipesServer implements Runnable {
 
         if (t.getEmbeddedDocumentBytesConfig() != null &&
                 t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
-            EmbeddedDocumentByteStore embeddedDocumentByteStore =
-                    parseContext.get(EmbeddedDocumentByteStore.class);
+            EmbeddedDocumentBytesHandler embeddedDocumentByteStore =
+                    parseContext.get(EmbeddedDocumentBytesHandler.class);
             try (InputStream is = Files.newInputStream(tis.getPath())) {
                 embeddedDocumentByteStore.add(0, metadata, is);
             } catch (IOException e) {
@@ -747,6 +766,14 @@ public class PipesServer implements Runnable {
             //override this value because we'll be digesting before parse
             ((AutoDetectParser) 
autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
                     .setSkipContainerDocument(true);
+            //if the user hasn't configured an embedded document extractor, 
set up the
+            // RUnpackExtractorFactory
+            if (((AutoDetectParser) 
autoDetectParser).getAutoDetectParserConfig()
+                    .getEmbeddedDocumentExtractorFactory() == null) {
+                ((AutoDetectParser) autoDetectParser)
+                        
.getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory(
+                                new RUnpackExtractorFactory());
+            }
         }
         this.detector = ((AutoDetectParser) 
this.autoDetectParser).getDetector();
         this.rMetaParser = new RecursiveParserWrapper(autoDetectParser);
@@ -809,20 +836,20 @@ public class PipesServer implements Runnable {
 
     class MetadataListAndEmbeddedBytes {
         final List<Metadata> metadataList;
-        final Optional<EmbeddedDocumentByteStore> embeddedDocumentByteStore;
+        final Optional<EmbeddedDocumentBytesHandler> 
embeddedDocumentBytesHandler;
 
         public MetadataListAndEmbeddedBytes(List<Metadata> metadataList,
-                                            EmbeddedDocumentByteStore 
embeddedDocumentByteStore) {
+                                            EmbeddedDocumentBytesHandler 
embeddedDocumentBytesHandler) {
             this.metadataList = metadataList;
-            this.embeddedDocumentByteStore = 
Optional.ofNullable(embeddedDocumentByteStore);
+            this.embeddedDocumentBytesHandler = 
Optional.ofNullable(embeddedDocumentBytesHandler);
         }
 
         public List<Metadata> getMetadataList() {
             return metadataList;
         }
 
-        public EmbeddedDocumentByteStore getEmbeddedDocumentByteStore() {
-            return embeddedDocumentByteStore.get();
+        public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() {
+            return embeddedDocumentBytesHandler.get();
         }
 
         /**
@@ -832,7 +859,7 @@ public class PipesServer implements Runnable {
          * @return
          */
         public boolean hasEmbeddedDocumentByteStore() {
-            return embeddedDocumentByteStore.isPresent();
+            return embeddedDocumentBytesHandler.isPresent();
         }
 
         /**
@@ -844,7 +871,7 @@ public class PipesServer implements Runnable {
          * @return
          */
         public boolean toBePackagedForStreamEmitter() {
-            return !(embeddedDocumentByteStore.get() instanceof 
EmbeddedDocumentEmitterStore);
+            return !(embeddedDocumentBytesHandler.get() instanceof 
EmittingEmbeddedDocumentBytesHandler);
         }
     }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index 66b7321ac..071de05c4 100644
--- 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -55,6 +55,15 @@ public class EmbeddedDocumentBytesConfig implements 
Serializable {
 
     private boolean includeOriginal = false;
 
+    /**
+     * Create an EmbeddedDocumentBytesConfig with
+     * {@link EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes}
+     * set to <code>true</code>
+     */
+    public EmbeddedDocumentBytesConfig() {
+        this.extractEmbeddedDocumentBytes = true;
+    }
+
     public EmbeddedDocumentBytesConfig(boolean extractEmbeddedDocumentBytes) {
         this.extractEmbeddedDocumentBytes = extractEmbeddedDocumentBytes;
     }
diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
similarity index 83%
rename from 
tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
rename to 
tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
index 5d09cfe18..1132a4bc6 100644
--- 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
@@ -23,7 +23,7 @@ import java.io.InputStream;
 import org.apache.commons.io.IOExceptionWithCause;
 
 import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore;
+import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.emitter.EmitKey;
 import org.apache.tika.pipes.emitter.Emitter;
@@ -31,15 +31,15 @@ import org.apache.tika.pipes.emitter.EmitterManager;
 import org.apache.tika.pipes.emitter.StreamEmitter;
 import org.apache.tika.pipes.emitter.TikaEmitterException;
 
-public class EmbeddedDocumentEmitterStore extends 
AbstractEmbeddedDocumentByteStore {
+public class EmittingEmbeddedDocumentBytesHandler extends 
AbstractEmbeddedDocumentBytesHandler {
     private final EmitKey containerEmitKey;
     private final EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig;
     private final StreamEmitter emitter;
 
     private static final Metadata METADATA = new Metadata();
-    public EmbeddedDocumentEmitterStore(EmitKey containerEmitKey,
-                                        EmbeddedDocumentBytesConfig 
embeddedDocumentBytesConfig,
-                                        EmitterManager emitterManager) throws 
TikaConfigException {
+    public EmittingEmbeddedDocumentBytesHandler(EmitKey containerEmitKey,
+                                                EmbeddedDocumentBytesConfig 
embeddedDocumentBytesConfig,
+                                                EmitterManager emitterManager) 
throws TikaConfigException {
         this.containerEmitKey = containerEmitKey;
         this.embeddedDocumentBytesConfig = embeddedDocumentBytesConfig;
         Emitter tmpEmitter =
@@ -64,11 +64,6 @@ public class EmbeddedDocumentEmitterStore extends 
AbstractEmbeddedDocumentByteSt
         }
     }
 
-    @Override
-    public InputStream getDocument(int id) {
-        throw new UnsupportedOperationException("this is emit only.");
-    }
-
     @Override
     public void close() throws IOException {
         if (emitter instanceof Closeable) {
diff --git 
a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 
b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index a0d5d4896..62b061d98 100644
--- 
a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ 
b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -25,8 +25,8 @@ import org.junit.jupiter.api.Test;
 
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedBytesSelector;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.RUnpackExtractor;
+import org.apache.tika.extractor.RUnpackExtractorFactory;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.utils.StringUtils;
@@ -41,12 +41,12 @@ public class AutoDetectParserConfigTest {
             config = new TikaConfig(is);
         }
         AutoDetectParserConfig c = config.getAutoDetectParserConfig();
-        ParsingEmbeddedDocumentExtractorFactory f =
-                (ParsingEmbeddedDocumentExtractorFactory) 
c.getEmbeddedDocumentExtractorFactory();
+        RUnpackExtractorFactory f =
+                (RUnpackExtractorFactory) 
c.getEmbeddedDocumentExtractorFactory();
 
         Metadata metadata = new Metadata();
         ParseContext parseContext = new ParseContext();
-        ParsingEmbeddedDocumentExtractor ex = 
(ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext);
+        RUnpackExtractor ex = (RUnpackExtractor) f.newInstance(metadata, 
parseContext);
         EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector();
         assertFalse(selector.select(getMetadata("", "")));
         assertTrue(selector.select(getMetadata("application/pdf", "")));
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java 
b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 6f55e5d11..6794f1a8f 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.emitter.EmitKey;
 import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
@@ -117,9 +118,13 @@ public class PipesServerTest extends TikaTest {
         assertEquals(2, parseData.metadataList.size());
 
         byte[] bytes0 =
-                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+                IOUtils.toByteArray(
+                        
((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+                        .getDocument(0));
         byte[] bytes1 =
-                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+                IOUtils.toByteArray(
+                        
((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+                                .getDocument(1));
 
         assertContains("is to trigger mock on the embedded",
                 new String(bytes0, StandardCharsets.UTF_8));
@@ -170,9 +175,13 @@ public class PipesServerTest extends TikaTest {
         assertEquals(2, parseData.metadataList.size());
 
         byte[] bytes0 =
-                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+                IOUtils.toByteArray(
+                        
((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+                                .getDocument(0));
         byte[] bytes1 =
-                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+                IOUtils.toByteArray(
+                        
((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+                                .getDocument(1));
 
         assertContains("is to trigger mock on the embedded",
                 new String(bytes0, StandardCharsets.UTF_8));
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
index d60c6b1ca..5e1339a40 100644
--- 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
@@ -22,7 +22,7 @@
   <autoDetectParserConfig>
     <spoolToDisk>123450</spoolToDisk>
     <outputThreshold>678900</outputThreshold>
-    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.RUnpackExtractorFactory">
       <writeFileNameToContent>false</writeFileNameToContent>
       <embeddedBytesIncludeMimeTypes>
         <mime>application/pdf</mime>
diff --git 
a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml 
b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
index 610bad77b..5e46a09e9 100644
--- 
a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
+++ 
b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
@@ -20,7 +20,7 @@
     <digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory">
       <skipContainerDocument>false</skipContainerDocument>
     </digesterFactory>
-    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.RUnpackExtractorFactory">
       <writeFileNameToContent>false</writeFileNameToContent>
       <maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction>
     </embeddedDocumentExtractorFactory>
diff --git 
a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java 
b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
index 43c0b1d3a..091facc21 100644
--- 
a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
+++ 
b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
@@ -64,7 +64,7 @@ public class ExtractEmbeddedFiles {
         private int fileCount = 0;
 
         private MyEmbeddedDocumentExtractor(Path outputDir, ParseContext 
context) {
-            super(context, 1000000l);
+            super(context);
             this.outputDir = outputDir;
         }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index bcd45460c..c95547aee 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest {
         List<Metadata> trackingMetadata = new ArrayList<>();
 
         public EmbeddedTrackingExtrator(ParseContext context) {
-            super(context, 0);
+            super(context);
         }
 
         @Override
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 8503e8bd8..08d18b6c1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest {
         Map<Integer, byte[]> embedded = new HashMap<>();
 
         public RenderCaptureExtractor(ParseContext context) {
-            super(context, 0);
+            super(context);
         }
 
         public void parseEmbedded(InputStream stream, ContentHandler handler, 
Metadata metadata,
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
index 0e2f26bd2..9cedc9ed4 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
@@ -22,7 +22,7 @@
   <autoDetectParserConfig>
     <spoolToDisk>123450</spoolToDisk>
     <outputThreshold>678900</outputThreshold>
-    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.RUnpackExtractorFactory">
       <writeFileNameToContent>false</writeFileNameToContent>
     </embeddedDocumentExtractorFactory>
   </autoDetectParserConfig>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
index f54eb9a0a..369acafc9 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
@@ -22,7 +22,7 @@
   <autoDetectParserConfig>
     <spoolToDisk>123450</spoolToDisk>
     <outputThreshold>678900</outputThreshold>
-    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.RUnpackExtractorFactory">
       <writeFileNameToContent>true</writeFileNameToContent>
     </embeddedDocumentExtractorFactory>
   </autoDetectParserConfig>

Reply via email to