(tika) branch TIKA-4207 updated: TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted

tallison Thu, 21 Mar 2024 14:20:20 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/TIKA-4207 by this push:
     new 59608e69b TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted
59608e69b is described below

commit 59608e69bdaeb8a8151e1e9f27b1ef7c3030288b
Author: tallison <talli...@apache.org>
AuthorDate: Thu Mar 21 17:19:37 2024 -0400

    TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted
---
 .../AbstractEmbeddedDocumentByteStore.java         |  3 +-
 .../extractor/BasicEmbeddedDocumentByteStore.java  | 16 ++--
 .../tika/extractor/EmbeddedDocumentByteStore.java  |  5 +-
 .../tika/extractor/EmbeddedDocumentUtil.java       |  2 +-
 .../ParsingEmbeddedDocumentExtractor.java          | 40 +++++++--
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 22 ++++-
 .../org/apache/tika/io/BoundedInputStream.java     |  4 +
 .../java/org/apache/tika/pipes/PipesServer.java    |  5 +-
 .../extractor/EmbeddedDocumentBytesConfig.java     |  6 +-
 .../extractor/EmbeddedDocumentEmitterStore.java    |  9 +-
 .../org/apache/tika/pipes/PipesServerTest.java     | 58 ++++++++++++-
 .../apache/tika/pipes/TIKA-4207-limit-bytes.xml    | 34 ++++++++
 .../parser/microsoft/pst/OutlookPSTParserTest.java |  2 +-
 .../apache/tika/parser/pdf/PDFRenderingTest.java   |  2 +-
 .../apache/tika/server/standard/TikaPipesTest.java | 97 +++++++++++++++++++++-
 15 files changed, 270 insertions(+), 35 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
index 214c2ab4e..15b26451a 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
@@ -17,6 +17,7 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
@@ -57,7 +58,7 @@ public abstract class AbstractEmbeddedDocumentByteStore 
implements EmbeddedDocum
     }
 
     @Override
-    public void add(int id, Metadata metadata, byte[] bytes) throws 
IOException {
+    public void add(int id, Metadata metadata, InputStream bytes) throws 
IOException {
         ids.add(id);
     }
 
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
index b41285eb0..d3aeb4507 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentByteStore.java
@@ -17,9 +17,13 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedBufferedInputStream;
+
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
 
@@ -30,13 +34,15 @@ public class BasicEmbeddedDocumentByteStore extends 
AbstractEmbeddedDocumentByte
     }
     //this won't scale, but let's start fully in memory for now;
     Map<Integer, byte[]> docBytes = new HashMap<>();
-    public void add(int id, Metadata metadata, byte[] bytes) throws 
IOException {
-        super.add(id, metadata, bytes);
-        docBytes.put(id, bytes);
+    @Override
+    public void add(int id, Metadata metadata, InputStream is) throws 
IOException {
+        super.add(id, metadata, is);
+        docBytes.put(id, IOUtils.toByteArray(is));
     }
 
-    public byte[] getDocument(int id) {
-        return docBytes.get(id);
+    @Override
+    public InputStream getDocument(int id) throws IOException {
+        return new 
UnsynchronizedBufferedInputStream.Builder().setByteArray(docBytes.get(id)).get();
     }
 
     @Override
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
index ad1bb81f3..8e1e8e325 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStore.java
@@ -18,15 +18,16 @@ package org.apache.tika.extractor;
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
 
 public interface EmbeddedDocumentByteStore extends Closeable {
     //we need metadata for the emitter store...can we get away without it?
-    void add(int id, Metadata metadata, byte[] bytes) throws IOException;
+    void add(int id, Metadata metadata, InputStream inputStream) throws 
IOException;
 
-    byte[] getDocument(int id);
+    InputStream getDocument(int id) throws IOException;
 
     List<Integer> getIds();
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index d6e2c28a8..99a3f3921 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,7 @@ public class EmbeddedDocumentUtil implements Serializable {
                 context.set(Parser.class, new AutoDetectParser(tikaConfig));
             }
         }
-        EmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(context);
+        EmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(context, 0);
         context.set(EmbeddedDocumentExtractor.class, ex);
         return ex;
     }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index ee15c1e22..97cf5b57f 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -35,6 +35,7 @@ import org.xml.sax.helpers.AttributesImpl;
 import org.apache.tika.exception.CorruptedFileException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -45,7 +46,6 @@ import org.apache.tika.parser.ParseRecord;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.utils.ExceptionUtils;
 
 /**
  * Helper class for parsers of package archives or other compound document
@@ -68,8 +68,12 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
 
     private EmbeddedBytesSelector embeddedBytesSelector = 
EmbeddedBytesSelector.ACCEPT_ALL;
 
-    public ParsingEmbeddedDocumentExtractor(ParseContext context) {
+    private long bytesExtracted = 0;
+    private final long maxEmbeddedBytesForExtraction;
+
+    public ParsingEmbeddedDocumentExtractor(ParseContext context, long 
maxEmbeddedBytesForExtraction) {
         this.context = context;
+        this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
     }
 
     public boolean shouldParseEmbedded(Metadata metadata) {
@@ -139,6 +143,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
 
     private void parseWithBytes(TikaInputStream stream, ContentHandler 
handler, Metadata metadata)
             throws TikaException, IOException, SAXException {
+        //TODO -- improve the efficiency of this so that we're not
+        //literally writing out a file per request
         Path p = stream.getPath();
         try {
             parse(stream, handler, metadata);
@@ -157,7 +163,7 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
         if (! embeddedBytesSelector.select(metadata)) {
             if (LOGGER.isDebugEnabled()) {
-                LOGGER.debug("skipping embedded bytes {} {}",
+                LOGGER.debug("skipping embedded bytes {} <-> {}",
                         metadata.get(Metadata.CONTENT_TYPE),
                         
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
             }
@@ -166,12 +172,30 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
         EmbeddedDocumentByteStore embeddedDocumentByteStore =
                 context.get(EmbeddedDocumentByteStore.class);
         int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
-
-        try {
-            embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
+        try (InputStream is = Files.newInputStream(p)) {
+            if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
+                throw new IOException("Bytes extracted (" + bytesExtracted +
+                        ") >= max allowed (" + maxEmbeddedBytesForExtraction + 
")");
+            }
+            long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
+
+            try (BoundedInputStream boundedIs = new 
BoundedInputStream(maxToRead, is)) {
+                embeddedDocumentByteStore.add(id, metadata, boundedIs);
+                bytesExtracted += boundedIs.getPos();
+                if (boundedIs.hasHitBound()) {
+                    throw new IOException("Bytes extracted (" + bytesExtracted 
+
+                            ") >= max allowed (" + 
maxEmbeddedBytesForExtraction + "). Truncated " +
+                            "bytes");
+                }
+            }
         } catch (IOException e) {
-            metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
-                    ExceptionUtils.getStackTrace(e));
+            LOGGER.warn("problem writing out embedded bytes", e);
+            //info in metadata doesn't actually make it back to the metadata list
+            //because we're filtering and cloning the metadata at the end of the parse
+            //which happens before we try to copy out the files.
+            //TODO fix this
+            //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+              //      ExceptionUtils.getStackTrace(e));
         }
     }
 
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 7632ed49c..fd8cf54b1 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -22,6 +22,7 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 
@@ -33,6 +34,7 @@ public class ParsingEmbeddedDocumentExtractorFactory 
implements EmbeddedDocument
     private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
     private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = 
Collections.EMPTY_SET;
 
+    private long maxEmbeddedBytesForExtraction = 10l * 1024l * 1024l * 
1024l;//10GB
     @Field
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
@@ -65,15 +67,33 @@ public class ParsingEmbeddedDocumentExtractorFactory 
implements EmbeddedDocument
 
     }
 
+    /**
+     * Total number of bytes to write out. A good zip bomb may contain petabytes
+     * compressed into a few kb. Make sure that you can't fill up a disk!
+     *
+     * This does not include the container file in the count of bytes written out.
+     * This only counts the lengths of the embedded files.
+     *
+     * @param maxEmbeddedBytesForExtraction
+     */
+    @Field
+    public void setMaxEmbeddedBytesForExtraction(long 
maxEmbeddedBytesForExtraction) throws TikaConfigException {
+        if (maxEmbeddedBytesForExtraction < 0) {
+            throw new TikaConfigException("maxEmbeddedBytesForExtraction must 
be >= 0");
+        }
+        this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
+    }
 
     @Override
     public EmbeddedDocumentExtractor newInstance(Metadata metadata, 
ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(parseContext);
+        ParsingEmbeddedDocumentExtractor ex =
+                new ParsingEmbeddedDocumentExtractor(parseContext, 
maxEmbeddedBytesForExtraction);
         ex.setWriteFileNameToContent(writeFileNameToContent);
         ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
         return ex;
     }
 
+
     private EmbeddedBytesSelector createEmbeddedBytesSelector() {
         if (embeddedBytesIncludeMimeTypes.size() == 0 &&
                 embeddedBytesExcludeMimeTypes.size() == 0 &&
diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java 
b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
index a80009cd2..31290cc1a 100644
--- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
@@ -147,4 +147,8 @@ public class BoundedInputStream extends InputStream {
     public long transferTo(OutputStream out) throws IOException {
         return in.transferTo(out);
     }
+
+    public long getPos() {
+        return pos;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java 
b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 851805d06..5cc22d378 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -546,6 +546,7 @@ public class PipesServer implements Runnable {
             return parseContext;
         }
 
+        //TODO: clean this up.
         if 
(!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter()))
 {
             parseContext.set(EmbeddedDocumentByteStore.class,
                     new 
EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(),
@@ -678,8 +679,8 @@ public class PipesServer implements Runnable {
                 t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
             EmbeddedDocumentByteStore embeddedDocumentByteStore =
                     parseContext.get(EmbeddedDocumentByteStore.class);
-            try {
-                embeddedDocumentByteStore.add(0, metadata, 
Files.readAllBytes(tis.getPath()));
+            try (InputStream is = Files.newInputStream(tis.getPath())) {
+                embeddedDocumentByteStore.add(0, metadata, is);
             } catch (IOException e) {
                 LOG.warn("problem reading source file into embedded document 
byte store", e);
             }
diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index 42538ff80..66b7321ac 100644
--- 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -44,11 +44,7 @@ public class EmbeddedDocumentBytesConfig implements 
Serializable {
         }
     }
     private final boolean extractEmbeddedDocumentBytes;
-    //TODO -- add these at some point
-    /*
-        private Set<String> includeMimeTypes = new HashSet<>();
-        private Set<String> excludeMimeTypes = new HashSet<>();
-    */
+
     private int zeroPadName = 0;
 
     private SUFFIX_STRATEGY suffixStrategy = SUFFIX_STRATEGY.NONE;
diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
index 915b44d44..5d09cfe18 100644
--- 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
+++ 
b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
@@ -18,9 +18,9 @@ package org.apache.tika.pipes.extractor;
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.io.InputStream;
 
 import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.extractor.AbstractEmbeddedDocumentByteStore;
@@ -53,20 +53,19 @@ public class EmbeddedDocumentEmitterStore extends 
AbstractEmbeddedDocumentByteSt
     }
 
     @Override
-    public void add(int id, Metadata metadata, byte[] bytes) throws 
IOException {
+    public void add(int id, Metadata metadata, InputStream inputStream) throws 
IOException {
         //intentionally do not call super.add, because we want the ids list to 
be empty
         String emitKey = getEmitKey(containerEmitKey.getEmitKey(),
                 id, embeddedDocumentBytesConfig, metadata);
-
         try {
-            emitter.emit(emitKey, new 
UnsynchronizedByteArrayInputStream(bytes), METADATA);
+            emitter.emit(emitKey, inputStream, METADATA);
         } catch (TikaEmitterException e) {
             throw new IOExceptionWithCause(e);
         }
     }
 
     @Override
-    public byte[] getDocument(int id) {
+    public InputStream getDocument(int id) {
         throw new UnsupportedOperationException("this is emit only.");
     }
 
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java 
b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 857bf485f..6f55e5d11 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -116,8 +116,10 @@ public class PipesServerTest extends TikaTest {
                 parseData = pipesServer.parseFromTuple(fetchEmitTuple, 
fetcher);
         assertEquals(2, parseData.metadataList.size());
 
-        byte[] bytes0 = 
parseData.getEmbeddedDocumentByteStore().getDocument(0);
-        byte[] bytes1 = 
parseData.getEmbeddedDocumentByteStore().getDocument(1);
+        byte[] bytes0 =
+                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+        byte[] bytes1 =
+                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
 
         assertContains("is to trigger mock on the embedded",
                 new String(bytes0, StandardCharsets.UTF_8));
@@ -127,4 +129,56 @@ public class PipesServerTest extends TikaTest {
         
assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
                 parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
     }
+
+    @Test
+    public void testEmbeddedStreamEmitterLimitBytes() throws Exception {
+        Path tmp = Paths.get("/home/tallison/Desktop/tmp");
+        if (Files.isDirectory(tmp)) {
+            FileUtils.deleteDirectory(tmp.toFile());
+        }
+        Files.createDirectories(tmp);
+        Path tikaConfig = tmp.resolve("tika-config.xml");
+
+        String xml = IOUtils.toString(
+                
PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"),
+                StandardCharsets.UTF_8);
+        xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
+        Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
+
+        
Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
+                tmp.resolve("mock.xml"));
+
+        PipesServer pipesServer = new PipesServer(tikaConfig,
+                new UnsynchronizedByteArrayInputStream(new byte[0]),
+                new PrintStream(new UnsynchronizedByteArrayOutputStream(), 
true,
+                        StandardCharsets.UTF_8.name()),
+                -1, 30000, 30000);
+
+        pipesServer.initializeResources();
+        EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
+                new EmbeddedDocumentBytesConfig(true);
+        embeddedDocumentBytesConfig.setIncludeOriginal(true);
+
+        FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
+                new FetchKey("fs", "mock.xml"),
+                new EmitKey("", ""), new Metadata(),
+                HandlerConfig.DEFAULT_HANDLER_CONFIG, 
FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+                embeddedDocumentBytesConfig);
+        Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
+        PipesServer.MetadataListAndEmbeddedBytes
+                parseData = pipesServer.parseFromTuple(fetchEmitTuple, 
fetcher);
+        assertEquals(2, parseData.metadataList.size());
+
+        byte[] bytes0 =
+                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(0));
+        byte[] bytes1 =
+                
IOUtils.toByteArray(parseData.getEmbeddedDocumentByteStore().getDocument(1));
+
+        assertContains("is to trigger mock on the embedded",
+                new String(bytes0, StandardCharsets.UTF_8));
+
+        assertEquals(10, bytes1.length);
+        
assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
+                parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
+    }
 }
diff --git 
a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml 
b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
new file mode 100644
index 000000000..610bad77b
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <autoDetectParserConfig>
+    <digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory">
+      <skipContainerDocument>false</skipContainerDocument>
+    </digesterFactory>
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+      <writeFileNameToContent>false</writeFileNameToContent>
+      <maxEmbeddedBytesForExtraction>10</maxEmbeddedBytesForExtraction>
+    </embeddedDocumentExtractorFactory>
+  </autoDetectParserConfig>
+  <fetchers>
+    <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+      <name>fs</name>
+      <basePath>BASE_PATH</basePath>
+    </fetcher>
+  </fetchers>
+</properties>
\ No newline at end of file
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index c95547aee..bcd45460c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -150,7 +150,7 @@ public class OutlookPSTParserTest extends TikaTest {
         List<Metadata> trackingMetadata = new ArrayList<>();
 
         public EmbeddedTrackingExtrator(ParseContext context) {
-            super(context);
+            super(context, 0);
         }
 
         @Override
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 08d18b6c1..8503e8bd8 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -112,7 +112,7 @@ public class PDFRenderingTest extends TikaTest {
         Map<Integer, byte[]> embedded = new HashMap<>();
 
         public RenderCaptureExtractor(ParseContext context) {
-            super(context);
+            super(context, 0);
         }
 
         public void parseEmbedded(InputStream stream, ContentHandler handler, 
Metadata metadata,
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
index 7f41e065c..110c3f7e8 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
@@ -16,20 +16,27 @@
  */
 package org.apache.tika.server.standard;
 
+import static org.apache.tika.TikaTest.debug;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 
 import java.io.ByteArrayInputStream;
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
+import java.nio.file.attribute.BasicFileAttributes;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import jakarta.ws.rs.core.Response;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
@@ -49,6 +56,7 @@ import 
org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.pipes.FetchEmitTuple;
 import org.apache.tika.pipes.HandlerConfig;
 import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
 import org.apache.tika.pipes.fetcher.FetchKey;
 import org.apache.tika.pipes.fetcher.FetcherManager;
 import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -72,6 +80,7 @@ public class TikaPipesTest extends CXFTestBase {
     private static Path TMP_WORKING_DIR;
     private static Path TMP_OUTPUT_DIR;
     private static Path TMP_OUTPUT_FILE;
+    private static Path TMP_BYTES_DIR;
     private static Path TIKA_PIPES_LOG4j2_PATH;
     private static Path TIKA_CONFIG_PATH;
     private static String TIKA_CONFIG_XML;
@@ -81,6 +90,7 @@ public class TikaPipesTest extends CXFTestBase {
     public static void setUpBeforeClass() throws Exception {
         Path inputDir = TMP_WORKING_DIR.resolve("input");
         TMP_OUTPUT_DIR = TMP_WORKING_DIR.resolve("output");
+        TMP_BYTES_DIR = TMP_WORKING_DIR.resolve("bytes");
         TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(TEST_RECURSIVE_DOC + ".json");
 
         Files.createDirectories(inputDir);
@@ -103,7 +113,10 @@ public class TikaPipesTest extends CXFTestBase {
                         "<emitter 
class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
                         "<params>" + "<name>fse</name>" + "<basePath>" +
                         TMP_OUTPUT_DIR.toAbsolutePath() + "</basePath>" + 
"</params>" +
-                        "</emitter>" + "</emitters>" + 
"<pipes><params><tikaConfig>" +
+                        "</emitter>" + "<emitter 
class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
+                        "<params>" + "<name>bytes</name>" + "<basePath>" +
+                        TMP_BYTES_DIR.toAbsolutePath() + "</basePath>" + 
"</params>" +
+                        "</emitter>" +"</emitters>" + 
"<pipes><params><tikaConfig>" +
                         ProcessUtils.escapeCommandLine(
                                 TIKA_CONFIG_PATH.toAbsolutePath().toString()) +
                         "</tikaConfig><numClients>10</numClients>" + 
"<forkedJvmArgs>" +
@@ -203,4 +216,86 @@ public class TikaPipesTest extends CXFTestBase {
         assertContains("When in the Course",
                 metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
     }
+
+    @Test
+    public void testBytes() throws Exception {
+        EmbeddedDocumentBytesConfig config = new 
EmbeddedDocumentBytesConfig(true);
+        config.setEmitter("bytes");
+        config.setIncludeOriginal(true);
+        config.setEmbeddedIdPrefix("-");
+        config.setZeroPadNameLength(10);
+        
config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING);
+
+        FetchEmitTuple t =
+                new FetchEmitTuple("myId", new FetchKey("fsf", 
"test_recursive_embedded.docx"),
+                        new EmitKey("fse", "test_recursive_embedded.docx"), 
new Metadata(),
+                        new 
HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                                HandlerConfig.PARSE_MODE.RMETA, -1, -1, false),
+                        FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, config);
+        StringWriter writer = new StringWriter();
+        JsonFetchEmitTuple.toJson(t, writer);
+
+        String getUrl = endPoint + PIPES_PATH;
+        Response response =
+                
WebClient.create(getUrl).accept("application/json").post(writer.toString());
+        assertEquals(200, response.getStatus());
+
+        List<Metadata> metadataList = null;
+        try (Reader reader = Files.newBufferedReader(TMP_OUTPUT_FILE)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertEquals(12, metadataList.size());
+        assertContains("When in the Course",
+                metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+        Map<String, Long> expected = loadExpected();
+        Map<String, Long> byteFileNames = getFileNames(TMP_BYTES_DIR);
+        assertEquals(expected, byteFileNames);
+    }
+
+    private Map<String, Long> loadExpected() {
+        Map<String, Long> m = new HashMap<>();
+        m.put("test_recursive_embedded.docx-0000000009.txt", 8151l);
+        m.put("test_recursive_embedded.docx-0000000007.txt", 8l);
+        m.put("test_recursive_embedded.docx-0000000006.txt", 8l);
+        m.put("test_recursive_embedded.docx-0000000002.zip", 4827l);
+        m.put("test_recursive_embedded.docx-0000000001.emf", 4992l);
+        m.put("test_recursive_embedded.docx-0000000008.zip", 4048l);
+        m.put("test_recursive_embedded.docx-0000000004.txt", 8l);
+        m.put("test_recursive_embedded.docx-0000000000.docx", 27082l);
+        m.put("test_recursive_embedded.docx-0000000003.txt", 8l);
+        m.put("test_recursive_embedded.docx-0000000011.txt", 7l);
+        m.put("test_recursive_embedded.docx-0000000005.zip", 4492l);
+        m.put("test_recursive_embedded.docx-0000000010.zip", 163l);
+        return m;
+    }
+
+    private Map<String, Long> getFileNames(Path p) throws Exception {
+        final Map<String, Long> ret = new HashMap<>();
+        Files.walkFileTree(TMP_BYTES_DIR, new FileVisitor<Path>() {
+            @Override
+            public FileVisitResult preVisitDirectory(Path dir, 
BasicFileAttributes attrs)
+                    throws IOException {
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult visitFile(Path file, BasicFileAttributes 
attrs)
+                    throws IOException {
+                ret.put(file.getFileName().toString(), Files.size(file));
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult visitFileFailed(Path file, IOException exc) 
throws IOException {
+                return FileVisitResult.CONTINUE;
+            }
+
+            @Override
+            public FileVisitResult postVisitDirectory(Path dir, IOException 
exc)
+                    throws IOException {
+                return FileVisitResult.CONTINUE;
+            }
+        });
+        return ret;
+    }
 }

(tika) branch TIKA-4207 updated: TIKA-4207 -- refactor to use inputstreams instead of byte arrays. add max bytes extracted

Reply via email to