This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4518
in repository https://gitbox.apache.org/repos/asf/tika.git

commit abbf3fd0a69626b85c5d431d345352c094cf3691
Author: tallison <[email protected]>
AuthorDate: Wed Oct 15 08:35:59 2025 -0400

    TIKA-4518 -- improve pst handling with -Z option, WIP
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  4 +--
 .../extractor/DefaultEmbeddedStreamTranslator.java | 14 ++++----
 .../tika/extractor/EmbeddedStreamTranslator.java   |  7 ++--
 .../apache/tika/extractor/RUnpackExtractor.java    | 13 +++++---
 .../microsoft/MSEmbeddedStreamTranslator.java      | 39 ++++++++++------------
 .../server/core/resource/UnpackerResource.java     | 26 ++++++---------
 6 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 28a9b29c7..a1db5f8bf 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1112,9 +1112,7 @@ public class TikaCLI {
 
             try (OutputStream os = Files.newOutputStream(outputFile)) {
                 if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
-                    try (InputStream translated = embeddedStreamTranslator.translate(tis, metadata)) {
-                        IOUtils.copy(translated, os);
-                    }
+                    embeddedStreamTranslator.translate(tis, metadata, os);
                 } else {
                     IOUtils.copy(tis, os);
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
index 537c5ffa1..bf2321481 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -17,10 +17,11 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
-import java.io.InputStream;
+import java.io.OutputStream;
 import java.util.List;
 
 import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.utils.ServiceLoaderUtils;
 
@@ -58,7 +59,7 @@ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator
      * @throws IOException
      */
     @Override
-    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException {
+    public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException {
         for (EmbeddedStreamTranslator translator : translators) {
             if (translator.shouldTranslate(inputStream, metadata)) {
                 return true;
@@ -75,13 +76,12 @@ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator
      * @throws IOException
      */
     @Override
-    public InputStream translate(InputStream inputStream, Metadata metadata) throws IOException {
+    public void translate(TikaInputStream inputStream, Metadata metadata, OutputStream outputStream) throws IOException {
         for (EmbeddedStreamTranslator translator : translators) {
-            InputStream translated = translator.translate(inputStream, metadata);
-            if (translated != null) {
-                return translated;
+            if (translator.shouldTranslate(inputStream, metadata)) {
+                translator.translate(inputStream, metadata, outputStream);
+                return;
             }
         }
-        return inputStream;
     }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
index b2ce05db4..2391f0be5 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -18,7 +18,9 @@ package org.apache.tika.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 
 /**
@@ -30,9 +32,8 @@ import org.apache.tika.metadata.Metadata;
  */
 public interface EmbeddedStreamTranslator {
 
-    boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException;
+    boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException;
 
-    InputStream translate(InputStream inputStream,
-                          Metadata metadata) throws IOException;
+    void translate(TikaInputStream inputStream, Metadata metadata, OutputStream os) throws IOException;
 
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java 
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 42544dc80..c5d8185b2 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -21,6 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
@@ -110,18 +111,20 @@ public class RUnpackExtractor extends 
ParsingEmbeddedDocumentExtractor {
         }
     }
 
-    private void parseWithBytes(TikaInputStream stream, ContentHandler 
handler, Metadata metadata)
+    private void parseWithBytes(TikaInputStream tis, ContentHandler handler, 
Metadata metadata)
             throws TikaException, IOException, SAXException {
         //TODO -- improve the efficiency of this so that we're not
         //literally writing out a file per request
         Path tmp = Files.createTempFile("tika-tmp-", ".bin");
-        if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) {
-            Files.copy(embeddedStreamTranslator.translate(stream, metadata), 
tmp, StandardCopyOption.REPLACE_EXISTING);
+        if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+            try (OutputStream os = Files.newOutputStream(tmp)) {
+                embeddedStreamTranslator.translate(tis, metadata, os);
+            }
         } else {
-            Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING);
+            Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
         }
         try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) {
-            parse(tmpTis, handler, metadata);
+            parse(tis, handler, metadata);
         } finally {
             try {
                 storeEmbeddedBytes(tmp, metadata);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 24f7ec2d3..3833b91da 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -18,9 +18,12 @@ package org.apache.tika.extractor.microsoft;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -43,26 +46,22 @@ public class MSEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
     private static final Logger LOG = 
LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class);
 
     @Override
-    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public boolean shouldTranslate(TikaInputStream tis, Metadata metadata) 
throws IOException {
         String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
         if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
             return true;
-        } else if (inputStream instanceof TikaInputStream) {
-            TikaInputStream tin = (TikaInputStream) inputStream;
-            if (tin.getOpenContainer() != null &&
-                    tin.getOpenContainer() instanceof DirectoryEntry) {
-                return true;
-            }
+        } else {
+            return tis.getOpenContainer() != null &&
+                    tis.getOpenContainer() instanceof DirectoryEntry;
         }
-        return false;
     }
 
     @Override
-    public InputStream translate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public void translate(TikaInputStream tis, Metadata metadata, OutputStream 
os) throws IOException {
         String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
         if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
             UnsynchronizedByteArrayOutputStream bos = 
UnsynchronizedByteArrayOutputStream.builder().get();
-            IOUtils.copy(inputStream, bos);
+            IOUtils.copy(tis, bos);
             POIFSFileSystem poifs = new POIFSFileSystem(bos.toInputStream());
             OfficeParser.POIFSDocumentType type = 
OfficeParser.POIFSDocumentType.detectType(poifs);
             String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
@@ -82,21 +81,17 @@ public class MSEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
                 name += '.' + type.getExtension();
             }
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
-            return 
UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get();
-        } else if (inputStream instanceof TikaInputStream) {
-            TikaInputStream tin = (TikaInputStream) inputStream;
-
-            if (tin.getOpenContainer() != null &&
-                    tin.getOpenContainer() instanceof DirectoryEntry) {
-                POIFSFileSystem fs = new POIFSFileSystem();
-                copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
-                try (UnsynchronizedByteArrayOutputStream bos2 = 
UnsynchronizedByteArrayOutputStream.builder().get()) {
-                    fs.writeFilesystem(bos2);
-                    return bos2.toInputStream();
+            os.write(data);
+            os.flush();
+        } else {
+            if (tis.getOpenContainer() != null &&
+                    tis.getOpenContainer() instanceof DirectoryEntry) {
+                try (POIFSFileSystem fs = new POIFSFileSystem()) {
+                    copy((DirectoryEntry) tis.getOpenContainer(), 
fs.getRoot());
+                    fs.writeFilesystem(CloseShieldOutputStream.wrap(os));
                 }
             }
         }
-        return inputStream;
     }
 
     protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
throws IOException {
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index a2e3064d6..0d75c9bec 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -193,12 +193,16 @@ public class UnpackerResource {
                     .builder()
                     .get();
 
-            BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, 
tis);
-            IOUtils.copy(bis, bos);
-            if (bis.hasHitBound()) {
-                throw new IOException(new TikaMemoryLimitException(
-                        "An attachment is longer than " + "'unpackMaxBytes' 
(default=100MB, actual=" + unpackMaxBytes + "). " + "If you need to increase 
this " +
-                                "limit, add a header to your request, such as: 
unpackMaxBytes: " + "1073741824.  There is a hard limit of 2GB."));
+            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+                embeddedStreamTranslator.translate(tis, metadata, bos);
+            } else {
+                BoundedInputStream bis = new 
BoundedInputStream(unpackMaxBytes, tis);
+                IOUtils.copy(bis, bos);
+                if (bis.hasHitBound()) {
+                    throw new IOException(new TikaMemoryLimitException(
+                            "An attachment is longer than " + 
"'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " + "If you 
need to increase this " +
+                                    "limit, add a header to your request, such 
as: unpackMaxBytes: " + "1073741824.  There is a hard limit of 2GB."));
+                }
             }
             byte[] data = bos.toByteArray();
 
@@ -224,16 +228,6 @@ public class UnpackerResource {
                     LOG.warn("Unexpected MimeTypeException", e);
                 }
             }
-            try (TikaInputStream is = TikaInputStream.get(data)) {
-                if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
-                    InputStream translated = 
embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
 metadata);
-                    UnsynchronizedByteArrayOutputStream bos2 = 
UnsynchronizedByteArrayOutputStream
-                            .builder()
-                            .get();
-                    IOUtils.copy(translated, bos2);
-                    data = bos2.toByteArray();
-                }
-            }
 
             final String finalName = getFinalName(name, zout);
 

Reply via email to