This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4514
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f57b43d3e86649d741493378fa98510de2bd7d7c
Author: tallison <[email protected]>
AuthorDate: Fri Oct 10 17:38:24 2025 -0400

    TIKA-4514
---
 .../apache/tika/extractor/RUnpackExtractor.java    | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index cbd560c50..7886f3572 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.slf4j.Logger;
@@ -31,6 +32,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.CorruptedFileException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
@@ -56,6 +58,7 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
 
     private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL;
 
+    private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator();
     private long bytesExtracted = 0;
     private final long maxEmbeddedBytesForExtraction;
 
@@ -113,13 +116,20 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
             throws TikaException, IOException, SAXException {
         //TODO -- improve the efficiency of this so that we're not
         //literally writing out a file per request
-        Path p = stream.getPath();
-        try {
-            //warp in CloseShieldInputStream to ensure that a misbehaving parser isn't closing
-            //the stream and thereby deleting the temp file.
-            parse(CloseShieldInputStream.wrap(stream), handler, metadata);
+        Path tmp = Files.createTempFile("tika-tmp-", ".bin");
+        if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) {
+            Files.copy(embeddedStreamTranslator.translate(stream, metadata), tmp, StandardCopyOption.REPLACE_EXISTING);
+        } else {
+            Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING);
+        }
+        try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) {
+            parse(tmpTis, handler, metadata);
         } finally {
-            storeEmbeddedBytes(p, metadata);
+            try {
+                storeEmbeddedBytes(tmp, metadata);
+            } finally {
+                Files.delete(tmp);
+            }
         }
     }
 

Reply via email to