This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 905d62f1d0ec20452c6f9a8ff91d64a60b5b368a
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 31 16:16:31 2025 -0400

    TIKA-4533 - third time's the charm -- further refinement (#2382)
    
    (cherry picked from commit 701323a4866a9355eec4c9e3ee21192e2d9b4128)
    
    # Conflicts:
    #       tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
---
 .../apache/tika/extractor/RUnpackExtractor.java    | 41 +++++++++++++++++-----
 .../src/test/java/org/apache/tika/TikaTest.java    |  9 +++++
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 0e5928845..36d54183e 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.slf4j.Logger;
@@ -57,6 +58,7 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
 
     private EmbeddedBytesSelector embeddedBytesSelector = 
EmbeddedBytesSelector.ACCEPT_ALL;
 
+    private final EmbeddedStreamTranslator embeddedStreamTranslator = new 
DefaultEmbeddedStreamTranslator();
     private long bytesExtracted = 0;
     private final long maxEmbeddedBytesForExtraction;
 
@@ -115,17 +117,34 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
         }
     }
 
-    private void parseWithBytes(TikaInputStream stream, ContentHandler 
handler, Metadata metadata)
-            throws TikaException, IOException, SAXException {
-        //TODO -- improve the efficiency of this so that we're not
-        //literally writing out a file per request
-        Path p = stream.getPath();
+    private void parseWithBytes(TikaInputStream tis, ContentHandler handler, 
Metadata metadata) throws TikaException, IOException, SAXException {
+
+        //trigger spool to disk
+        Path rawBytes = tis.getPath();
+
+        //There may be a "translated" path for OLE2 etc
+        Path translated = null;
         try {
-            //warp in CloseShieldInputStream to ensure that a misbehaving 
parser isn't closing
-            //the stream and thereby deleting the temp file.
-            parse(CloseShieldInputStream.wrap(stream), handler, metadata);
+            //translate the stream or not
+            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+                translated = Files.createTempFile("tika-tmp-", ".bin");
+                try (InputStream is = embeddedStreamTranslator.translate(tis, 
metadata)) {
+                    Files.copy(is, translated, 
StandardCopyOption.REPLACE_EXISTING);
+                }
+            }
+            parse(tis, handler, metadata);
         } finally {
-            storeEmbeddedBytes(p, metadata);
+            try {
+                if (translated != null) {
+                    storeEmbeddedBytes(translated, metadata);
+                } else {
+                    storeEmbeddedBytes(rawBytes, metadata);
+                }
+            } finally {
+                if (translated != null) {
+                    Files.delete(translated);
+                }
+            }
         }
     }
 
@@ -137,6 +156,10 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
     }
 
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
+        if (p == null) {
+            return;
+        }
+
         if (! embeddedBytesSelector.select(metadata)) {
             if (LOGGER.isDebugEnabled()) {
                 LOGGER.debug("skipping embedded bytes {} <-> {}",
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a0a6377b8..4345c2a03 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -399,6 +399,15 @@ public abstract class TikaTest {
         }
     }
 
+    protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, 
ParseContext parseContext,
+                                                  boolean suppressException) 
throws Exception {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, metadata, parseContext,
+                    suppressException);
+        }
+    }
+
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
                                                   boolean suppressException) 
throws Exception {
         Metadata metadata = new Metadata();

Reply via email to