This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4514 in repository https://gitbox.apache.org/repos/asf/tika.git
commit f57b43d3e86649d741493378fa98510de2bd7d7c Author: tallison <[email protected]> AuthorDate: Fri Oct 10 17:38:24 2025 -0400 TIKA-4514 --- .../apache/tika/extractor/RUnpackExtractor.java | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index cbd560c50..7886f3572 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import org.apache.commons.io.input.CloseShieldInputStream; import org.slf4j.Logger; @@ -31,6 +32,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -56,6 +58,7 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; + private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); private long bytesExtracted = 0; private final long maxEmbeddedBytesForExtraction; @@ -113,13 +116,20 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { throws TikaException, IOException, SAXException { //TODO -- improve the efficiency of this so that we're not //literally writing out a file per request - Path p = stream.getPath(); - try { - //warp in CloseShieldInputStream to ensure that a misbehaving parser isn't closing - //the stream and thereby deleting the temp file. - parse(CloseShieldInputStream.wrap(stream), handler, metadata); + Path tmp = Files.createTempFile("tika-tmp-", ".bin"); + if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) { + Files.copy(embeddedStreamTranslator.translate(stream, metadata), tmp, StandardCopyOption.REPLACE_EXISTING); + } else { + Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING); + } + try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) { + parse(tmpTis, handler, metadata); } finally { - storeEmbeddedBytes(p, metadata); + try { + storeEmbeddedBytes(tmp, metadata); + } finally { + Files.delete(tmp); + } } }
