This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4533c in repository https://gitbox.apache.org/repos/asf/tika.git
commit dd3e30dc56418a7ab68afe50a9c6af29f8322e79 Author: tallison <[email protected]> AuthorDate: Thu Oct 30 17:13:53 2025 -0400 TIKA-4533 - third time's the charm -- further refinement --- .../apache/tika/extractor/RUnpackExtractor.java | 32 +++++++++++----------- .../src/test/java/org/apache/tika/TikaTest.java | 9 ++++++ 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 70c21ffb4..234c3155f 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -24,7 +24,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.StandardCopyOption; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -113,30 +112,31 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { - Path tmp = Files.createTempFile("tika-tmp-", ".bin"); + //trigger spool to disk + Path rawBytes = tis.getPath(); + + //There may be a "translated" path for OLE2 etc + Path translated = null; try { //translate the stream or not if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { - try (OutputStream os = Files.newOutputStream(tmp)) { + translated = Files.createTempFile("tika-tmp-", ".bin"); + try (OutputStream os = Files.newOutputStream(translated)) { embeddedStreamTranslator.translate(tis, metadata, os); } - } else { - Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING); - } - - //now do the parse - if (tis.getOpenContainer() != null) { - parse(tis, handler, metadata); - } else { - try (TikaInputStream tisTmp = TikaInputStream.get(tmp)) { - parse(tisTmp, handler, metadata); - } } + parse(tis, handler, metadata); } finally { try { - storeEmbeddedBytes(tmp, metadata); + if (translated != null) { + storeEmbeddedBytes(translated, metadata); + } else { + storeEmbeddedBytes(rawBytes, metadata); + } } finally { - Files.delete(tmp); + if (translated != null) { + Files.delete(translated); + } } } } diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a0a6377b8..4345c2a03 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -399,6 +399,15 @@ public abstract class TikaTest { } } + protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, ParseContext parseContext, + boolean suppressException) throws Exception { + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, parser, metadata, parseContext, + suppressException); + } + } + protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception { Metadata metadata = new Metadata();
