This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 905d62f1d0ec20452c6f9a8ff91d64a60b5b368a Author: Tim Allison <[email protected]> AuthorDate: Fri Oct 31 16:16:31 2025 -0400 TIKA-4533 - third time's the charm -- further refinement (#2382) (cherry picked from commit 701323a4866a9355eec4c9e3ee21192e2d9b4128) # Conflicts: # tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java --- .../apache/tika/extractor/RUnpackExtractor.java | 41 +++++++++++++++++----- .../src/test/java/org/apache/tika/TikaTest.java | 9 +++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 0e5928845..36d54183e 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import org.apache.commons.io.input.CloseShieldInputStream; import org.slf4j.Logger; @@ -57,6 +58,7 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; + private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); private long bytesExtracted = 0; private final long maxEmbeddedBytesForExtraction; @@ -115,17 +117,34 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { } } - private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata) - throws TikaException, IOException, SAXException { - //TODO -- improve the efficiency of this so that we're not - //literally writing out a file per request - Path p = stream.getPath(); + private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { + + //trigger spool to disk + Path rawBytes = tis.getPath(); + + //There may be a "translated" path for OLE2 etc + Path translated = null; try { - //warp in CloseShieldInputStream to ensure that a misbehaving parser isn't closing - //the stream and thereby deleting the temp file. - parse(CloseShieldInputStream.wrap(stream), handler, metadata); + //translate the stream or not + if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { + translated = Files.createTempFile("tika-tmp-", ".bin"); + try (InputStream is = embeddedStreamTranslator.translate(tis, metadata)) { + Files.copy(is, translated, StandardCopyOption.REPLACE_EXISTING); + } + } + parse(tis, handler, metadata); } finally { - storeEmbeddedBytes(p, metadata); + try { + if (translated != null) { + storeEmbeddedBytes(translated, metadata); + } else { + storeEmbeddedBytes(rawBytes, metadata); + } + } finally { + if (translated != null) { + Files.delete(translated); + } + } } } @@ -137,6 +156,10 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { } private void storeEmbeddedBytes(Path p, Metadata metadata) { + if (p == null) { + return; + } + if (! embeddedBytesSelector.select(metadata)) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("skipping embedded bytes {} <-> {}", diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a0a6377b8..4345c2a03 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -399,6 +399,15 @@ public abstract class TikaTest { } } + protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, ParseContext parseContext, + boolean suppressException) throws Exception { + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, parser, metadata, parseContext, + suppressException); + } + } + protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception { Metadata metadata = new Metadata();
