This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 701323a48 TIKA-4533 - third time's the charm -- further refinement (#2382)
701323a48 is described below
commit 701323a4866a9355eec4c9e3ee21192e2d9b4128
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 31 16:16:31 2025 -0400
TIKA-4533 - third time's the charm -- further refinement (#2382)
---
.../apache/tika/extractor/RUnpackExtractor.java | 32 +++++++++++-----------
.../src/test/java/org/apache/tika/TikaTest.java | 9 ++++++
2 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 70c21ffb4..234c3155f 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -24,7 +24,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -113,30 +112,31 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor {
private void parseWithBytes(TikaInputStream tis, ContentHandler handler,
Metadata metadata) throws TikaException, IOException, SAXException {
- Path tmp = Files.createTempFile("tika-tmp-", ".bin");
+ //trigger spool to disk
+ Path rawBytes = tis.getPath();
+
+ //There may be a "translated" path for OLE2 etc
+ Path translated = null;
try {
//translate the stream or not
if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
- try (OutputStream os = Files.newOutputStream(tmp)) {
+ translated = Files.createTempFile("tika-tmp-", ".bin");
+ try (OutputStream os = Files.newOutputStream(translated)) {
embeddedStreamTranslator.translate(tis, metadata, os);
}
- } else {
- Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
- }
-
- //now do the parse
- if (tis.getOpenContainer() != null) {
- parse(tis, handler, metadata);
- } else {
- try (TikaInputStream tisTmp = TikaInputStream.get(tmp)) {
- parse(tisTmp, handler, metadata);
- }
}
+ parse(tis, handler, metadata);
} finally {
try {
- storeEmbeddedBytes(tmp, metadata);
+ if (translated != null) {
+ storeEmbeddedBytes(translated, metadata);
+ } else {
+ storeEmbeddedBytes(rawBytes, metadata);
+ }
} finally {
- Files.delete(tmp);
+ if (translated != null) {
+ Files.delete(translated);
+ }
}
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a0a6377b8..4345c2a03 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -399,6 +399,15 @@ public abstract class TikaTest {
}
}
+ protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, ParseContext parseContext,
+ boolean suppressException) throws Exception {
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+ return getRecursiveMetadata(tis, parser, metadata, parseContext,
+ suppressException);
+ }
+ }
+
protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
boolean suppressException) throws Exception {
Metadata metadata = new Metadata();