This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4518 in repository https://gitbox.apache.org/repos/asf/tika.git
commit abbf3fd0a69626b85c5d431d345352c094cf3691 Author: tallison <[email protected]> AuthorDate: Wed Oct 15 08:35:59 2025 -0400 TIKA-4518 -- improve pst handling with -Z option, WIP --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +-- .../extractor/DefaultEmbeddedStreamTranslator.java | 14 ++++---- .../tika/extractor/EmbeddedStreamTranslator.java | 7 ++-- .../apache/tika/extractor/RUnpackExtractor.java | 13 +++++--- .../microsoft/MSEmbeddedStreamTranslator.java | 39 ++++++++++------------ .../server/core/resource/UnpackerResource.java | 26 ++++++--------- 6 files changed, 47 insertions(+), 56 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 28a9b29c7..a1db5f8bf 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -1112,9 +1112,7 @@ public class TikaCLI { try (OutputStream os = Files.newOutputStream(outputFile)) { if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { - try (InputStream translated = embeddedStreamTranslator.translate(tis, metadata)) { - IOUtils.copy(translated, os); - } + embeddedStreamTranslator.translate(tis, metadata, os); } else { IOUtils.copy(tis, os); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java index 537c5ffa1..bf2321481 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java @@ -17,10 +17,11 @@ package org.apache.tika.extractor; import java.io.IOException; -import java.io.InputStream; +import java.io.OutputStream; import java.util.List; import org.apache.tika.config.ServiceLoader; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.ServiceLoaderUtils; @@ -58,7 +59,7 @@ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator * @throws IOException */ @Override - public boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException { + public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException { for (EmbeddedStreamTranslator translator : translators) { if (translator.shouldTranslate(inputStream, metadata)) { return true; @@ -75,13 +76,12 @@ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator * @throws IOException */ @Override - public InputStream translate(InputStream inputStream, Metadata metadata) throws IOException { + public void translate(TikaInputStream inputStream, Metadata metadata, OutputStream outputStream) throws IOException { for (EmbeddedStreamTranslator translator : translators) { - InputStream translated = translator.translate(inputStream, metadata); - if (translated != null) { - return translated; + if (translator.shouldTranslate(inputStream, metadata)) { + translator.translate(inputStream, metadata, outputStream); + return; } } - return inputStream; } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java index b2ce05db4..2391f0be5 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java @@ -18,7 +18,9 @@ package org.apache.tika.extractor; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; /** @@ -30,9 +32,8 @@ import org.apache.tika.metadata.Metadata; */ public interface EmbeddedStreamTranslator { - boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException; + boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException; - InputStream translate(InputStream inputStream, - Metadata metadata) throws IOException; + void translate(TikaInputStream inputStream, Metadata metadata, OutputStream os) throws IOException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 42544dc80..c5d8185b2 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -21,6 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; @@ -110,18 +111,20 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { } } - private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata) + private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { //TODO -- improve the efficiency of this so that we're not //literally writing out a file per request Path tmp = Files.createTempFile("tika-tmp-", ".bin"); - if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) { - Files.copy(embeddedStreamTranslator.translate(stream, metadata), tmp, StandardCopyOption.REPLACE_EXISTING); + if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { + try (OutputStream os = Files.newOutputStream(tmp)) { + embeddedStreamTranslator.translate(tis, metadata, os); + } } else { - Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING); + Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING); } try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) { - parse(tmpTis, handler, metadata); + parse(tis, handler, metadata); } finally { try { storeEmbeddedBytes(tmp, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java index 24f7ec2d3..3833b91da 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java @@ -18,9 +18,12 @@ package org.apache.tika.extractor.microsoft; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.commons.io.output.CloseShieldOutputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DocumentEntry; @@ -43,26 +46,22 @@ public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator { private static final Logger LOG = LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class); @Override - public boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException { + public boolean shouldTranslate(TikaInputStream tis, Metadata metadata) throws IOException { String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE); if ("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) { return true; - } else if (inputStream instanceof TikaInputStream) { - TikaInputStream tin = (TikaInputStream) inputStream; - if (tin.getOpenContainer() != null && - tin.getOpenContainer() instanceof DirectoryEntry) { - return true; - } + } else { + return tis.getOpenContainer() != null && + tis.getOpenContainer() instanceof DirectoryEntry; } - return false; } @Override - public InputStream translate(InputStream inputStream, Metadata metadata) throws IOException { + public void translate(TikaInputStream tis, Metadata metadata, OutputStream os) throws IOException { String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE); if ("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) { UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); - IOUtils.copy(inputStream, bos); + IOUtils.copy(tis, bos); POIFSFileSystem poifs = new POIFSFileSystem(bos.toInputStream()); OfficeParser.POIFSDocumentType type = OfficeParser.POIFSDocumentType.detectType(poifs); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); @@ -82,21 +81,17 @@ public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator { name += '.' + type.getExtension(); } metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); - return UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(); - } else if (inputStream instanceof TikaInputStream) { - TikaInputStream tin = (TikaInputStream) inputStream; - - if (tin.getOpenContainer() != null && - tin.getOpenContainer() instanceof DirectoryEntry) { - POIFSFileSystem fs = new POIFSFileSystem(); - copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot()); - try (UnsynchronizedByteArrayOutputStream bos2 = UnsynchronizedByteArrayOutputStream.builder().get()) { - fs.writeFilesystem(bos2); - return bos2.toInputStream(); + os.write(data); + os.flush(); + } else { + if (tis.getOpenContainer() != null && + tis.getOpenContainer() instanceof DirectoryEntry) { + try (POIFSFileSystem fs = new POIFSFileSystem()) { + copy((DirectoryEntry) tis.getOpenContainer(), fs.getRoot()); + fs.writeFilesystem(CloseShieldOutputStream.wrap(os)); } } } - return inputStream; } protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) throws IOException { diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java index a2e3064d6..0d75c9bec 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java @@ -193,12 +193,16 @@ public class UnpackerResource { .builder() .get(); - BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, tis); - IOUtils.copy(bis, bos); - if (bis.hasHitBound()) { - throw new IOException(new TikaMemoryLimitException( - "An attachment is longer than " + "'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " + "If you need to increase this " + - "limit, add a header to your request, such as: unpackMaxBytes: " + "1073741824. There is a hard limit of 2GB.")); + if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { + embeddedStreamTranslator.translate(tis, metadata, bos); + } else { + BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, tis); + IOUtils.copy(bis, bos); + if (bis.hasHitBound()) { + throw new IOException(new TikaMemoryLimitException( + "An attachment is longer than " + "'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " + "If you need to increase this " + + "limit, add a header to your request, such as: unpackMaxBytes: " + "1073741824. There is a hard limit of 2GB.")); + } } byte[] data = bos.toByteArray(); @@ -224,16 +228,6 @@ public class UnpackerResource { LOG.warn("Unexpected MimeTypeException", e); } } - try (TikaInputStream is = TikaInputStream.get(data)) { - if (embeddedStreamTranslator.shouldTranslate(is, metadata)) { - InputStream translated = embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), metadata); - UnsynchronizedByteArrayOutputStream bos2 = UnsynchronizedByteArrayOutputStream - .builder() - .get(); - IOUtils.copy(translated, bos2); - data = bos2.toByteArray(); - } - } final String finalName = getFinalName(name, zout);
