This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
     new b2e5df17c TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
b2e5df17c is described below

commit b2e5df17cbf02e73733c0985ae47c6fc63af0a68
Author: tballison <talli...@apache.org>
AuthorDate: Mon Nov 7 15:42:31 2022 -0500

    TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
---
 .../src/main/java/org/apache/tika/io/TikaInputStream.java | 10 +++++++---
 .../apache/tika/detect/zip/StreamingZipContainerDetector.java | 8 +++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 76db4a2a3..62daabcc0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -710,12 +710,16 @@ public class TikaInputStream extends TaggedInputStream {
             } else {
                 Path tmpFile = tmp.createTempFile(suffix);
                 if (maxBytes > -1) {
-                    try (InputStream lookAhead = new LookaheadInputStream(this, maxBytes)) {
-                        Files.copy(lookAhead, tmpFile, REPLACE_EXISTING);
-                        if (Files.size(tmpFile) >= maxBytes) {
+                    this.mark(maxBytes);
+                    try (BoundedInputStream boundedInputStream =
+                            new BoundedInputStream(maxBytes, this)) {
+                        Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
+                        if (boundedInputStream.hasHitBound()) {
                             //tmpFile will be cleaned up when this TikaInputStream is closed
                             return null;
                         }
+                    } finally {
+                        this.reset();
                     }
                 } else {
                     // Spool the entire stream into a temporary file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
index c96e01323..b4a8c6771 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
@@ -25,7 +25,7 @@ import org.apache.commons.io.IOUtils;
 
 import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.io.LookaheadInputStream;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
@@ -80,9 +80,11 @@ public class StreamingZipContainerDetector extends DefaultZipContainerDetector {
         if (type == TIFF) {
             return TIFF;
         } else if (isZipArchive(type)) {
-
-            try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+            input.mark(markLimit);
+            try (BoundedInputStream lookahead = new BoundedInputStream(markLimit, input)) {
                 return detectStreaming(lookahead, metadata);
+            } finally {
+                input.reset();
             }
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;