https://github.com/python/cpython/commit/6d7a19e5334636f77cac135120fe81f343a73876
commit: 6d7a19e5334636f77cac135120fe81f343a73876
branch: main
author: Tomi Belan <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-05-30T09:23:50Z
summary:
gh-121109: Fix performance of tarfile reading with "r|*" (GH-121296)
files:
A Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
M Lib/tarfile.py
M Misc/ACKS
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index f5e4d6f887e9e6..a293a049247274 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("bz2 module is not available") from None
if mode == "r":
- self.dbuf = b""
self.cmp = bz2.BZ2Decompressor()
self.exception = OSError
else:
@@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("lzma module is not available") from None
if mode == "r":
- self.dbuf = b""
self.cmp = lzma.LZMADecompressor()
self.exception = lzma.LZMAError
else:
@@ -403,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
except ImportError:
raise CompressionError("compression.zstd module is not available") from None
if mode == "r":
- self.dbuf = b""
self.cmp = zstd.ZstdDecompressor()
self.exception = zstd.ZstdError
else:
@@ -485,7 +482,6 @@ def _init_read_gz(self):
"""Initialize for reading a gzip compressed fileobj.
"""
self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
- self.dbuf = b""
# taken from gzip.GzipFile with some alterations
if self.__read(2) != b"\037\213":
@@ -543,26 +539,44 @@ def _read(self, size):
if self.comptype == "tar":
return self.__read(size)
- c = len(self.dbuf)
- t = [self.dbuf]
+ c = 0
+ t = []
while c < size:
- # Skip underlying buffer to avoid unaligned double buffering.
- if self.buf:
- buf = self.buf
- self.buf = b""
+ if self.comptype == "gz":
+ # zlib interface is different than others.
+ # It returns data in unconsumed_tail.
+ if self.buf:
+ cbuf = self.buf
+ self.buf = b""
+ else:
+ cbuf = self.fileobj.read(self.bufsize)
+ if not cbuf:
+ break
+
+ try:
+ dbuf = self.cmp.decompress(cbuf, size - c)
+ self.buf = self.cmp.unconsumed_tail
+ except self.exception as e:
+ raise ReadError("invalid compressed data") from e
else:
- buf = self.fileobj.read(self.bufsize)
- if not buf:
- break
- try:
- buf = self.cmp.decompress(buf)
- except self.exception as e:
- raise ReadError("invalid compressed data") from e
- t.append(buf)
- c += len(buf)
- t = b"".join(t)
- self.dbuf = t[size:]
- return t[:size]
+ # Other decompressors have needs_input.
+ # decompress() can buffer data internally.
+ if self.cmp.needs_input:
+ cbuf = self.fileobj.read(self.bufsize)
+ if not cbuf:
+ break
+ else:
+ cbuf = b""
+
+ try:
+ dbuf = self.cmp.decompress(cbuf, size - c)
+ except self.exception as e:
+ raise ReadError("invalid compressed data") from e
+
+ t.append(dbuf)
+ c += len(dbuf)
+
+ return b"".join(t)
def __read(self, size):
"""Return size bytes from stream. If internal buffer is empty,
diff --git a/Misc/ACKS b/Misc/ACKS
index 234d0d2d0a2a16..14f0db7549534b 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -144,6 +144,7 @@ Bas van Beek
Ian Beer
Stefan Behnel
Reimer Behrends
+Tomi Belan
Maxime Bélanger
Ben Bell
Thomas Bellman
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
new file mode 100644
index 00000000000000..eca6014e4a0aed
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
@@ -0,0 +1,2 @@
+Fix :mod:`tarfile` performance issue when reading archives in streaming mode
+(e.g. ``r|*``).
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]