https://github.com/python/cpython/commit/6d7a19e5334636f77cac135120fe81f343a73876
commit: 6d7a19e5334636f77cac135120fe81f343a73876
branch: main
author: Tomi Belan <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-05-30T09:23:50Z
summary:

gh-121109: Fix performance of tarfile reading with "r|*" (GH-121296)

files:
A Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
M Lib/tarfile.py
M Misc/ACKS

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index f5e4d6f887e9e6..a293a049247274 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("bz2 module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = bz2.BZ2Decompressor()
                     self.exception = OSError
                 else:
@@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("lzma module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = lzma.LZMADecompressor()
                     self.exception = lzma.LZMAError
                 else:
@@ -403,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("compression.zstd module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = zstd.ZstdDecompressor()
                     self.exception = zstd.ZstdError
                 else:
@@ -485,7 +482,6 @@ def _init_read_gz(self):
         """Initialize for reading a gzip compressed fileobj.
         """
         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
-        self.dbuf = b""
 
         # taken from gzip.GzipFile with some alterations
         if self.__read(2) != b"\037\213":
@@ -543,26 +539,44 @@ def _read(self, size):
         if self.comptype == "tar":
             return self.__read(size)
 
-        c = len(self.dbuf)
-        t = [self.dbuf]
+        c = 0
+        t = []
         while c < size:
-            # Skip underlying buffer to avoid unaligned double buffering.
-            if self.buf:
-                buf = self.buf
-                self.buf = b""
+            if self.comptype == "gz":
+                # zlib interface is different than others.
+                # It returns data in unconsumed_tail.
+                if self.buf:
+                    cbuf = self.buf
+                    self.buf = b""
+                else:
+                    cbuf = self.fileobj.read(self.bufsize)
+                    if not cbuf:
+                        break
+
+                try:
+                    dbuf = self.cmp.decompress(cbuf, size - c)
+                    self.buf = self.cmp.unconsumed_tail
+                except self.exception as e:
+                    raise ReadError("invalid compressed data") from e
             else:
-                buf = self.fileobj.read(self.bufsize)
-                if not buf:
-                    break
-            try:
-                buf = self.cmp.decompress(buf)
-            except self.exception as e:
-                raise ReadError("invalid compressed data") from e
-            t.append(buf)
-            c += len(buf)
-        t = b"".join(t)
-        self.dbuf = t[size:]
-        return t[:size]
+                # Other decompressors have needs_input.
+                # decompress() can buffer data internally.
+                if self.cmp.needs_input:
+                    cbuf = self.fileobj.read(self.bufsize)
+                    if not cbuf:
+                        break
+                else:
+                    cbuf = b""
+
+                try:
+                    dbuf = self.cmp.decompress(cbuf, size - c)
+                except self.exception as e:
+                    raise ReadError("invalid compressed data") from e
+
+            t.append(dbuf)
+            c += len(dbuf)
+
+        return b"".join(t)
 
     def __read(self, size):
         """Return size bytes from stream. If internal buffer is empty,
diff --git a/Misc/ACKS b/Misc/ACKS
index 234d0d2d0a2a16..14f0db7549534b 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -144,6 +144,7 @@ Bas van Beek
 Ian Beer
 Stefan Behnel
 Reimer Behrends
+Tomi Belan
 Maxime Bélanger
 Ben Bell
 Thomas Bellman
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
new file mode 100644
index 00000000000000..eca6014e4a0aed
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
@@ -0,0 +1,2 @@
+Fix :mod:`tarfile` performance issue when reading archives in streaming mode
+(e.g. ``r|*``).

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

Reply via email to