https://github.com/python/cpython/commit/2b2d607095335024e5e2bb358e3ef37650536839
commit: 2b2d607095335024e5e2bb358e3ef37650536839
branch: main
author: Johan Förberg <[email protected]>
committer: hauntsaninja <[email protected]>
date: 2024-10-30T15:08:30-07:00
summary:

gh-121267: Improve performance of tarfile (#121267) (#121269)

Tarfile in the default write mode spends much of its time resolving UIDs
into usernames and GIDs into group names. By caching these mappings, a
significant speedup can be achieved.

In my simple benchmark[1], this extra caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2

---------

Co-authored-by: Tian Gao <[email protected]>
Co-authored-by: Bénédikt Tran <[email protected]>
Co-authored-by: Shantanu <[email protected]>

files:
A Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
M Lib/tarfile.py

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 1475b3da2d3293..a0fab46b24e249 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -1760,6 +1760,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
                                 # current position in the archive file
         self.inodes = {}        # dictionary caching the inodes of
                                 # archive members already added
+        self._unames = {}       # Cached mappings of uid -> uname
+        self._gnames = {}       # Cached mappings of gid -> gname
 
         try:
             if self.mode == "r":
@@ -2138,16 +2140,23 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
         tarinfo.mtime = statres.st_mtime
         tarinfo.type = type
         tarinfo.linkname = linkname
+
+        # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
+        # speed things up, cache the resolved usernames and group names.
         if pwd:
-            try:
-                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
-            except KeyError:
-                pass
+            if tarinfo.uid not in self._unames:
+                try:
+                    self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
+                except KeyError:
+                    self._unames[tarinfo.uid] = ''
+            tarinfo.uname = self._unames[tarinfo.uid]
         if grp:
-            try:
-                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
-            except KeyError:
-                pass
+            if tarinfo.gid not in self._gnames:
+                try:
+                    self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
+                except KeyError:
+                    self._gnames[tarinfo.gid] = ''
+            tarinfo.gname = self._gnames[tarinfo.gid]
 
         if type in (CHRTYPE, BLKTYPE):
             if hasattr(os, "major") and hasattr(os, "minor"):
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
new file mode 100644
index 00000000000000..9e52405c15a82d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-15-56-42.gh-issue-121267.yFBWkh.rst
@@ -0,0 +1,2 @@
+Improve the performance of :mod:`tarfile` when writing files, by caching user names
+and group names.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to