Skip to content

Commit 2b2d607

Browse files
jforberg, gaogaotiantian, picnixz, hauntsaninja
authored
gh-121267: Improve performance of tarfile (#121267) (#121269)
In its default write mode, tarfile spends much of its time resolving UIDs into user names and GIDs into group names. Caching these mappings yields a significant speedup: in my simple benchmark [1], the extra caching speeds up tarfile by 8x.

[1] https://gist.github.com/jforberg/86af759c796199740c31547ae828aef2

---------

Co-authored-by: Tian Gao <[email protected]>
Co-authored-by: Bénédikt Tran <[email protected]>
Co-authored-by: Shantanu <[email protected]>
1 parent 616468b commit 2b2d607

File tree

2 files changed

+19
-8
lines changed

2 files changed

+19
-8
lines changed

Lib/tarfile.py

+17 -8
Original file line number | Diff line number | Diff line change
@@ -1760,6 +1760,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
17601760
# current position in the archive file
17611761
self.inodes = {} # dictionary caching the inodes of
17621762
# archive members already added
1763+
self._unames = {} # Cached mappings of uid -> uname
1764+
self._gnames = {} # Cached mappings of gid -> gname
17631765

17641766
try:
17651767
if self.mode == "r":
@@ -2138,16 +2140,23 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
21382140
tarinfo.mtime = statres.st_mtime
21392141
tarinfo.type = type
21402142
tarinfo.linkname = linkname
2143+
2144+
# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
2145+
# speed things up, cache the resolved usernames and group names.
21412146
if pwd:
2142-
try:
2143-
tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2144-
except KeyError:
2145-
pass
2147+
if tarinfo.uid not in self._unames:
2148+
try:
2149+
self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
2150+
except KeyError:
2151+
self._unames[tarinfo.uid] = ''
2152+
tarinfo.uname = self._unames[tarinfo.uid]
21462153
if grp:
2147-
try:
2148-
tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2149-
except KeyError:
2150-
pass
2154+
if tarinfo.gid not in self._gnames:
2155+
try:
2156+
self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
2157+
except KeyError:
2158+
self._gnames[tarinfo.gid] = ''
2159+
tarinfo.gname = self._gnames[tarinfo.gid]
21512160

21522161
if type in (CHRTYPE, BLKTYPE):
21532162
if hasattr(os, "major") and hasattr(os, "minor"):
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Improve the performance of :mod:`tarfile` when writing files, by caching user names
2+
and group names.

0 commit comments

Comments
 (0)