https://github.com/python/cpython/commit/6f93b4df92b8fbf80529cb6435789f5a75664a20
commit: 6f93b4df92b8fbf80529cb6435789f5a75664a20
branch: main
author: Barney Gale <[email protected]>
committer: barneygale <[email protected]>
date: 2024-02-10T18:12:34Z
summary:
GH-115060: Speed up `pathlib.Path.glob()` by removing redundant regex matching
(#115061)
When expanding and filtering paths for a `**` wildcard segment, build an
`re.Pattern` object from the subsequent pattern parts, rather than the entire
pattern, and match against the `os.DirEntry` object prior to instantiating a
path object. Also skip compiling a pattern when expanding a `*` wildcard
segment.
files:
A Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst
M Lib/pathlib/__init__.py
M Lib/pathlib/_abc.py
M Lib/test/test_pathlib/test_pathlib.py
diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py
index 65ce836765c42b..46834b1a76a6eb 100644
--- a/Lib/pathlib/__init__.py
+++ b/Lib/pathlib/__init__.py
@@ -587,9 +587,13 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)
- def _make_child_entry(self, entry):
+ def _direntry_str(self, entry):
+ # Transform an entry yielded from _scandir() into a path string.
+ return entry.name if str(self) == '.' else entry.path
+
+ def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
- path_str = entry.name if str(self) == '.' else entry.path
+ path_str = self._direntry_str(entry)
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py
index e4b1201a3703c3..27c6b4e367a050 100644
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@@ -86,19 +86,29 @@ def _select_children(parent_paths, dir_only,
follow_symlinks, match):
continue
except OSError:
continue
- if match(entry.name):
- yield parent_path._make_child_entry(entry)
+ # Avoid cost of making a path object for non-matching paths by
+ # matching against the os.DirEntry.name string.
+ if match is None or match(entry.name):
+ yield parent_path._make_child_direntry(entry)
-def _select_recursive(parent_paths, dir_only, follow_symlinks):
- """Yield given paths and all their subdirectories, recursively."""
+def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
+ """Yield given paths and all their children, recursively, filtering by
+ string and type.
+ """
if follow_symlinks is None:
follow_symlinks = False
for parent_path in parent_paths:
+ if match is not None:
+ # If we're filtering paths through a regex, record the length of
+ # the parent path. We'll pass it to match(path, pos=...) later.
+ parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
paths = [parent_path._make_child_relpath('')]
while paths:
path = paths.pop()
- yield path
+ if match is None or match(str(path), parent_len):
+ # Yield *directory* path that matches pattern (if any).
+ yield path
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
@@ -108,14 +118,22 @@ def _select_recursive(parent_paths, dir_only,
follow_symlinks):
pass
else:
for entry in entries:
+ # Handle directory entry.
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
- paths.append(path._make_child_entry(entry))
+ # Recurse into this directory.
+ paths.append(path._make_child_direntry(entry))
continue
except OSError:
pass
+
+ # Handle file entry.
if not dir_only:
- yield path._make_child_entry(entry)
+ # Avoid cost of making a path object for non-matching
+ # files by matching against the os.DirEntry object.
+ if match is None or match(path._direntry_str(entry),
parent_len):
+ # Yield *file* path that matches pattern (if any).
+ yield path._make_child_direntry(entry)
def _select_unique(paths):
@@ -750,8 +768,14 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())
- def _make_child_entry(self, entry):
+ def _direntry_str(self, entry):
+ # Transform an entry yielded from _scandir() into a path string.
+ # PathBase._scandir() yields PathBase objects, so use str().
+ return str(entry)
+
+ def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
+ # PathBase._scandir() yields PathBase objects, so this is a no-op.
return entry
def _make_child_relpath(self, name):
@@ -769,43 +793,49 @@ def glob(self, pattern, *, case_sensitive=None,
follow_symlinks=None):
stack = pattern._pattern_stack
specials = ('', '.', '..')
- filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
while stack:
part = stack.pop()
if part in specials:
+ # Join special component (e.g. '..') onto paths.
paths = _select_special(paths, part)
+
elif part == '**':
- # Consume adjacent '**' components.
+ # Consume following '**' components, which have no effect.
while stack and stack[-1] == '**':
stack.pop()
- # Consume adjacent non-special components and enable post-walk
- # regex filtering, provided we're treating symlinks
consistently.
+ # Consume following non-special components, provided we're
+ # treating symlinks consistently. Each component is joined
+ # onto 'part', which is used to generate an re.Pattern object.
if follow_symlinks is not None:
while stack and stack[-1] not in specials:
- filter_paths = True
- stack.pop()
+ part += sep + stack.pop()
- dir_only = bool(stack)
- paths = _select_recursive(paths, dir_only, follow_symlinks)
+ # If the previous loop consumed pattern components, compile an
+ # re.Pattern object based on those components.
+ match = _compile_pattern(part, sep, case_sensitive) if part !=
'**' else None
+
+ # Recursively walk directories, filtering by type and regex.
+ paths = _select_recursive(paths, bool(stack), follow_symlinks,
match)
+
+ # De-duplicate if we've already seen a '**' component.
if deduplicate_paths:
- # De-duplicate if we've already seen a '**' component.
paths = _select_unique(paths)
deduplicate_paths = True
+
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire
path component")
+
else:
- dir_only = bool(stack)
- match = _compile_pattern(part, sep, case_sensitive)
- paths = _select_children(paths, dir_only, follow_symlinks,
match)
- if filter_paths:
- # Filter out paths that don't match pattern.
- prefix_len = len(str(self._make_child_relpath('_'))) - 1
- match = _compile_pattern(pattern._pattern_str, sep, case_sensitive)
- paths = (path for path in paths if match(path._pattern_str,
prefix_len))
+ # If the pattern component isn't '*', compile an re.Pattern
+ # object based on the component.
+ match = _compile_pattern(part, sep, case_sensitive) if part !=
'*' else None
+
+ # Iterate over directories' children filtering by type and
regex.
+ paths = _select_children(paths, bool(stack), follow_symlinks,
match)
return paths
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -854,7 +884,7 @@ def walk(self, top_down=True, on_error=None,
follow_symlinks=False):
if is_dir:
if not top_down:
- paths.append(path._make_child_entry(entry))
+ paths.append(path._make_child_direntry(entry))
dirnames.append(entry.name)
else:
filenames.append(entry.name)
diff --git a/Lib/test/test_pathlib/test_pathlib.py
b/Lib/test/test_pathlib/test_pathlib.py
index 2b166451243775..c0dcf314da4bfc 100644
--- a/Lib/test/test_pathlib/test_pathlib.py
+++ b/Lib/test/test_pathlib/test_pathlib.py
@@ -1250,6 +1250,19 @@ def test_glob_pathlike(self):
self.assertEqual(expect, set(p.glob(P(pattern))))
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
+ @needs_symlinks
+ def test_glob_dot(self):
+ P = self.cls
+ with os_helper.change_cwd(P(self.base, "dirC")):
+ self.assertEqual(
+ set(P('.').glob('*')), {P("fileC"), P("novel.txt"), P("dirD")})
+ self.assertEqual(
+ set(P('.').glob('**')), {P("fileC"), P("novel.txt"),
P("dirD"), P("dirD/fileD"), P(".")})
+ self.assertEqual(
+ set(P('.').glob('**/*')), {P("fileC"), P("novel.txt"),
P("dirD"), P("dirD/fileD")})
+ self.assertEqual(
+ set(P('.').glob('**/*/*')), {P("dirD/fileD")})
+
def test_rglob_pathlike(self):
P = self.cls
p = P(self.base, "dirC")
diff --git
a/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst
b/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst
new file mode 100644
index 00000000000000..b358eeb569626f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst
@@ -0,0 +1 @@
+Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]