https://github.com/python/cpython/commit/da4899b94a9a9083fed4972b2473546e0d997727
commit: da4899b94a9a9083fed4972b2473546e0d997727
branch: main
author: Barney Gale <[email protected]>
committer: barneygale <[email protected]>
date: 2025-02-28T20:33:51Z
summary:
GH-116380: Speed up `glob.[i]glob()` by making fewer system calls. (#116392)
## Filtered recursive walk
Expanding a recursive `**` segment entails walking the entire directory
tree, and so any subsequent pattern segments (except special segments) can
be evaluated by filtering the expanded paths through a regex. For example,
`glob.glob("foo/**/*.py", recursive=True)` recursively walks `foo/` with
`os.scandir()`, and then filters paths through a regex based on "`**/*.py`,
with no further filesystem access needed.
This fixes an issue where `glob()` could return duplicate results.
## Tracking path existence
We store a flag alongside each path indicating whether the path is
guaranteed to exist. As we process the pattern:
- Certain special pattern segments (`""`, `"."` and `".."`) leave the flag
unchanged
- Literal pattern segments (e.g. `foo/bar`) set the flag to false
- Wildcard pattern segments (e.g. `*/*.py`) set the flag to true (because
children are found via `os.scandir()`)
- Recursive pattern segments (e.g. `**`) leave the flag unchanged for the
root path, and set it to true for descendants discovered via
`os.scandir()`.
If the flag is false at the end, we call `lstat()` on each path to filter
out missing paths.
## Minor speed-ups
- Exclude paths that don't match a non-terminal non-recursive wildcard
pattern _prior_ to calling `is_dir()`.
- Use a stack rather than recursion to implement recursive wildcards.
- This fixes a recursion error when globbing deep trees.
- Pre-compile regular expressions and pre-join literal pattern segments.
- Convert to/from `bytes` (a minor use-case) in `iglob()` rather than
supporting `bytes` throughout. This particularly simplifies the code
needed to handle relative bytes paths with `dir_fd`.
- Avoid calling `os.path.join()`; instead we keep paths in a normalized
form and append trailing slashes when needed.
- Avoid calling `os.path.normcase()`; instead we use case-insensitive regex
matching.
## Implementation notes
Much of this functionality is already present in pathlib's implementation
of globbing. The specific additions we make are:
1. Support for `dir_fd`
2. Support for `include_hidden`
3. Support for generating paths relative to `root_dir`
This unifies the implementations of globbing in the `glob` and `pathlib`
modules.
Co-authored-by: Pieter Eendebak <[email protected]>
Co-authored-by: Bénédikt Tran <[email protected]>
files:
A Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst
M Doc/library/glob.rst
M Doc/whatsnew/3.14.rst
M Lib/glob.py
M Lib/pathlib/_abc.py
M Lib/pathlib/_local.py
M Lib/test/test_glob.py
diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst
index 684466d354aef8..849316553e408a 100644
--- a/Doc/library/glob.rst
+++ b/Doc/library/glob.rst
@@ -75,10 +75,6 @@ The :mod:`glob` module defines the following functions:
Using the "``**``" pattern in large directory trees may consume
an inordinate amount of time.
- .. note::
- This function may return duplicate path names if *pathname*
- contains multiple "``**``" patterns and *recursive* is true.
-
.. versionchanged:: 3.5
Support for recursive globs using "``**``".
@@ -88,6 +84,11 @@ The :mod:`glob` module defines the following functions:
.. versionchanged:: 3.11
Added the *include_hidden* parameter.
+ .. versionchanged:: 3.14
+ Matching path names are returned only once. In previous versions, this
+ function may return duplicate path names if *pathname* contains multiple
+ "``**``" patterns and *recursive* is true.
+
.. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \
include_hidden=False)
@@ -98,10 +99,6 @@ The :mod:`glob` module defines the following functions:
.. audit-event:: glob.glob pathname,recursive glob.iglob
.. audit-event:: glob.glob/2 pathname,recursive,root_dir,dir_fd glob.iglob
- .. note::
- This function may return duplicate path names if *pathname*
- contains multiple "``**``" patterns and *recursive* is true.
-
.. versionchanged:: 3.5
Support for recursive globs using "``**``".
@@ -111,6 +108,11 @@ The :mod:`glob` module defines the following functions:
.. versionchanged:: 3.11
Added the *include_hidden* parameter.
+ .. versionchanged:: 3.14
+ Matching path names are yielded only once. In previous versions, this
+ function may yield duplicate path names if *pathname* contains multiple
+ "``**``" patterns and *recursive* is true.
+
.. function:: escape(pathname)
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
index dbd59a9d7be150..3c876a193fad32 100644
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -968,6 +968,14 @@ base64
(Contributed by Bénédikt Tran, Chris Markiewicz, and Adam Turner in
:gh:`118761`.)
+glob
+----
+
+* Reduce the number of system calls in :func:`glob.glob` and
:func:`~glob.iglob`,
+ thereby improving the speed of globbing operations by 20-80%.
+ (Contributed by Barney Gale in :gh:`116380`.)
+
+
io
---
* :mod:`io` which provides the built-in :func:`open` makes less system calls
diff --git a/Lib/glob.py b/Lib/glob.py
index d1a6dddeeb1610..7ac497d4f0075a 100644
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -1,13 +1,10 @@
"""Filename globbing utility."""
-import contextlib
import os
import re
import fnmatch
import functools
-import itertools
import operator
-import stat
import sys
@@ -45,82 +42,35 @@ def iglob(pathname, *, root_dir=None, dir_fd=None,
recursive=False,
"""
sys.audit("glob.glob", pathname, recursive)
sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd)
- if root_dir is not None:
- root_dir = os.fspath(root_dir)
- else:
- root_dir = pathname[:0]
- it = _iglob(pathname, root_dir, dir_fd, recursive, False,
- include_hidden=include_hidden)
- if not pathname or recursive and _isrecursive(pathname[:2]):
- try:
- s = next(it) # skip empty string
- if s:
- it = itertools.chain((s,), it)
- except StopIteration:
- pass
- return it
-
-def _iglob(pathname, root_dir, dir_fd, recursive, dironly,
- include_hidden=False):
- dirname, basename = os.path.split(pathname)
- if not has_magic(pathname):
- assert not dironly
- if basename:
- if _lexists(_join(root_dir, pathname), dir_fd):
- yield pathname
- else:
- # Patterns ending with a slash should match only directories
- if _isdir(_join(root_dir, dirname), dir_fd):
- yield pathname
- return
- if not dirname:
- if recursive and _isrecursive(basename):
- yield from _glob2(root_dir, basename, dir_fd, dironly,
- include_hidden=include_hidden)
- else:
- yield from _glob1(root_dir, basename, dir_fd, dironly,
- include_hidden=include_hidden)
- return
- # `os.path.split()` returns the argument itself as a dirname if it is a
- # drive or UNC path. Prevent an infinite recursion if a drive or UNC path
- # contains magic characters (i.e. r'\\?\C:').
- if dirname != pathname and has_magic(dirname):
- dirs = _iglob(dirname, root_dir, dir_fd, recursive, True,
- include_hidden=include_hidden)
- else:
- dirs = [dirname]
- if has_magic(basename):
- if recursive and _isrecursive(basename):
- glob_in_dir = _glob2
- else:
- glob_in_dir = _glob1
+ pathname = os.fspath(pathname)
+ if isinstance(pathname, bytes):
+ pathname = os.fsdecode(pathname)
+ if root_dir is not None:
+ root_dir = os.fsdecode(root_dir)
+ paths = _iglob(pathname, root_dir, dir_fd, recursive, include_hidden)
+ return map(os.fsencode, paths)
else:
- glob_in_dir = _glob0
- for dirname in dirs:
- for name in glob_in_dir(_join(root_dir, dirname), basename, dir_fd,
dironly,
- include_hidden=include_hidden):
- yield os.path.join(dirname, name)
-
-# These 2 helper functions non-recursively glob inside a literal directory.
-# They return a list of basenames. _glob1 accepts a pattern while _glob0
-# takes a literal basename (so it only has to check for its existence).
-
-def _glob1(dirname, pattern, dir_fd, dironly, include_hidden=False):
- names = _listdir(dirname, dir_fd, dironly)
- if not (include_hidden or _ishidden(pattern)):
- names = (x for x in names if not _ishidden(x))
- return fnmatch.filter(names, pattern)
-
-def _glob0(dirname, basename, dir_fd, dironly, include_hidden=False):
- if basename:
- if _lexists(_join(dirname, basename), dir_fd):
- return [basename]
+ return _iglob(pathname, root_dir, dir_fd, recursive, include_hidden)
+
+def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden):
+ if os.path.altsep:
+ pathname = pathname.replace(os.path.altsep, os.path.sep)
+ drive, root, tail = os.path.splitroot(pathname)
+ parts = tail.split(os.path.sep)[::-1] if tail else []
+ globber = _StringGlobber(recursive=recursive,
include_hidden=include_hidden)
+ select = globber.selector(parts)
+ if drive:
+ root = drive + root
+ return select(root, dir_fd, root)
+ elif root:
+ return select(root, dir_fd, root, exists=True)
+ elif not root_dir:
+ return select(root, dir_fd, root, empty=True)
else:
- # `os.path.split()` returns an empty basename for paths ending with a
- # directory separator. 'q*x/' should match only directories.
- if _isdir(dirname, dir_fd):
- return [basename]
- return []
+ root = os.path.join(root_dir, '')
+ root_len = len(root)
+ paths = select(root, dir_fd, root, empty=True)
+ return (path[root_len:] for path in paths)
_deprecated_function_message = (
"{name} is deprecated and will be removed in Python {remove}. Use "
@@ -130,102 +80,16 @@ def _glob0(dirname, basename, dir_fd, dironly,
include_hidden=False):
def glob0(dirname, pattern):
import warnings
warnings._deprecated("glob.glob0", _deprecated_function_message,
remove=(3, 15))
- return _glob0(dirname, pattern, None, False)
+ dirname = os.path.join(dirname, '')
+ select = _StringGlobber().literal_selector(pattern, [])
+ return [path[len(dirname):] for path in select(dirname)]
def glob1(dirname, pattern):
import warnings
warnings._deprecated("glob.glob1", _deprecated_function_message,
remove=(3, 15))
- return _glob1(dirname, pattern, None, False)
-
-# This helper function recursively yields relative pathnames inside a literal
-# directory.
-
-def _glob2(dirname, pattern, dir_fd, dironly, include_hidden=False):
- assert _isrecursive(pattern)
- if not dirname or _isdir(dirname, dir_fd):
- yield pattern[:0]
- yield from _rlistdir(dirname, dir_fd, dironly,
- include_hidden=include_hidden)
-
-# If dironly is false, yields all file names inside a directory.
-# If dironly is true, yields only directory names.
-def _iterdir(dirname, dir_fd, dironly):
- try:
- fd = None
- fsencode = None
- if dir_fd is not None:
- if dirname:
- fd = arg = os.open(dirname, _dir_open_flags, dir_fd=dir_fd)
- else:
- arg = dir_fd
- if isinstance(dirname, bytes):
- fsencode = os.fsencode
- elif dirname:
- arg = dirname
- elif isinstance(dirname, bytes):
- arg = bytes(os.curdir, 'ASCII')
- else:
- arg = os.curdir
- try:
- with os.scandir(arg) as it:
- for entry in it:
- try:
- if not dironly or entry.is_dir():
- if fsencode is not None:
- yield fsencode(entry.name)
- else:
- yield entry.name
- except OSError:
- pass
- finally:
- if fd is not None:
- os.close(fd)
- except OSError:
- return
-
-def _listdir(dirname, dir_fd, dironly):
- with contextlib.closing(_iterdir(dirname, dir_fd, dironly)) as it:
- return list(it)
-
-# Recursively yields relative pathnames inside a literal directory.
-def _rlistdir(dirname, dir_fd, dironly, include_hidden=False):
- names = _listdir(dirname, dir_fd, dironly)
- for x in names:
- if include_hidden or not _ishidden(x):
- yield x
- path = _join(dirname, x) if dirname else x
- for y in _rlistdir(path, dir_fd, dironly,
- include_hidden=include_hidden):
- yield _join(x, y)
-
-
-def _lexists(pathname, dir_fd):
- # Same as os.path.lexists(), but with dir_fd
- if dir_fd is None:
- return os.path.lexists(pathname)
- try:
- os.lstat(pathname, dir_fd=dir_fd)
- except (OSError, ValueError):
- return False
- else:
- return True
-
-def _isdir(pathname, dir_fd):
- # Same as os.path.isdir(), but with dir_fd
- if dir_fd is None:
- return os.path.isdir(pathname)
- try:
- st = os.stat(pathname, dir_fd=dir_fd)
- except (OSError, ValueError):
- return False
- else:
- return stat.S_ISDIR(st.st_mode)
-
-def _join(dirname, basename):
- # It is common if dirname or basename is empty
- if not dirname or not basename:
- return dirname or basename
- return os.path.join(dirname, basename)
+ dirname = os.path.join(dirname, '')
+ select = _StringGlobber().wildcard_selector(pattern, [])
+ return [path[len(dirname):] for path in select(dirname)]
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')
@@ -237,15 +101,6 @@ def has_magic(s):
match = magic_check.search(s)
return match is not None
-def _ishidden(path):
- return path[0] in ('.', b'.'[0])
-
-def _isrecursive(pattern):
- if isinstance(pattern, bytes):
- return pattern == b'**'
- else:
- return pattern == '**'
-
def escape(pathname):
"""Escape all special characters.
"""
@@ -319,12 +174,13 @@ def translate(pat, *, recursive=False,
include_hidden=False, seps=None):
return fr'(?s:{res})\Z'
[email protected]_cache(maxsize=512)
-def _compile_pattern(pat, sep, case_sensitive, recursive=True):
[email protected]_cache(maxsize=1024)
+def _compile_pattern(pat, sep, case_sensitive, recursive, include_hidden):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity)."""
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
- regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
+ regex = translate(pat, recursive=recursive,
+ include_hidden=include_hidden, seps=sep)
return re.compile(regex, flags=flags).match
@@ -332,11 +188,13 @@ class _GlobberBase:
"""Abstract class providing shell-style pattern matching and globbing.
"""
- def __init__(self, sep, case_sensitive, case_pedantic=False,
recursive=False):
+ def __init__(self, sep=os.path.sep, case_sensitive=os.name != 'nt',
+ case_pedantic=False, recursive=False, include_hidden=False):
self.sep = sep
self.case_sensitive = case_sensitive
self.case_pedantic = case_pedantic
self.recursive = recursive
+ self.include_hidden = include_hidden
# Abstract methods
@@ -346,12 +204,38 @@ def lexists(path):
"""
raise NotImplementedError
+ @staticmethod
+ def lstat(path, dir_fd=None):
+ """Implements os.lstat()
+ """
+ raise NotImplementedError
+
+ @staticmethod
+ def open(path, flags, dir_fd=None):
+ """Implements os.open()
+ """
+ raise NotImplementedError
+
@staticmethod
def scandir(path):
"""Like os.scandir(), but generates (entry, name, path) tuples.
"""
raise NotImplementedError
+ @staticmethod
+ def scandir_cwd():
+ raise NotImplementedError
+
+ @staticmethod
+ def scandir_fd(fd, prefix):
+ raise NotImplementedError
+
+ @staticmethod
+ def close(fd):
+ """Implements os.close().
+ """
+ raise NotImplementedError
+
@staticmethod
def concat_path(path, text):
"""Implements path concatenation.
@@ -361,7 +245,8 @@ def concat_path(path, text):
# High-level methods
def compile(self, pat):
- return _compile_pattern(pat, self.sep, self.case_sensitive,
self.recursive)
+ return _compile_pattern(pat, self.sep, self.case_sensitive,
+ self.recursive, self.include_hidden)
def selector(self, parts):
"""Returns a function that selects from a given path, walking and
@@ -386,10 +271,14 @@ def special_selector(self, part, parts):
if parts:
part += self.sep
select_next = self.selector(parts)
+ if not part:
+ return select_next
- def select_special(path, exists=False):
+ def select_special(path, dir_fd=None, rel_path=None, exists=False,
empty=False):
path = self.concat_path(path, part)
- return select_next(path, exists)
+ if dir_fd is not None:
+ rel_path = self.concat_path(rel_path, part)
+ return select_next(path, dir_fd, rel_path, exists)
return select_special
def literal_selector(self, part, parts):
@@ -406,9 +295,11 @@ def literal_selector(self, part, parts):
select_next = self.selector(parts)
- def select_literal(path, exists=False):
+ def select_literal(path, dir_fd=None, rel_path=None, exists=False,
empty=False):
path = self.concat_path(path, part)
- return select_next(path, exists=False)
+ if dir_fd is not None:
+ rel_path = self.concat_path(rel_path, part)
+ return select_next(path, dir_fd, rel_path)
return select_literal
def wildcard_selector(self, part, parts):
@@ -416,14 +307,24 @@ def wildcard_selector(self, part, parts):
filtering by pattern.
"""
- match = None if part == '*' else self.compile(part)
+ match = None if self.include_hidden and part == '*' else
self.compile(part)
dir_only = bool(parts)
if dir_only:
select_next = self.selector(parts)
- def select_wildcard(path, exists=False):
+ def select_wildcard(path, dir_fd=None, rel_path=None, exists=False,
empty=False):
+ close_fd = False
try:
- entries = self.scandir(path)
+ if dir_fd is None:
+ fd = None
+ entries = self.scandir(path) if path else
self.scandir_cwd()
+ elif not rel_path:
+ fd = dir_fd
+ entries = self.scandir_fd(fd, path)
+ else:
+ fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd)
+ close_fd = True
+ entries = self.scandir_fd(fd, path)
except OSError:
pass
else:
@@ -436,9 +337,17 @@ def select_wildcard(path, exists=False):
except OSError:
continue
entry_path = self.concat_path(entry_path, self.sep)
- yield from select_next(entry_path, exists=True)
+ if fd is not None:
+ entry_name = entry_name + self.sep
+ yield from select_next(
+ entry_path, fd, entry_name, exists=True)
else:
+ # Optimization: directly yield the path if this is
+ # last pattern part.
yield entry_path
+ finally:
+ if close_fd:
+ self.close(fd)
return select_wildcard
def recursive_selector(self, part, parts):
@@ -460,26 +369,49 @@ def recursive_selector(self, part, parts):
while parts and parts[-1] not in _special_parts:
part += self.sep + parts.pop()
- match = None if part == '**' else self.compile(part)
+ match = None if self.include_hidden and part == '**' else
self.compile(part)
dir_only = bool(parts)
select_next = self.selector(parts)
- def select_recursive(path, exists=False):
+ def select_recursive(path, dir_fd=None, rel_path=None, exists=False,
empty=False):
match_pos = len(str(path))
if match is None or match(str(path), match_pos):
- yield from select_next(path, exists)
- stack = [path]
- while stack:
- yield from select_recursive_step(stack, match_pos)
+ yield from select_next(path, dir_fd, rel_path, exists, empty)
+ stack = [(path, dir_fd, rel_path)]
+ try:
+ while stack:
+ yield from select_recursive_step(stack, match_pos)
+ finally:
+ # Close any file descriptors still on the stack.
+ while stack:
+ path, dir_fd, _rel_path = stack.pop()
+ if path is None:
+ try:
+ self.close(dir_fd)
+ except OSError:
+ pass
def select_recursive_step(stack, match_pos):
- path = stack.pop()
+ path, dir_fd, rel_path = stack.pop()
try:
- entries = self.scandir(path)
+ if path is None:
+ self.close(dir_fd)
+ return
+ elif dir_fd is None:
+ fd = None
+ entries = self.scandir(path) if path else
self.scandir_cwd()
+ elif not rel_path:
+ fd = dir_fd
+ entries = self.scandir_fd(fd, path)
+ else:
+ fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd)
+ # Schedule the file descriptor to be closed next step.
+ stack.append((None, fd, None))
+ entries = self.scandir_fd(fd, path)
except OSError:
pass
else:
- for entry, _entry_name, entry_path in entries:
+ for entry, entry_name, entry_path in entries:
is_dir = False
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
@@ -491,25 +423,38 @@ def select_recursive_step(stack, match_pos):
entry_path_str = str(entry_path)
if dir_only:
entry_path = self.concat_path(entry_path, self.sep)
+ if fd is not None:
+ entry_name = entry_name + self.sep
if match is None or match(entry_path_str, match_pos):
if dir_only:
- yield from select_next(entry_path, exists=True)
+ yield from select_next(
+ entry_path, fd, entry_name, exists=True)
else:
# Optimization: directly yield the path if
this is
# last pattern part.
yield entry_path
if is_dir:
- stack.append(entry_path)
+ stack.append((entry_path, fd, entry_name))
return select_recursive
- def select_exists(self, path, exists=False):
- """Yields the given path, if it exists.
+ def select_exists(self, path, dir_fd=None, rel_path=None, exists=False,
empty=False):
+ """Yields the given path, if it exists. If *dir_fd* is given, we check
+ whether *rel_path* exists relative to the fd.
"""
- if exists:
+ if empty:
+ # Suppress initial path so iglob() doesn't yield the empty string.
+ pass
+ elif exists:
# Optimization: this path is already known to exist, e.g. because
# it was returned from os.scandir(), so we skip calling lstat().
yield path
+ elif dir_fd is not None:
+ try:
+ self.lstat(rel_path, dir_fd=dir_fd)
+ yield path
+ except OSError:
+ pass
elif self.lexists(path):
yield path
@@ -518,6 +463,9 @@ class _StringGlobber(_GlobberBase):
"""Provides shell-style pattern matching and globbing for string paths.
"""
lexists = staticmethod(os.path.lexists)
+ lstat = staticmethod(os.lstat)
+ open = staticmethod(os.open)
+ close = staticmethod(os.close)
concat_path = operator.add
@staticmethod
@@ -528,6 +476,20 @@ def scandir(path):
entries = list(scandir_it)
return ((entry, entry.name, entry.path) for entry in entries)
+ @staticmethod
+ def scandir_cwd():
+ with os.scandir() as scandir_it:
+ entries = list(scandir_it)
+ # Suppress leading dot when scanning current directory.
+ return ((entry, entry.name, entry.name) for entry in entries)
+
+ @staticmethod
+ def scandir_fd(fd, prefix):
+ prefix = os.path.join(prefix, prefix[:0])
+ with os.scandir(fd) as scandir_it:
+ entries = list(scandir_it)
+ return ((entry, entry.name, prefix + entry.name) for entry in entries)
+
class _PathGlobber(_GlobberBase):
"""Provides shell-style pattern matching and globbing for pathlib paths.
diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py
index 930701d4789f5c..bd2252ef77dcf6 100644
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@@ -196,7 +196,8 @@ def full_match(self, pattern, *, case_sensitive=None):
pattern = self.with_segments(pattern)
if case_sensitive is None:
case_sensitive = self.parser.normcase('Aa') == 'Aa'
- globber = _PathGlobber(pattern.parser.sep, case_sensitive,
recursive=True)
+ globber = _PathGlobber(pattern.parser.sep, case_sensitive,
+ recursive=True, include_hidden=True)
match = globber.compile(str(pattern))
return match(str(self)) is not None
diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py
index 83da6960f00489..492bae0b7f7cd8 100644
--- a/Lib/pathlib/_local.py
+++ b/Lib/pathlib/_local.py
@@ -558,7 +558,8 @@ def full_match(self, pattern, *, case_sensitive=None):
# paths shouldn't match wildcards, so we change it to the empty string.
path = str(self) if self.parts else ''
pattern = str(pattern) if pattern.parts else ''
- globber = _StringGlobber(self.parser.sep, case_sensitive,
recursive=True)
+ globber = _StringGlobber(self.parser.sep, case_sensitive,
+ recursive=True, include_hidden=True)
return globber.compile(pattern)(path) is not None
def match(self, path_pattern, *, case_sensitive=None):
@@ -580,7 +581,8 @@ def match(self, path_pattern, *, case_sensitive=None):
return False
if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
return False
- globber = _StringGlobber(self.parser.sep, case_sensitive)
+ globber = _StringGlobber(self.parser.sep, case_sensitive,
+ include_hidden=True)
for path_part, pattern_part in zip(path_parts, pattern_parts):
match = globber.compile(pattern_part)
if match(path_part) is None:
@@ -855,7 +857,8 @@ def glob(self, pattern, *, case_sensitive=None,
recurse_symlinks=False):
case_pedantic = True
parts = self._parse_pattern(pattern)
recursive = True if recurse_symlinks else _no_recurse_symlinks
- globber = _StringGlobber(self.parser.sep, case_sensitive,
case_pedantic, recursive)
+ globber = _StringGlobber(self.parser.sep, case_sensitive,
case_pedantic,
+ recursive, include_hidden=True)
select = globber.selector(parts[::-1])
root = str(self)
paths = select(self.parser.join(root, ''))
diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py
index da73769c16e9af..2290e06e60e377 100644
--- a/Lib/test/test_glob.py
+++ b/Lib/test/test_glob.py
@@ -4,14 +4,18 @@
import shutil
import sys
import unittest
+import unittest.mock
import warnings
from test import support
-from test.support import is_wasi, Py_DEBUG
+from test.support import is_wasi, Py_DEBUG, infinite_recursion
from test.support.os_helper import (TESTFN, skip_unless_symlink,
can_symlink, create_empty_file, change_cwd)
+_supports_dir_fd = {os.open, os.stat} <= os.supports_dir_fd and os.scandir in
os.supports_fd
+
+
class GlobTests(unittest.TestCase):
dir_fd = None
@@ -49,7 +53,7 @@ def setUp(self):
def open_dirfd(self):
if self.dir_fd is not None:
os.close(self.dir_fd)
- if {os.open, os.stat} <= os.supports_dir_fd and os.scandir in
os.supports_fd:
+ if _supports_dir_fd:
self.dir_fd = os.open(self.tempdir, os.O_RDONLY | os.O_DIRECTORY)
else:
self.dir_fd = None
@@ -177,20 +181,18 @@ def test_glob_directory_with_trailing_slash(self):
self.assertEqual(glob.glob(self.norm('Z*Z') + sep), [])
self.assertEqual(glob.glob(self.norm('ZZZ') + sep), [])
self.assertEqual(glob.glob(self.norm('aaa') + sep),
- [self.norm('aaa') + sep])
- # Preserving the redundant separators is an implementation detail.
+ [self.norm('aaa') + os.sep])
+ # Redundant separators are preserved and normalized
self.assertEqual(glob.glob(self.norm('aaa') + sep*2),
- [self.norm('aaa') + sep*2])
+ [self.norm('aaa') + os.sep*2])
# When there is a wildcard pattern which ends with a pathname
# separator, glob() doesn't blow.
# The result should end with the pathname separator.
- # Normalizing the trailing separator is an implementation detail.
eq = self.assertSequencesEqual_noorder
eq(glob.glob(self.norm('aa*') + sep),
[self.norm('aaa') + os.sep, self.norm('aab') + os.sep])
- # Stripping the redundant separators is an implementation detail.
eq(glob.glob(self.norm('aa*') + sep*2),
- [self.norm('aaa') + os.sep, self.norm('aab') + os.sep])
+ [self.norm('aaa') + os.sep*2, self.norm('aab') + os.sep*2])
def test_glob_bytes_directory_with_trailing_slash(self):
# Same as test_glob_directory_with_trailing_slash, but with a
@@ -200,16 +202,16 @@ def test_glob_bytes_directory_with_trailing_slash(self):
self.assertEqual(glob.glob(os.fsencode(self.norm('Z*Z') + sep)),
[])
self.assertEqual(glob.glob(os.fsencode(self.norm('ZZZ') + sep)),
[])
self.assertEqual(glob.glob(os.fsencode(self.norm('aaa') + sep)),
- [os.fsencode(self.norm('aaa') + sep)])
+ [os.fsencode(self.norm('aaa') + os.sep)])
self.assertEqual(glob.glob(os.fsencode(self.norm('aaa') + sep*2)),
- [os.fsencode(self.norm('aaa') + sep*2)])
+ [os.fsencode(self.norm('aaa') + os.sep*2)])
eq = self.assertSequencesEqual_noorder
eq(glob.glob(os.fsencode(self.norm('aa*') + sep)),
[os.fsencode(self.norm('aaa') + os.sep),
os.fsencode(self.norm('aab') + os.sep)])
eq(glob.glob(os.fsencode(self.norm('aa*') + sep*2)),
- [os.fsencode(self.norm('aaa') + os.sep),
- os.fsencode(self.norm('aab') + os.sep)])
+ [os.fsencode(self.norm('aaa') + os.sep*2),
+ os.fsencode(self.norm('aab') + os.sep*2)])
@skip_unless_symlink
def test_glob_symlinks(self):
@@ -326,8 +328,12 @@ def test_recursive_glob(self):
with change_cwd(self.tempdir):
join = os.path.join
eq(glob.glob('**', recursive=True), [join(*i) for i in full])
+ eq(glob.glob(join('**', '**'), recursive=True),
+ [join(*i) for i in full])
eq(glob.glob(join('**', ''), recursive=True),
[join(*i) for i in dirs])
+ eq(glob.glob(join('**', '**', ''), recursive=True),
+ [join(*i) for i in dirs])
eq(glob.glob(join('**', '*'), recursive=True),
[join(*i) for i in full])
eq(glob.glob(join(os.curdir, '**'), recursive=True),
@@ -394,6 +400,33 @@ def test_glob_many_open_files(self):
for it in iters:
self.assertEqual(next(it), p)
+ def test_glob_above_recursion_limit(self):
+ depth = 30
+ base = os.path.join(self.tempdir, 'deep')
+ p = os.path.join(base, *(['d']*depth))
+ os.makedirs(p)
+ pattern = os.path.join(base, '**', 'd')
+ with infinite_recursion(depth - 5):
+ glob.glob(pattern, recursive=True)
+
+ @unittest.skipUnless(_supports_dir_fd, "Needs support for
iglob(dir_fd=...)")
+ def test_iglob_iter_close(self):
+ base = os.path.join(self.tempdir, 'deep')
+ p = os.path.join(base, *(['d'] * 10))
+ os.makedirs(p)
+ with (
+ unittest.mock.patch("glob._StringGlobber.open", wraps=os.open) as
os_open,
+ unittest.mock.patch("glob._StringGlobber.close", wraps=os.close)
as os_close
+ ):
+ self.assertEqual(os_open.call_count, os_close.call_count)
+ iter = glob.iglob('**/*/d', dir_fd=self.dir_fd, recursive=True)
+ self.assertEqual(os_open.call_count, os_close.call_count)
+ self.assertEqual(next(iter), 'deep/d')
+ self.assertEqual(next(iter), 'deep/d/d')
+ self.assertGreater(os_open.call_count, os_close.call_count)
+ iter.close()
+ self.assertEqual(os_open.call_count, os_close.call_count)
+
def test_glob0(self):
with self.assertWarns(DeprecationWarning):
glob.glob0(self.tempdir, 'a')
diff --git
a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst
b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst
new file mode 100644
index 00000000000000..b7f27ab7191a96
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst
@@ -0,0 +1,2 @@
+Speed up :func:`glob.glob` and :func:`glob.iglob` by making use of
+:func:`glob.translate` and tracking path existence more precisely.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]