https://github.com/python/cpython/commit/9b7294c3a560f43f1e26a0f48c258829076d6464
commit: 9b7294c3a560f43f1e26a0f48c258829076d6464
branch: main
author: Barney Gale <[email protected]>
committer: barneygale <[email protected]>
date: 2024-11-04T19:29:57Z
summary:

GH-126363: Speed up pattern parsing in `pathlib.Path.glob()` (#126364)

The implementation of `Path.glob()` does rather a hacky thing: it calls
`self.with_segments()` to convert the given pattern to a `Path` object, and
then peeks at the private `_raw_path` attribute to see if pathlib removed a
trailing slash from the pattern.

In this patch, we make `glob()` use a new `_parse_pattern()` classmethod
that splits the pattern into parts while preserving information about any
trailing slash. This skips the cost of creating a `Path` object, and avoids
some path anchor normalization, which makes `Path.glob()` slightly faster.
But mostly it's about making the code less naughty.

Co-authored-by: Tomas R. <[email protected]>

files:
A Misc/NEWS.d/next/Library/2024-11-03-14-43-51.gh-issue-126363.Xus7vU.rst
M Lib/pathlib/_local.py

diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py
index ef072b83d96904..99474e1f71a307 100644
--- a/Lib/pathlib/_local.py
+++ b/Lib/pathlib/_local.py
@@ -274,6 +274,31 @@ def _parse_path(cls, path):
                 root = sep
         return drv, root, [x for x in rel.split(sep) if x and x != '.']
 
+    @classmethod
+    def _parse_pattern(cls, pattern):
+        """Parse a glob pattern to a list of parts. This is much like
+        _parse_path, except:
+
+        - Rather than normalizing and returning the drive and root, we raise
+          NotImplementedError if either are present.
+        - If the path has no real parts, we raise ValueError.
+        - If the path ends in a slash, then a final empty part is added.
+        """
+        drv, root, rel = cls.parser.splitroot(pattern)
+        if root or drv:
+            raise NotImplementedError("Non-relative patterns are unsupported")
+        sep = cls.parser.sep
+        altsep = cls.parser.altsep
+        if altsep:
+            rel = rel.replace(altsep, sep)
+        parts = [x for x in rel.split(sep) if x and x != '.']
+        if not parts:
+            raise ValueError(f"Unacceptable pattern: {str(pattern)!r}")
+        elif rel.endswith(sep):
+            # GH-65238: preserve trailing slash in glob patterns.
+            parts.append('')
+        return parts
+
     @property
     def _raw_path(self):
         """The joined but unnormalized path."""
@@ -641,17 +666,7 @@ def glob(self, pattern, *, case_sensitive=None, 
recurse_symlinks=False):
         kind, including directories) matching the given relative pattern.
         """
         sys.audit("pathlib.Path.glob", self, pattern)
-        if not isinstance(pattern, PurePath):
-            pattern = self.with_segments(pattern)
-        if pattern.anchor:
-            raise NotImplementedError("Non-relative patterns are unsupported")
-        parts = pattern._tail.copy()
-        if not parts:
-            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
-        raw = pattern._raw_path
-        if raw[-1] in (self.parser.sep, self.parser.altsep):
-            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
-            parts.append('')
+        parts = self._parse_pattern(pattern)
         select = self._glob_selector(parts[::-1], case_sensitive, 
recurse_symlinks)
         root = str(self)
         paths = select(root)
@@ -672,9 +687,7 @@ def rglob(self, pattern, *, case_sensitive=None, 
recurse_symlinks=False):
         this subtree.
         """
         sys.audit("pathlib.Path.rglob", self, pattern)
-        if not isinstance(pattern, PurePath):
-            pattern = self.with_segments(pattern)
-        pattern = '**' / pattern
+        pattern = self.parser.join('**', pattern)
         return self.glob(pattern, case_sensitive=case_sensitive, 
recurse_symlinks=recurse_symlinks)
 
     def walk(self, top_down=True, on_error=None, follow_symlinks=False):
diff --git 
a/Misc/NEWS.d/next/Library/2024-11-03-14-43-51.gh-issue-126363.Xus7vU.rst 
b/Misc/NEWS.d/next/Library/2024-11-03-14-43-51.gh-issue-126363.Xus7vU.rst
new file mode 100644
index 00000000000000..20fea9b9ef99a0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-03-14-43-51.gh-issue-126363.Xus7vU.rst
@@ -0,0 +1,2 @@
+Speed up pattern parsing in :meth:`pathlib.Path.glob` by skipping creation
+of a :class:`pathlib.Path` object for the pattern.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to