https://github.com/python/cpython/commit/78cb377c622a98b1bf58df40c828e886575a6927
commit: 78cb377c622a98b1bf58df40c828e886575a6927
branch: main
author: Bénédikt Tran <10796600+picn...@users.noreply.github.com>
committer: barneygale <barney.g...@gmail.com>
date: 2024-11-27T16:42:45Z
summary:

gh-122288: Improve performances of `fnmatch.translate` (#122289)

Improve performance of this function by a factor of 1.7x.

Co-authored-by: Barney Gale <barney.g...@gmail.com>

files:
A Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst
M Lib/fnmatch.py
M Lib/glob.py
M Lib/test/test_fnmatch.py

diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py
index 73acb1fe8d4106..865baea23467ea 100644
--- a/Lib/fnmatch.py
+++ b/Lib/fnmatch.py
@@ -77,24 +77,30 @@ def translate(pat):
     There is no way to quote meta-characters.
     """
 
-    STAR = object()
-    parts = _translate(pat, STAR, '.')
-    return _join_translated_parts(parts, STAR)
+    parts, star_indices = _translate(pat, '*', '.')
+    return _join_translated_parts(parts, star_indices)
 
+_re_setops_sub = re.compile(r'([&~|])').sub
+_re_escape = functools.lru_cache(maxsize=512)(re.escape)
 
-def _translate(pat, STAR, QUESTION_MARK):
+def _translate(pat, star, question_mark):
     res = []
     add = res.append
+    star_indices = []
+
     i, n = 0, len(pat)
     while i < n:
         c = pat[i]
         i = i+1
         if c == '*':
+            # store the position of the wildcard
+            star_indices.append(len(res))
+            add(star)
             # compress consecutive `*` into one
-            if (not res) or res[-1] is not STAR:
-                add(STAR)
+            while i < n and pat[i] == '*':
+                i += 1
         elif c == '?':
-            add(QUESTION_MARK)
+            add(question_mark)
         elif c == '[':
             j = i
             if j < n and pat[j] == '!':
@@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
                     # Hyphens that create ranges shouldn't be escaped.
                     stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                      for s in chunks)
-                # Escape set operations (&&, ~~ and ||).
-                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                 i = j+1
                 if not stuff:
                     # Empty range: never match.
@@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
                     # Negated empty range: match any character.
                     add('.')
                 else:
+                    # Escape set operations (&&, ~~ and ||).
+                    stuff = _re_setops_sub(r'\\\1', stuff)
                     if stuff[0] == '!':
                         stuff = '^' + stuff[1:]
                     elif stuff[0] in ('^', '['):
                         stuff = '\\' + stuff
                     add(f'[{stuff}]')
         else:
-            add(re.escape(c))
-    assert i == n
-    return res
-
-
-def _join_translated_parts(inp, STAR):
-    # Deal with STARs.
-    res = []
-    add = res.append
-    i, n = 0, len(inp)
-    # Fixed pieces at the start?
-    while i < n and inp[i] is not STAR:
-        add(inp[i])
-        i += 1
-    # Now deal with STAR fixed STAR fixed ...
-    # For an interior `STAR fixed` pairing, we want to do a minimal
-    # .*? match followed by `fixed`, with no possibility of backtracking.
-    # Atomic groups ("(?>...)") allow us to spell that directly.
-    # Note: people rely on the undocumented ability to join multiple
-    # translate() results together via "|" to build large regexps matching
-    # "one of many" shell patterns.
-    while i < n:
-        assert inp[i] is STAR
-        i += 1
-        if i == n:
-            add(".*")
-            break
-        assert inp[i] is not STAR
-        fixed = []
-        while i < n and inp[i] is not STAR:
-            fixed.append(inp[i])
-            i += 1
-        fixed = "".join(fixed)
-        if i == n:
-            add(".*")
-            add(fixed)
-        else:
-            add(f"(?>.*?{fixed})")
+            add(_re_escape(c))
     assert i == n
-    res = "".join(res)
+    return res, star_indices
+
+
+def _join_translated_parts(parts, star_indices):
+    if not star_indices:
+        return fr'(?s:{"".join(parts)})\Z'
+    iter_star_indices = iter(star_indices)
+    j = next(iter_star_indices)
+    buffer = parts[:j]  # fixed pieces at the start
+    append, extend = buffer.append, buffer.extend
+    i = j + 1
+    for j in iter_star_indices:
+        # Now deal with STAR fixed STAR fixed ...
+        # For an interior `STAR fixed` pairing, we want to do a minimal
+        # .*? match followed by `fixed`, with no possibility of backtracking.
+        # Atomic groups ("(?>...)") allow us to spell that directly.
+        # Note: people rely on the undocumented ability to join multiple
+        # translate() results together via "|" to build large regexps matching
+        # "one of many" shell patterns.
+        append('(?>.*?')
+        extend(parts[i:j])
+        append(')')
+        i = j + 1
+    append('.*')
+    extend(parts[i:])
+    res = ''.join(buffer)
     return fr'(?s:{res})\Z'
diff --git a/Lib/glob.py b/Lib/glob.py
index ce9b3698888dd9..690ab1b8b9fb1d 100644
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, 
include_hidden=False, seps=None):
             if part:
                 if not include_hidden and part[0] in '*?':
                     results.append(r'(?!\.)')
-                results.extend(fnmatch._translate(part, f'{not_sep}*', 
not_sep))
+                results.extend(fnmatch._translate(part, f'{not_sep}*', 
not_sep)[0])
             if idx < last_part_idx:
                 results.append(any_sep)
     res = ''.join(results)
diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py
index 10ed496d4e2f37..9f360e1dc10f47 100644
--- a/Lib/test/test_fnmatch.py
+++ b/Lib/test/test_fnmatch.py
@@ -250,6 +250,75 @@ def test_translate(self):
         self.assertTrue(re.match(fatre, 'cbabcaxc'))
         self.assertFalse(re.match(fatre, 'dabccbad'))
 
+    def test_translate_wildcards(self):
+        for pattern, expect in [
+            ('ab*', r'(?s:ab.*)\Z'),
+            ('ab*cd', r'(?s:ab.*cd)\Z'),
+            ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
+            ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
+            ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
+            ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
+            ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+        for pattern, expect in [
+            ('*ab', r'(?s:.*ab)\Z'),
+            ('*ab*', r'(?s:(?>.*?ab).*)\Z'),
+            ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
+            ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
+            ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
+            ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
+            ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
+            ('*ab*cd*12*34*', 
r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+    def test_translate_expressions(self):
+        for pattern, expect in [
+            ('[', r'(?s:\[)\Z'),
+            ('[!', r'(?s:\[!)\Z'),
+            ('[]', r'(?s:\[\])\Z'),
+            ('[abc', r'(?s:\[abc)\Z'),
+            ('[!abc', r'(?s:\[!abc)\Z'),
+            ('[abc]', r'(?s:[abc])\Z'),
+            ('[!abc]', r'(?s:[^abc])\Z'),
+            ('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
+            # with [[
+            ('[[', r'(?s:\[\[)\Z'),
+            ('[[a', r'(?s:\[\[a)\Z'),
+            ('[[]', r'(?s:[\[])\Z'),
+            ('[[]a', r'(?s:[\[]a)\Z'),
+            ('[[]]', r'(?s:[\[]\])\Z'),
+            ('[[]a]', r'(?s:[\[]a\])\Z'),
+            ('[[a]', r'(?s:[\[a])\Z'),
+            ('[[a]]', r'(?s:[\[a]\])\Z'),
+            ('[[a]b', r'(?s:[\[a]b)\Z'),
+            # backslashes
+            ('[\\', r'(?s:\[\\)\Z'),
+            (r'[\]', r'(?s:[\\])\Z'),
+            (r'[\\]', r'(?s:[\\\\])\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+    def test_star_indices_locations(self):
+        from fnmatch import _translate
+
+        blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
+        parts, star_indices = _translate(''.join(blocks), '*', '.')
+        expect_parts = ['a', r'\^', 'b', '*',
+                        '.', '.', '[a-z]', '[1-9]', '*',
+                        r'\+', r'\+', r'\[', r'\[', 'a']
+        self.assertListEqual(parts, expect_parts)
+        self.assertListEqual(star_indices, [3, 8])
+
+
 class FilterTestCase(unittest.TestCase):
 
     def test_filter(self):
diff --git 
a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst 
b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst
new file mode 100644
index 00000000000000..26a18afca945d9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst
@@ -0,0 +1,2 @@
+Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch
+by Bénédikt Tran.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to