https://github.com/python/cpython/commit/8eb6fb0294956f3aaca0ace76b5ee77bd84b03a0
commit: 8eb6fb0294956f3aaca0ace76b5ee77bd84b03a0
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-06-26T14:15:12+03:00
summary:
gh-95555: Allow a negated property as a character set member (GH-152245)
A negated multi-range property such as \P{ASCII} or \P{Pattern_Syntax} was
rejected inside a character class. Such members are now alternated in with
the other members: [\P{ASCII}abc] becomes [abc] | [^ASCII], and [\P{ASCII}]
alone is just the negated charset.
Co-authored-by: Claude Opus 4.8 <[email protected]>
files:
M Lib/re/_parser.py
M Lib/test/test_re.py
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 262286748fb25b..aab9b59168015c 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -310,7 +310,7 @@ def checkgroupname(self, name, offset):
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
-def _property_escape(source, escape, in_set=False):
+def _property_escape(source, escape):
# handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
from . import _properties
if not source.match('{'):
@@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False):
if code is None:
raise source.error("unknown property name %r" % name,
len(name) + len(r'\p{}'))
- if in_set and code[1][0] == (NEGATE, None):
- # A negated multi-range property cannot be a member of a set.
- raise source.error("bad escape %s in character class" % escape,
- len(name) + len(r'\p{}'))
return code
def _class_escape(source, escape):
@@ -369,7 +365,7 @@ def _class_escape(source, escape):
len(charname) + len(r'\N{}')) from None
return LITERAL, c
elif c in "pP" and source.istext:
- return _property_escape(source, escape, in_set=True)
+ return _property_escape(source, escape)
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
@@ -574,11 +570,15 @@ def _difference(left, right, state):
# with the next operand.
_SETOPS = {'||': _union, '&&': _intersect, '--': _difference}
-def _operand_elements(set, compound):
- # The operand's elements: a standalone nested set, else the member union.
+def _operand_elements(set, compound, negated, state):
+ # The operand's elements: a standalone nested set, else the member union,
+ # with any negated-property members alternated in (see addmember).
if compound is not None:
return compound
- return [_charset_node(_uniq(set))]
+ result = [_charset_node(_uniq(set))] if set or not negated else None
+ for neg in negated:
+ result = [neg] if result is None else _union(result, [neg], state)
+ return result
def _parse_operand(source, state, nested, here, allow_nested):
# Read one operand, stopping at a set operator or the closing ']'. An
@@ -591,10 +591,15 @@ def _parse_operand(source, state, nested, here, allow_nested):
sourcematch = source.match
set = []
setappend = set.append
+ negated = [] # \P{...} negated-range props, alternated in at the end
def addmember(code):
- # Flatten a \p{...} property's IN into the member set.
+ # Flatten a \p{...} property's IN into the member set; a negated one is a
+ # complemented charset, set aside to _union in (it can't join the union).
if code[0] is IN:
- set.extend(code[1])
+ if code[1][0][0] is NEGATE:
+ negated.append(code)
+ else:
+ set.extend(code[1])
else:
setappend(code)
compound = None # elements of a standalone nested-set operand
@@ -607,9 +612,9 @@ def addmember(code):
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
- if set or compound is not None:
+ if set or compound is not None or negated:
if this == "]":
- return _operand_elements(set, compound), None
+ return _operand_elements(set, compound, negated, state), None
if this in '-&|~' and source.next == this:
if this == '~':
import warnings
@@ -621,7 +626,7 @@ def addmember(code):
else:
# '--', '&&' or '||' ends this operand and starts the next.
sourceget() # consume the second operator character
- return _operand_elements(set, compound), this + this
+ return _operand_elements(set, compound, negated, state), this + this
if this[0] == "\\":
code1 = _class_escape(source, this)
else:
@@ -641,12 +646,12 @@ def addmember(code):
# A trailing '-' is a literal.
addmember(code1)
setappend((LITERAL, _ord("-")))
- return [_charset_node(_uniq(set))], None
+ return _operand_elements(set, None, negated, state), None
if that == "-":
# 'X--': difference, not a range. '--' after a single member
# lands here because the range probe consumed the first '-'.
addmember(code1)
- return [_charset_node(_uniq(set))], "--"
+ return _operand_elements(set, None, negated, state), "--"
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 7e8ed0e02833e8..af6e4612dcfaef 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1061,6 +1061,19 @@ def test_property_escapes(self):
self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0'))
self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
+ # A negated multi-range property (not backed by an engine category) can
+ # be a set member; it is alternated in with the other members.
+ self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
+ self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
+ self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
+ self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
+ self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
+ self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~')) # = ASCII
+ self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
+ # Composes with set operations.
+ self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_')) # \w and ASCII
+ self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))
+
# Errors.
self.checkPatternError(r'\p', 'missing {, expected property name', 2)
self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
@@ -1072,10 +1085,6 @@ def test_property_escapes(self):
# \p is not special in bytes patterns.
self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
- # A negated multi-range property (one not backed by an engine
- # category) cannot be a set member.
- self.checkPatternError(r'[\P{ASCII}]',
- r'bad escape \P in character class', 1)
def test_word_boundaries(self):
# See http://bugs.python.org/issue10713
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]