https://github.com/python/cpython/commit/bd4bd3e76a684969022c00aafb8acf18006ac89b commit: bd4bd3e76a684969022c00aafb8acf18006ac89b branch: main author: Serhiy Storchaka <[email protected]> committer: serhiy-storchaka <[email protected]> date: 2026-06-25T10:09:41+03:00 summary:
gh-152100: Support set operations in character classes (GH-152153) Implement set difference [A--B], intersection [A&&B] and union [A||B] in regular expression character classes (Unicode Technical Standard #18), including nested, complemented and compound set operands. Symmetric difference [A~~B] remains reserved. Also use the new syntax in the standard library (_strptime, textwrap, doctest, pkgutil). Co-authored-by: Claude Opus 4.8 <[email protected]> files: A Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152100.Set0ps.rst M Doc/library/re.rst M Doc/whatsnew/3.16.rst M Lib/_strptime.py M Lib/doctest.py M Lib/pkgutil.py M Lib/re/_parser.py M Lib/test/test_re.py M Lib/textwrap.py diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 4745c1b98a4554..7c8c589b3f5dfc 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -279,25 +279,47 @@ The special characters are: ``[]()[{}]`` will match a right bracket, as well as left bracket, braces, and parentheses. - .. .. index:: single: --; in regular expressions - .. .. index:: single: &&; in regular expressions - .. .. index:: single: ~~; in regular expressions - .. .. index:: single: ||; in regular expressions - - * Support of nested sets and set operations as in `Unicode Technical - Standard #18`_ might be added in the future. This would change the - syntax, so to facilitate this change a :exc:`FutureWarning` will be raised - in ambiguous cases for the time being. - That includes sets starting with a literal ``'['`` or containing literal - character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To - avoid a warning escape them with a backslash. + .. index:: + single: --; in regular expressions + single: &&; in regular expressions + single: ||; in regular expressions + + * A character set may contain a nested set written in square brackets, and + two sets may be combined with a set operator, as in `Unicode Technical + Standard #18`_: + + * ``[A--B]`` (*difference*) matches a character that is in *A* but not + in *B*; for example ``[a-z--[aeiou]]`` matches an ASCII lowercase + consonant. + * ``[A&&B]`` (*intersection*) matches a character that is in both *A* + and *B*; for example ``[\w&&[a-z]]`` matches an ASCII lowercase letter. + * ``[A||B]`` (*union*) matches a character that is in *A* or in *B*; this + is the same as listing the members of both sets in a single set, but + allows combining nested sets. + + Operators have no precedence and are applied from left to right. To + group, write a nested set as the operand after an operator, as in + ``[a-z--[aeiou]]``. A leading ``'^'`` complements the whole result. + A ``'['`` begins a nested set only immediately after a set operator; + anywhere else -- including at the start of a character set -- it is an + ordinary character, so existing patterns keep their meaning. Escape it + as ``'\['`` to include a literal ``'['`` right after an operator. .. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/ + .. note:: + + Symmetric difference (``A~~B``) is not yet supported; a literal ``'~~'`` + in a character set still raises a :exc:`FutureWarning`. + .. versionchanged:: 3.7 :exc:`FutureWarning` is raised if a character set contains constructs that will change semantically in the future. + .. versionchanged:: next + Added support for nested sets and the set operators ``--``, ``&&`` + and ``||``. + .. index:: single: | (vertical bar); in regular expressions ``|`` diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 18e500df6f3074..32962a9520fa69 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -181,6 +181,18 @@ os (Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.) +re +-- + +* :mod:`re` now supports set operations and nested sets in character classes, + as described in `Unicode Technical Standard #18 + <https://unicode.org/reports/tr18/>`__: set difference (``[A--B]``), + intersection (``[A&&B]``) and union (``[A||B]``), where an operand may be a + nested set written in square brackets. For example, ``[a-z--[aeiou]]`` + matches an ASCII lowercase consonant. + (Contributed by Serhiy Storchaka in :gh:`152100`.) + + shlex ----- diff --git a/Lib/_strptime.py b/Lib/_strptime.py index 746b0907c1d9f4..59ac96745aa15e 100644 --- a/Lib/_strptime.py +++ b/Lib/_strptime.py @@ -238,7 +238,7 @@ def __calc_date_time(self): current_format = current_format.replace(tz, "%Z") # Transform all non-ASCII digits to digits in range U+0660 to U+0669. if not current_format.isascii() and self.LC_alt_digits is None: - current_format = re_sub(r'\d(?<![0-9])', + current_format = re_sub(r'[\d--0-9]', lambda m: chr(0x0660 + int(m[0])), current_format) for old, new in replacement_pairs: diff --git a/Lib/doctest.py b/Lib/doctest.py index be950079e396de..8a55fe3ddd2615 100644 --- a/Lib/doctest.py +++ b/Lib/doctest.py @@ -1768,7 +1768,7 @@ def check_output(self, want, got, optionflags): '', want) # If a line in got contains only spaces, then remove the # spaces. - got = re.sub(r'(?m)^[^\S\n]+$', '', got) + got = re.sub(r'(?m)^[\s--\n]+$', '', got) if got == want: return True diff --git a/Lib/pkgutil.py b/Lib/pkgutil.py index 11c2a4b0ef4635..9121d6a1e2285c 100644 --- a/Lib/pkgutil.py +++ b/Lib/pkgutil.py @@ -443,7 +443,7 @@ def resolve_name(name, *, strict=False): within the imported package to get to the desired object. """ global _LENIENT_PATTERN, _STRICT_PATTERN - dotted_words = r'(?!\d)(\w+)(\.(?!\d)(\w+))*' + dotted_words = r'([\w--\d]\w*)(\.([\w--\d]\w*))*' if strict: if _STRICT_PATTERN is None: _STRICT_PATTERN = re.compile( diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index b8c19cd3070c4d..cc2b66c54b6681 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -509,6 +509,201 @@ def _parse_sub(source, state, verbose, nested): subpattern.append((BRANCH, (None, items))) return subpattern +def _charset_node(items): + # One element matching a character in the union `items`. A lone LITERAL or + # CATEGORY is already a one-character matcher and needs no IN wrapper. + if len(items) == 1 and items[0][0] in _SETITEMCODES: + return items[0] + return (IN, items) + +def _flat_items(elements): + # The items if `elements` is a single flat charset (no complement), else + # None -- the dual of _charset_node: a lone LITERAL or CATEGORY is an item. + if len(elements) == 1: + op, av = elements[0] + if op in _SETITEMCODES: + return [elements[0]] + if op is IN and all(o is not NEGATE for o, _av in av): + return av + return None + +def _union(left, right, state): + # A || B: merge two flat character classes into one charset where possible, + # else alternate the one-character matchers. + left_items = _flat_items(left) + right_items = _flat_items(right) + if left_items is not None and right_items is not None: + return [_charset_node(_uniq(left_items + right_items))] + return [(BRANCH, (None, [SubPattern(state, left), + SubPattern(state, right)]))] + +def _intersect(left, right, state): + # A && B: A, then require the same character to also match B (lookbehind). + return left + [(ASSERT, (-1, SubPattern(state, right)))] + +def _difference(left, right, state): + # A -- B: A, then require the character not to match B (lookbehind). + return left + [(ASSERT_NOT, (-1, SubPattern(state, right)))] + +# Map a set-operator token to the function combining the accumulated result +# with the next operand. +_SETOPS = {'||': _union, '&&': _intersect, '--': _difference} + +def _operand_elements(set, compound): + # The operand's elements: a standalone nested set, else the member union. + if compound is not None: + return compound + return [_charset_node(_uniq(set))] + +def _parse_operand(source, state, nested, here, allow_nested): + # Read one operand, stopping at a set operator or the closing ']'. An + # operand is either a union of members/ranges/escapes or, when allow_nested, + # a single nested set ([...]) -- not a mix. Return (elements, terminator), + # where terminator is the operator that ended the operand, or None at the end + # of the class. + _ord = ord + sourceget = source.get + sourcematch = source.match + set = [] + setappend = set.append + compound = None # elements of a standalone nested-set operand + if allow_nested and sourcematch("["): + # A nested set after an operator is the whole operand, used as-is (not + # wrapped in a group); it cannot be combined with loose members. + compound = _parse_charset(source, state, nested + 1) + while True: + this = sourceget() + if this is None: + raise source.error("unterminated character set", + source.tell() - here) + if set or compound is not None: + if this == "]": + return _operand_elements(set, compound), None + if this in '-&|~' and source.next == this: + if this == '~': + import warnings + warnings.warn( + 'Possible set symmetric difference at position %d' + % (source.tell() - 1), + FutureWarning, stacklevel=nested + 8 + ) + else: + # '--', '&&' or '||' ends this operand and starts the next. + sourceget() # consume the second operator character + return _operand_elements(set, compound), this + this + if this[0] == "\\": + code1 = _class_escape(source, this) + else: + code1 = LITERAL, _ord(this) + if compound is not None: + # A standalone nested set cannot be combined with other members. + raise source.error("unsupported nested set operand", + source.tell() - here) + # Past this point the operand is a plain member set (compound is None). + if sourcematch("-"): + # potential range + that = sourceget() + if that is None: + raise source.error("unterminated character set", + source.tell() - here) + if that == "]": + # A trailing '-' is a literal. + setappend(code1) + setappend((LITERAL, _ord("-"))) + return [_charset_node(_uniq(set))], None + if that == "-": + # 'X--': difference, not a range. '--' after a single member + # lands here because the range probe consumed the first '-'. + setappend(code1) + return [_charset_node(_uniq(set))], "--" + if that[0] == "\\": + code2 = _class_escape(source, that) + else: + code2 = LITERAL, _ord(that) + if code1[0] != LITERAL or code2[0] != LITERAL: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + lo = code1[1] + hi = code2[1] + if hi < lo: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + setappend((RANGE, (lo, hi))) + else: + setappend(code1) + +def _complement(elements, state): + # The complement of `elements` (a single matcher, or a set operation as a + # head followed by lookbehind assertions). De Morgan pushes the negation in + # -- recursively through nested set operations -- so no lookahead is needed. + op, av = elements[0] + if op is LITERAL: + result = [(NOT_LITERAL, av)] + elif op is NOT_LITERAL: + result = [(LITERAL, av)] + elif op is CATEGORY: + result = [(CATEGORY, CH_NEGATE[av])] + elif op is IN: + # Negate by toggling a leading NEGATE: a doubly negated set flips back + # to positive instead of stacking a second NEGATE. + if av[0][0] is NEGATE: + result = [(IN, av[1:])] + else: + result = [(IN, [(NEGATE, None)] + av)] + else: + # An un-merged union (A||B as an alternation). De Morgan: + # ~(A | B | ...) = ~A & ~B & ... -- intersect the operand complements. + assert op is BRANCH + branches = av[1] + result = _complement(branches[0].data, state) + for sub in branches[1:]: + result = _intersect(result, _complement(sub.data, state), state) + # A set operation: a head followed by lookbehind assertions. De Morgan: + # ~(head & ~B & C ...) = ~head | B | ~C ... + for op, av in elements[1:]: + if op is ASSERT_NOT: # '--' operand B: union with B + result = _union(result, av[1].data, state) + else: # '&&' operand B (ASSERT): union with [^B] + result = _union(result, _complement(av[1].data, state), state) + return result + +def _parse_charset(source, state, nested): + # Parse a character set, assuming the opening '[' has been consumed, up to + # and including the closing ']'. Return a list of subpattern elements that + # together consume exactly one character. + # + # A set operation (UTS #18 RL1.3) maps to assertions on, or alternatives of, + # the matched character: + # [A--B] -> A (?<![B]) difference + # [A&&B] -> A (?<=[B]) intersection + # [A||B] -> [AB] or (?:A|B) union + # Operators chain left-to-right with no precedence. A leading '^' negates by + # De Morgan, pushing the negation into the operands (no lookahead needed): + # [^A--B] -> [^A] | B ; [^A&&B] -> [^A] | [^B] ; [^A||B] -> [^A] && [^B] + # Each operand compiles in its own flag context, so this is IGNORECASE-safe. + here = source.tell() - 1 + if source.next == '[': + # A '[' at the start of a class stays a literal (the first operand never + # needs grouping), but the position is reserved -- keep warning. + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 7 + ) + negate = source.match("^") + result, term = _parse_operand(source, state, nested, here, False) + while term is not None: + combine = _SETOPS[term] + operand, term = _parse_operand(source, state, nested, here, True) + result = combine(result, operand, state) + if negate: + # Push the negation into the operands by De Morgan (see above). + result = _complement(result, state) + + # A single one-character matcher, or a set operation (head + assertions); + # the caller groups a multi-element result if a quantifier could follow. + return result + def _parse(source, state, verbose, nested, first=False): # parse a simple pattern subpattern = SubPattern(state) @@ -548,95 +743,15 @@ def _parse(source, state, verbose, nested, first=False): subpatternappend((LITERAL, _ord(this))) elif this == "[": - here = source.tell() - 1 - # character set - set = [] - setappend = set.append -## if sourcematch(":"): -## pass # handle character classes - if source.next == '[': - import warnings - warnings.warn( - 'Possible nested set at position %d' % source.tell(), - FutureWarning, stacklevel=nested + 6 - ) - negate = sourcematch("^") - # check remaining characters - while True: - this = sourceget() - if this is None: - raise source.error("unterminated character set", - source.tell() - here) - if this == "]" and set: - break - elif this[0] == "\\": - code1 = _class_escape(source, this) - else: - if set and this in '-&~|' and source.next == this: - import warnings - warnings.warn( - 'Possible set %s at position %d' % ( - 'difference' if this == '-' else - 'intersection' if this == '&' else - 'symmetric difference' if this == '~' else - 'union', - source.tell() - 1), - FutureWarning, stacklevel=nested + 6 - ) - code1 = LITERAL, _ord(this) - if sourcematch("-"): - # potential range - that = sourceget() - if that is None: - raise source.error("unterminated character set", - source.tell() - here) - if that == "]": - setappend(code1) - setappend((LITERAL, _ord("-"))) - break - if that[0] == "\\": - code2 = _class_escape(source, that) - else: - if that == '-': - import warnings - warnings.warn( - 'Possible set difference at position %d' % ( - source.tell() - 2), - FutureWarning, stacklevel=nested + 6 - ) - code2 = LITERAL, _ord(that) - if code1[0] != LITERAL or code2[0] != LITERAL: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - lo = code1[1] - hi = code2[1] - if hi < lo: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - setappend((RANGE, (lo, hi))) - else: - setappend(code1) - - set = _uniq(set) - # XXX: <fl> should move set optimization to compiler! - if _len(set) == 1 and set[0][0] is LITERAL: - # optimization - if negate: - subpatternappend((NOT_LITERAL, set[0][1])) - else: - subpatternappend(set[0]) - elif _len(set) == 1 and set[0][0] is CATEGORY: - # optimization: a lone category like [\d] or [^\d] - if negate: - subpatternappend((CATEGORY, CH_NEGATE[set[0][1]])) - else: - subpatternappend(set[0]) + charset = _parse_charset(source, state, nested) + if len(charset) == 1: + code = charset[0] else: - if negate: - set.insert(0, (NEGATE, None)) - # charmap optimization can't be added here because - # global flags still are not known - subpatternappend((IN, set)) + # Wrap a multi-element set operation in a non-capturing group so + # a following quantifier (e.g. [a-z--[aeiou]]+) binds the whole + # operation, not just its trailing assertion. + code = (SUBPATTERN, (None, 0, 0, SubPattern(state, charset))) + subpatternappend(code) elif this in REPEAT_CHARS: # repeat previous item diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 69d730c49387be..2a57370a6fb643 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1288,80 +1288,90 @@ def test_not_literal(self): self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") - def test_possible_set_operations(self): + def test_set_operations(self): + # UTS #18 RL1.3 set operations in character classes: '--' (difference), + # '&&' (intersection) and '||' (union) are operators on the matched + # character; '~~' (symmetric difference) is still reserved + # (FutureWarning). s = bytes(range(128)).decode() - with self.assertWarnsRegex(FutureWarning, 'Possible set difference') as w: - p = re.compile(r'[0-9--1]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('-./0123456789')) - with self.assertWarnsRegex(FutureWarning, 'Possible set difference') as w: - self.assertEqual(re.findall(r'[0-9--2]', s), list('-./0123456789')) - self.assertEqual(w.filename, __file__) + # Set difference A--B == A and not B. + self.assertEqual(re.findall(r'[0-9--1]', s), list('023456789')) + self.assertEqual(re.findall(r'[0-9--2]', s), list('013456789')) + self.assertEqual(re.findall(r'[%--1]', s), list('%')) + # A leading '-' is a literal, so this stays a range. self.assertEqual(re.findall(r'[--1]', s), list('-./01')) - - with self.assertWarnsRegex(FutureWarning, 'Possible set difference') as w: - p = re.compile(r'[%--1]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list("%&'()*+,-1")) - - with self.assertWarnsRegex(FutureWarning, 'Possible set difference ') as w: - p = re.compile(r'[%--]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list("%&'()*+,-")) - - with self.assertWarnsRegex(FutureWarning, 'Possible set intersection ') as w: - p = re.compile(r'[0-9&&1]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('&0123456789')) - with self.assertWarnsRegex(FutureWarning, 'Possible set intersection ') as w: - self.assertEqual(re.findall(r'[0-8&&1]', s), list('&012345678')) - self.assertEqual(w.filename, __file__) - - with self.assertWarnsRegex(FutureWarning, 'Possible set intersection ') as w: - p = re.compile(r'[\d&&1]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('&0123456789')) - + # A dangling operator (empty operand) is an error. + self.assertRaises(re.PatternError, re.compile, r'[%--]') + + # Set intersection A&&B == A and B. + self.assertEqual(re.findall(r'[0-9&&1]', s), list('1')) + self.assertEqual(re.findall(r'[0-8&&1]', s), list('1')) + self.assertEqual(re.findall(r'[\d&&1]', s), list('1')) + # A leading '&' is a literal. self.assertEqual(re.findall(r'[&&1]', s), list('&1')) - with self.assertWarnsRegex(FutureWarning, 'Possible set union ') as w: - p = re.compile(r'[0-9||a]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('0123456789a|')) - - with self.assertWarnsRegex(FutureWarning, 'Possible set union ') as w: - p = re.compile(r'[\d||a]') + # Nested sets and lookbehind-mapped operands. + self.assertEqual(re.findall(r'[a-z--[aeiou]]', s), + list('bcdfghjklmnpqrstvwxyz')) + self.assertEqual(re.findall(r'[\w&&[a-z]]', s), + list('abcdefghijklmnopqrstuvwxyz')) + # Operators chain and mix left-to-right. + self.assertEqual(re.findall(r'[a-z--[aeiou]--[xyz]]', s), + list('bcdfghjklmnpqrstvw')) + self.assertEqual(re.findall(r'[\w&&[a-z]&&[m-z]]', s), + list('mnopqrstuvwxyz')) + # A negated set operation: [^A--B] == complement of (A minus B). + self.assertEqual(re.findall(r'[^a-z--aeiou]', s), + [c for c in s if not ('a' <= c <= 'z' and c not in 'aeiou')]) + # A nested operand may be complemented or itself a set operation; it is + # used directly as the assertion body. + self.assertEqual(re.findall(r'[a-z--[^m]]', s), list('m')) + self.assertEqual(re.findall(r'[\w&&[a-c--b]]', s), list('ac')) + self.assertEqual(re.findall(r'[a-f&&[^bc]]', s), list('adef')) + # A nested set is the whole operand; it cannot be mixed with loose + # members (write the members in the set instead). + self.assertEqual(re.findall(r'[a-c--[ab]]', s), list('c')) + self.assertRaises(re.PatternError, re.compile, r'[a-c--[ab]d]') + self.assertRaises(re.PatternError, re.compile, r'[a-c--[ab][c]]') + # A '[' is a nested set only immediately after a set operator; + # elsewhere it is a literal, so these stay backward compatible. + self.assertEqual(re.findall(r'[*?[]', s), list('*?[')) + self.assertEqual(re.findall(r'[a[b]', s), list('[ab')) + self.assertEqual(re.findall(r'[^[]', 'a[b'), list('ab')) + # A '[' at the start of a class also stays a literal (the position is + # reserved, so it still warns) and keeps its historical meaning. + with self.assertWarnsRegex(FutureWarning, 'Possible nested set ') as w: + p = re.compile(r'[[a-z]]') self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('0123456789a|')) - + self.assertEqual(p.findall('a]b[c'), ['a]']) # {[, a-z} then a literal ']' + with self.assertWarnsRegex(FutureWarning, 'Possible nested set '): + re.compile(r'[[:digit:]]') + # A nested set after an operator does not warn. + with warnings.catch_warnings(): + warnings.simplefilter('error', FutureWarning) + re.compile(r'[a-z--[aeiou]]') + + # Set union A||B == A or B (an explicit form of [AB]); flat operands + # merge into one charset, otherwise the operations are alternated. + self.assertEqual(re.findall(r'[0-9||a]', s), list('0123456789a')) + self.assertEqual(re.findall(r'[\d||a]', s), list('0123456789a')) + self.assertEqual(re.findall(r'[a-z--m||0-9]', s), + list('0123456789abcdefghijklnopqrstuvwxyz')) + # A leading '|' is a literal. self.assertEqual(re.findall(r'[||1]', s), list('1|')) + # '~~' remains reserved. + with self.assertWarnsRegex(FutureWarning, 'Possible set symmetric difference ') as w: p = re.compile(r'[0-9~~1]') self.assertEqual(w.filename, __file__) self.assertEqual(p.findall(s), list('0123456789~')) - with self.assertWarnsRegex(FutureWarning, 'Possible set symmetric difference ') as w: - p = re.compile(r'[\d~~1]') + self.assertEqual(re.findall(r'[\d~~1]', s), list('0123456789~')) self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('0123456789~')) - self.assertEqual(re.findall(r'[~~1]', s), list('1~')) - with self.assertWarnsRegex(FutureWarning, 'Possible nested set ') as w: - p = re.compile(r'[[0-9]|]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list('0123456789[]')) - with self.assertWarnsRegex(FutureWarning, 'Possible nested set ') as w: - self.assertEqual(re.findall(r'[[0-8]|]', s), list('012345678[]')) - self.assertEqual(w.filename, __file__) - - with self.assertWarnsRegex(FutureWarning, 'Possible nested set ') as w: - p = re.compile(r'[[:digit:]|]') - self.assertEqual(w.filename, __file__) - self.assertEqual(p.findall(s), list(':[]dgit')) - def test_search_coverage(self): self.assertEqual(re.search(r"\s(b)", " b").group(1), "b") self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 41366fbf443a4f..2f213e34c2c329 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -72,7 +72,7 @@ class TextWrapper: # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! # (after stripping out empty strings). word_punct = r'[\w!"\'&.,?]' - letter = r'[^\d\W]' + letter = r'[\w--\d]' whitespace = r'[%s]' % re.escape(_whitespace) nowhitespace = '[^' + whitespace[1:] wordsep_re = re.compile(r''' diff --git a/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152100.Set0ps.rst b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152100.Set0ps.rst new file mode 100644 index 00000000000000..848740ed7a56d3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-24-12-00-00.gh-issue-152100.Set0ps.rst @@ -0,0 +1,3 @@ +Support set operations and nested sets in regular expression character +classes, as described in Unicode Technical Standard #18: set difference +(``[A--B]``), intersection (``[A&&B]``) and union (``[A||B]``). _______________________________________________ Python-checkins mailing list -- [email protected] To unsubscribe send an email to [email protected] https://mail.python.org/mailman3//lists/python-checkins.python.org Member address: [email protected]
