Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package python-regex for openSUSE:Factory checked in at 2025-09-05 21:42:33
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-regex (Old)
 and      /work/SRC/openSUSE:Factory/.python-regex.new.1977 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-regex" Fri Sep 5 21:42:33 2025 rev:29 rq:1302893 version:2025.9.1 Changes: -------- --- /work/SRC/openSUSE:Factory/python-regex/python-regex.changes 2025-01-12 11:08:36.628751612 +0100 +++ /work/SRC/openSUSE:Factory/.python-regex.new.1977/python-regex.changes 2025-09-05 21:42:58.491795109 +0200 @@ -1,0 +2,7 @@ +Fri Sep 5 09:18:35 UTC 2025 - Dirk Müller <dmuel...@suse.com> + +- update to 2025.9.11: + * Git PR 585: Fix AttributeError: 'AnyAll' object has no attribute + '_key' + +------------------------------------------------------------------- Old: ---- regex-2024.11.6.tar.gz New: ---- regex-2025.9.1.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-regex.spec ++++++ --- /var/tmp/diff_new_pack.BkSQly/_old 2025-09-05 21:42:59.383832642 +0200 +++ /var/tmp/diff_new_pack.BkSQly/_new 2025-09-05 21:42:59.383832642 +0200 @@ -1,7 +1,7 @@ # # spec file for package python-regex # -# Copyright (c) 2025 SUSE LLC +# Copyright (c) 2025 SUSE LLC and contributors # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -18,7 +18,7 @@ %{?sle15_python_module_pythons} Name: python-regex -Version: 2024.11.6 +Version: 2025.9.1 Release: 0 Summary: Alternative regular expression module for Python License: Apache-2.0 ++++++ regex-2024.11.6.tar.gz -> regex-2025.9.1.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/PKG-INFO new/regex-2025.9.1/PKG-INFO --- old/regex-2024.11.6/PKG-INFO 2024-11-06 20:49:53.200616800 +0100 +++ new/regex-2025.9.1/PKG-INFO 2025-09-01 23:19:23.484337000 +0200 @@ -1,28 +1,27 @@ -Metadata-Version: 2.1 +Metadata-Version: 2.4 Name: regex -Version: 2024.11.6 +Version: 2025.9.1 Summary: Alternative regular expression module, to replace re. 
-Home-page: https://github.com/mrabarnett/mrab-regex -Author: Matthew Barnett -Author-email: re...@mrabarnett.plus.com -License: Apache Software License +Author-email: Matthew Barnett <re...@mrabarnett.plus.com> +License-Expression: Apache-2.0 AND CNRI-Python +Project-URL: Homepage, https://github.com/mrabarnett/mrab-regex Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: General -Requires-Python: >=3.8 +Requires-Python: >=3.9 Description-Content-Type: text/x-rst License-File: LICENSE.txt +Dynamic: license-file Introduction ------------ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/pyproject.toml new/regex-2025.9.1/pyproject.toml --- old/regex-2024.11.6/pyproject.toml 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/pyproject.toml 2025-09-01 23:19:18.000000000 +0200 @@ -1,3 +1,44 @@ [build-system] -requires = ["setuptools"] +requires = ["setuptools > 77.0.3"] build-backend = "setuptools.build_meta" + +[project] +name = "regex" +version = "2025.9.1" +description = "Alternative regular expression module, to replace re." +readme = "README.rst" +authors = [ + {name = "Matthew Barnett", email = "re...@mrabarnett.plus.com"}, +] +license = "Apache-2.0 AND CNRI-Python" +license-files = ["LICENSE.txt"] + +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing", + "Topic :: Text Processing :: General", +] + +requires-python = ">= 3.9" + +[project.urls] +Homepage = "https://github.com/mrabarnett/mrab-regex" + +[tool.setuptools] +package-dir = {regex = "regex_3"} +py-modules = [ + "regex.__init__", + "regex.regex", + "regex._regex_core", + "regex.test_regex", +] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/regex.egg-info/PKG-INFO new/regex-2025.9.1/regex.egg-info/PKG-INFO --- old/regex-2024.11.6/regex.egg-info/PKG-INFO 2024-11-06 20:49:53.000000000 +0100 +++ new/regex-2025.9.1/regex.egg-info/PKG-INFO 2025-09-01 23:19:23.000000000 +0200 @@ -1,28 +1,27 @@ -Metadata-Version: 2.1 +Metadata-Version: 2.4 Name: regex -Version: 2024.11.6 +Version: 2025.9.1 Summary: Alternative regular expression module, to replace re. 
-Home-page: https://github.com/mrabarnett/mrab-regex -Author: Matthew Barnett -Author-email: re...@mrabarnett.plus.com -License: Apache Software License +Author-email: Matthew Barnett <re...@mrabarnett.plus.com> +License-Expression: Apache-2.0 AND CNRI-Python +Project-URL: Homepage, https://github.com/mrabarnett/mrab-regex Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: General -Requires-Python: >=3.8 +Requires-Python: >=3.9 Description-Content-Type: text/x-rst License-File: LICENSE.txt +Dynamic: license-file Introduction ------------ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/regex_3/_regex.c new/regex-2025.9.1/regex_3/_regex.c --- old/regex-2024.11.6/regex_3/_regex.c 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/regex_3/_regex.c 2025-09-01 23:19:18.000000000 +0200 @@ -58,6 +58,9 @@ typedef RE_UINT32 RE_CODE; typedef unsigned char BYTE; +/* An unassigned codepoint. */ +#define UNASSIGNED_CODEPOINT 0x10FFFF + /* Properties in the General Category. */ #define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN) #define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU) @@ -157,6 +160,11 @@ /* Various flags stored in a node status member. */ #define RE_STATUS_SHIFT 11 +#define RE_ENCODING_SHIFT 16 +#define ASCII_ENCODING 1 +#define UNICODE_ENCODING 2 +#define ENCODING_KIND(NODE) (((NODE)->status >> RE_ENCODING_SHIFT) & 0x3) + #define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT) #define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT) #define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT) @@ -809,12 +817,8 @@ /* Checks whether a character has a property. */ Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) { if (ch > RE_ASCII_MAX) { - /* Outside the ASCII range. */ - RE_UINT32 value; - - value = property & 0xFFFF; - - return value == 0; + /* Treat it as an unassigned codepoint. */ + ch = UNASSIGNED_CODEPOINT; } return unicode_has_property(property, ch); @@ -824,19 +828,12 @@ Py_LOCAL_INLINE(BOOL) ascii_has_property_ign(RE_CODE property, Py_UCS4 ch) { RE_UINT32 prop; - prop = property >> 16; - - /* We are working with ASCII. */ - if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property == - RE_PROP_GC_LT) { - RE_UINT32 value; - - value = re_get_general_category(ch); + if (ch > RE_ASCII_MAX) { + /* Treat it as an unassigned codepoint. */ + ch = UNASSIGNED_CODEPOINT; + } - return value == RE_PROP_LU || value == RE_PROP_LL || value == - RE_PROP_LT; - } else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE) - return (BOOL)re_get_cased(ch); + prop = property >> 16; /* The property is case-insensitive. */ return ascii_has_property(property, ch); @@ -2902,7 +2899,14 @@ /* Checks whether a character has a property. 
*/ Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding, RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) { - return encoding->has_property(locale_info, node->values[0], ch); + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + return ascii_encoding.has_property(locale_info, node->values[0], ch); + case UNICODE_ENCODING: + return unicode_encoding.has_property(locale_info, node->values[0], ch); + default: + return encoding->has_property(locale_info, node->values[0], ch); + } } /* Checks whether a character has a property, ignoring case. */ @@ -2914,6 +2918,15 @@ property = node->values[0]; prop = property >> 16; + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + encoding = &ascii_encoding; + break; + case UNICODE_ENCODING: + encoding = &unicode_encoding; + break; + } + /* We need to do special handling of case-sensitive properties according to * the 'encoding'. */ @@ -2988,6 +3001,9 @@ Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_LocaleInfo* locale_info, RE_Node* member, Py_UCS4 ch) { switch (member->op) { + case RE_OP_ANY_ALL: + TRACE(("%s\n", re_op_text[member->op])) + return TRUE; case RE_OP_CHARACTER: /* values are: char_code */ TRACE(("%s %d %d\n", re_op_text[member->op], member->match, @@ -2997,7 +3013,15 @@ /* values are: property */ TRACE(("%s %d %d\n", re_op_text[member->op], member->match, member->values[0])) - return encoding->has_property(locale_info, member->values[0], ch); + + switch (ENCODING_KIND(member)) { + case ASCII_ENCODING: + return ascii_encoding.has_property(locale_info, member->values[0], ch); + case UNICODE_ENCODING: + return unicode_encoding.has_property(locale_info, member->values[0], ch); + default: + return encoding->has_property(locale_info, member->values[0], ch); + } case RE_OP_RANGE: /* values are: lower, upper */ TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match, @@ -4003,7 +4027,19 @@ text = state->text; match = node->match == match; - encoding = state->encoding; + + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + encoding = &ascii_encoding; + break; + case UNICODE_ENCODING: + encoding = &unicode_encoding; + break; + default: + encoding = state->encoding; + break; + } + locale_info = state->locale_info; property = node->values[0]; @@ -4101,7 +4137,19 @@ text = state->text; match = node->match == match; - encoding = state->encoding; + + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + encoding = &ascii_encoding; + break; + case UNICODE_ENCODING: + encoding = &unicode_encoding; + break; + default: + encoding = state->encoding; + break; + } + locale_info = state->locale_info; property = node->values[0]; @@ -4199,7 +4247,19 @@ text = state->text; match = node->match == match; - encoding = state->encoding; + + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + encoding = &ascii_encoding; + break; + case UNICODE_ENCODING: + encoding = &unicode_encoding; + break; + default: + encoding = state->encoding; + break; + } + locale_info = state->locale_info; property = node->values[0]; @@ -4297,7 +4357,19 @@ text = state->text; match = node->match == match; - encoding = state->encoding; + + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + encoding = &ascii_encoding; + break; + case UNICODE_ENCODING: + encoding = &unicode_encoding; + break; + default: + encoding = state->encoding; + break; + } + locale_info = state->locale_info; property = node->values[0]; @@ -6879,8 +6951,17 @@ /* Checks whether a position is on a word boundary. 
*/ Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node, Py_ssize_t text_pos) { - return bool_as_status(state->encoding->at_boundary(state, text_pos) == - node->match); + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + return bool_as_status(ascii_encoding.at_boundary(state, text_pos) == + node->match); + case UNICODE_ENCODING: + return bool_as_status(unicode_encoding.at_boundary(state, text_pos) == + node->match); + default: + return bool_as_status(state->encoding->at_boundary(state, text_pos) == + node->match); + } } /* Checks whether there's a character at a position. */ @@ -7721,7 +7802,17 @@ node, Py_ssize_t text_pos, BOOL* is_partial) { BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos); - at_boundary = state->encoding->at_boundary; + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + at_boundary = ascii_encoding.at_boundary; + break; + case UNICODE_ENCODING: + at_boundary = unicode_encoding.at_boundary; + break; + default: + at_boundary = state->encoding->at_boundary; + break; + } *is_partial = FALSE; @@ -7741,7 +7832,17 @@ node, Py_ssize_t text_pos, BOOL* is_partial) { BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos); - at_boundary = state->encoding->at_boundary; + switch (ENCODING_KIND(node)) { + case ASCII_ENCODING: + at_boundary = ascii_encoding.at_boundary; + break; + case UNICODE_ENCODING: + at_boundary = unicode_encoding.at_boundary; + break; + default: + at_boundary = state->encoding->at_boundary; + break; + } *is_partial = FALSE; @@ -23873,6 +23974,7 @@ if (!node) return RE_ERROR_MEMORY; + node->match = TRUE; args->code += 2; /* Append the node. */ @@ -25009,6 +25111,11 @@ /* Compile the character set. */ do { switch (args->code[0]) { + case RE_OP_ANY_ALL: + status = build_ANY(args); + if (status != RE_ERROR_SUCCESS) + return status; + break; case RE_OP_CHARACTER: case RE_OP_PROPERTY: status = build_CHARACTER_or_PROPERTY(args); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/regex_3/_regex_core.py new/regex-2025.9.1/regex_3/_regex_core.py --- old/regex-2024.11.6/regex_3/_regex_core.py 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/regex_3/_regex_core.py 2025-09-01 23:19:18.000000000 +0200 @@ -336,7 +336,7 @@ "Compiles the firstset for the pattern." reverse = bool(info.flags & REVERSE) fs = _check_firstset(info, reverse, fs) - if not fs: + if not fs or isinstance(fs, AnyAll): return [] # Compile the firstset. @@ -1103,6 +1103,11 @@ "Parses a subpattern with scoped flags." saved_flags = info.flags info.flags = (info.flags | flags_on) & ~flags_off + + # Ensure that there aren't multiple encoding flags set. + if info.flags & (ASCII | LOCALE | UNICODE): + info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on + source.ignore_space = bool(info.flags & VERBOSE) try: subpattern = _parse_pattern(source, info) @@ -1235,13 +1240,23 @@ if not in_set: if info.flags & WORD: value = WORD_POSITION_ESCAPES.get(ch) + elif info.flags & ASCII: + value = ASCII_POSITION_ESCAPES.get(ch) + elif info.flags & UNICODE: + value = UNICODE_POSITION_ESCAPES.get(ch) else: value = POSITION_ESCAPES.get(ch) if value: return value - value = CHARSET_ESCAPES.get(ch) + if info.flags & ASCII: + value = ASCII_CHARSET_ESCAPES.get(ch) + elif info.flags & UNICODE: + value = UNICODE_CHARSET_ESCAPES.get(ch) + else: + value = CHARSET_ESCAPES.get(ch) + if value: return value @@ -1380,11 +1395,26 @@ prop_name, name = parse_property_name(source) if source.match("}"): # It's correctly delimited. 
- prop = lookup_property(prop_name, name, positive != negate, source) + if info.flags & ASCII: + encoding = ASCII_ENCODING + elif info.flags & UNICODE: + encoding = UNICODE_ENCODING + else: + encoding = 0 + + prop = lookup_property(prop_name, name, positive != negate, source, + encoding=encoding) return make_property(info, prop, in_set) elif ch and ch in "CLMNPSZ": # An abbreviated property, eg \pL. - prop = lookup_property(None, ch, positive, source) + if info.flags & ASCII: + encoding = ASCII_ENCODING + elif info.flags & UNICODE: + encoding = UNICODE_ENCODING + else: + encoding = 0 + + prop = lookup_property(None, ch, positive, source, encoding=encoding) return make_property(info, prop, in_set) # Not a property, so treat as a literal "p" or "P". @@ -1634,7 +1664,7 @@ _BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split()) -def lookup_property(property, value, positive, source=None, posix=False): +def lookup_property(property, value, positive, source=None, posix=False, encoding=0): "Looks up a property." # Normalise the names (which may still be lists). property = standardise_name(property) if property else None @@ -1663,7 +1693,7 @@ raise error("unknown property value", source.string, source.pos) - return Property((prop_id << 16) | val_id, positive) + return Property((prop_id << 16) | val_id, positive, encoding=encoding) # Only the value is provided. # It might be the name of a GC, script or block value. @@ -1671,16 +1701,16 @@ prop_id, value_dict = PROPERTIES.get(property) val_id = value_dict.get(value) if val_id is not None: - return Property((prop_id << 16) | val_id, positive) + return Property((prop_id << 16) | val_id, positive, encoding=encoding) # It might be the name of a binary property. prop = PROPERTIES.get(value) if prop: prop_id, value_dict = prop if set(value_dict) == _BINARY_VALUES: - return Property((prop_id << 16) | 1, positive) + return Property((prop_id << 16) | 1, positive, encoding=encoding) - return Property(prop_id << 16, not positive) + return Property(prop_id << 16, not positive, encoding=encoding) # It might be the name of a binary property starting with a prefix. if value.startswith("IS"): @@ -1688,7 +1718,7 @@ if prop: prop_id, value_dict = prop if "YES" in value_dict: - return Property((prop_id << 16) | 1, positive) + return Property((prop_id << 16) | 1, positive, encoding=encoding) # It might be the name of a script or block starting with a prefix. for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")): @@ -1696,7 +1726,7 @@ prop_id, value_dict = PROPERTIES.get(property) val_id = value_dict.get(value[2 : ]) if val_id is not None: - return Property((prop_id << 16) | val_id, positive) + return Property((prop_id << 16) | val_id, positive, encoding=encoding) # Unknown property. if not source: @@ -1832,6 +1862,7 @@ FUZZY_OP = 0x4 REVERSE_OP = 0x8 REQUIRED_OP = 0x10 +ENCODING_OP_SHIFT = 5 POS_TEXT = {False: "NON-MATCH", True: "MATCH"} CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "", @@ -1914,9 +1945,10 @@ # Base class for zero-width nodes. 
class ZeroWidthBase(RegexBase): - def __init__(self, positive=True): + def __init__(self, positive=True, encoding=0): RegexBase.__init__(self) self.positive = bool(positive) + self.encoding = encoding self._key = self.__class__, self.positive @@ -1931,11 +1963,12 @@ flags |= FUZZY_OP if reverse: flags |= REVERSE_OP + flags |= self.encoding << ENCODING_OP_SHIFT return [(self._opcode, flags)] def dump(self, indent, reverse): - print("{}{} {}".format(INDENT * indent, self._op_name, - POS_TEXT[self.positive])) + print("{}{} {}{}".format(INDENT * indent, self._op_name, + POS_TEXT[self.positive], ["", " ASCII"][self.encoding])) def max_width(self): return 0 @@ -1963,6 +1996,13 @@ _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV} _op_name = "ANY_ALL" + def __init__(self): + self.positive = True + self.zerowidth = False + self.case_flags = 0 + + self._key = self.__class__, self.positive + class AnyU(Any): _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV} _op_name = "ANY_U" @@ -3211,18 +3251,20 @@ True): OP.PROPERTY_IGN_REV} def __init__(self, value, positive=True, case_flags=NOCASE, - zerowidth=False): + zerowidth=False, encoding=0): RegexBase.__init__(self) self.value = value self.positive = bool(positive) self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] self.zerowidth = bool(zerowidth) + self.encoding = encoding self._key = (self.__class__, self.value, self.positive, self.case_flags, self.zerowidth) def rebuild(self, positive, case_flags, zerowidth): - return Property(self.value, positive, case_flags, zerowidth) + return Property(self.value, positive, case_flags, zerowidth, + self.encoding) def optimise(self, info, reverse, in_set=False): return self @@ -3241,13 +3283,15 @@ flags |= ZEROWIDTH_OP if fuzzy: flags |= FUZZY_OP + flags |= self.encoding << ENCODING_OP_SHIFT return [(self._opcode[self.case_flags, reverse], flags, self.value)] def dump(self, indent, reverse): prop = PROPERTY_NAMES[self.value >> 16] name, value = prop[0], prop[1][self.value & 0xFFFF] - print("{}PROPERTY {} {}:{}{}".format(INDENT * indent, - POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags])) + print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent, + POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags], + ["", " ASCII"][self.encoding])) def matches(self, ch): return _regex.has_property_value(self.value, ch) == self.positive @@ -3813,9 +3857,21 @@ if isinstance(m, SetUnion) and m.positive: # Union in union. items.extend(m.items) + elif isinstance(m, AnyAll): + return AnyAll() else: items.append(m) + # Are there complementary properties? + properties = (set(), set()) + + for m in items: + if isinstance(m, Property): + properties[m.positive].add((m.value, m.case_flags, m.zerowidth)) + + if properties[0] & properties[1]: + return AnyAll() + if len(items) == 1: i = items[0] return i.with_flags(positive=i.positive == self.positive, @@ -4455,6 +4511,9 @@ "v": "\v", } +ASCII_ENCODING = 1 +UNICODE_ENCODING = 2 + # Predefined character set escape sequences. 
CHARSET_ESCAPES = { "d": lookup_property(None, "Digit", True), @@ -4466,6 +4525,25 @@ "W": lookup_property(None, "Word", False), } +ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES) +ASCII_CHARSET_ESCAPES.update({ + "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING), + "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING), + "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING), + "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING), + "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING), + "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING), +}) +UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES) +UNICODE_CHARSET_ESCAPES.update({ + "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING), + "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING), + "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING), + "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING), + "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING), + "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING), +}) + # Positional escape sequences. POSITION_ESCAPES = { "A": StartOfString(), @@ -4476,6 +4554,20 @@ "M": EndOfWord(), "Z": EndOfString(), } +ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES) +ASCII_POSITION_ESCAPES.update({ + "b": Boundary(encoding=ASCII_ENCODING), + "B": Boundary(False, encoding=ASCII_ENCODING), + "m": StartOfWord(encoding=ASCII_ENCODING), + "M": EndOfWord(encoding=ASCII_ENCODING), +}) +UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES) +UNICODE_POSITION_ESCAPES.update({ + "b": Boundary(encoding=UNICODE_ENCODING), + "B": Boundary(False, encoding=UNICODE_ENCODING), + "m": StartOfWord(encoding=UNICODE_ENCODING), + "M": EndOfWord(encoding=UNICODE_ENCODING), +}) # Positional escape sequences when WORD flag set. WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/regex_3/regex.py new/regex-2025.9.1/regex_3/regex.py --- old/regex-2024.11.6/regex_3/regex.py 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/regex_3/regex.py 2025-09-01 23:19:18.000000000 +0200 @@ -241,7 +241,7 @@ "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__", "RegexFlag"] -__version__ = "2.5.148" +__version__ = "2.5.161" # -------------------------------------------------------------------- # Public interface. 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/regex_3/test_regex.py new/regex-2025.9.1/regex_3/test_regex.py --- old/regex-2024.11.6/regex_3/test_regex.py 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/regex_3/test_regex.py 2025-09-01 23:19:18.000000000 +0200 @@ -4356,6 +4356,58 @@ self.assertEqual(bool(regex.match(r'<thinking>.*?</thinking>', '<thinking>xyz abc foo ', partial=True)), True) self.assertEqual(bool(regex.match(r'<thinking>.*?</thinking>', '<thinking>xyz abc foo bar', partial=True)), True) + # Git issue 551: + self.assertEqual(bool(regex.match(r'(?V1)[[\s\S]]', 'a')), True) + self.assertEqual(bool(regex.match(r'(?V1)[[\s\S]-a]', 'a')), True) + self.assertEqual(bool(regex.match(r'(?V1)[[\s\S]--a]', 'a')), False) + self.assertEqual(bool(regex.match(r'(?V1)[[a-z]--b]', 'a')), True) + self.assertEqual(bool(regex.match(r'(?V1)[[\s\S]--b]', 'a')), True) + self.assertEqual(bool(regex.match(r'(?V1)[a-[\s\S]]', 'a')), True) + self.assertEqual(bool(regex.match(r'(?V1)[a--[\s\S]]', 'a')), False) + + self.assertEqual(regex.search(r'(?ifu)(H\N{LATIN SMALL LETTER O WITH DIAERESIS}gskolan?)[\\s\\S]*p', + 'Yrkesh\N{LATIN SMALL LETTER O WITH DIAERESIS}gskola . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen . Studie\N{LATIN SMALL LETTER A WITH DIAERESIS}mnen'), + None) + + # Git issue 572: Inline ASCII modifier doesn't seem to affect anything + self.assertEqual(bool(regex.match(r'\d', '\uFF19')), True) + self.assertEqual(bool(regex.match(r'(?a:\d)', '\uFF19')), False) + + # Git issue 575: Issues with ASCII/Unicode modifiers + self.assertEqual(regex.findall('\\d', '9\uFF19'), ['9', '\uff19']) + self.assertEqual(regex.findall('(?u:\\d)', '9\uFF19'), ['9', '\uff19']) + self.assertEqual(regex.findall('(?a:\\d)', '9\uFF19'), ['9']) + + self.assertEqual(regex.findall('\\d', '9\uFF19', flags=regex.U), ['9', '\uff19']) + self.assertEqual(regex.findall('(?u:\\d)', '9\uFF19', flags=regex.U), ['9', '\uff19']) + self.assertEqual(regex.findall('(?a:\\d)', '9\uFF19', flags=regex.U), ['9']) + + self.assertEqual(regex.findall('\\d', '9\uFF19', flags=regex.A), ['9']) + self.assertEqual(regex.findall('(?u:\\d)', '9\uFF19', flags=regex.A), ['9', '\uff19']) + self.assertEqual(regex.findall('(?a:\\d)', '9\uFF19', flags=regex.A), ['9']) + + self.assertEqual(len(regex.findall(r'\p{L}', ''.join(chr(c) for c in range(0x100)), flags=0)), 117) + self.assertEqual(len(regex.findall(r'\p{L}', ''.join(chr(c) for c in range(0x100)), flags=regex.A)), 52) + self.assertEqual(len(regex.findall(r'\p{L}', ''.join(chr(c) for c in range(0x100)), flags=regex.U)), 117) + + self.assertEqual(len(regex.findall(r'(?a:\p{L})', ''.join(chr(c) for c in range(0x100)), flags=0)), 52) + self.assertEqual(len(regex.findall(r'(?a:\p{L})', ''.join(chr(c) for c in range(0x100)), flags=regex.A)), 52) + self.assertEqual(len(regex.findall(r'(?a:\p{L})', ''.join(chr(c) for c in range(0x100)), flags=regex.U)), 52) + + self.assertEqual(len(regex.findall(r'(?u:\p{L})', ''.join(chr(c) for c in range(0x100)), flags=0)), 117) + self.assertEqual(len(regex.findall(r'(?u:\p{L})', ''.join(chr(c) for c in range(0x100)), flags=regex.A)), 117) + self.assertEqual(len(regex.findall(r'(?u:\p{L})', ''.join(chr(c) for c in range(0x100)), 
flags=regex.U)), 117) + + # Git issue 580: Regression in v2025.7.31: \P{L} no longer matches in simple patterns + self.assertEqual(bool(regex.match(r"\A\P{L}?\p{L}", "hello,")), True) + self.assertEqual(bool(regex.fullmatch(r"\A\P{L}*(?P<w>\p{L}+)\P{L}*\Z", "hello,")), True) + + # Git issue 584: AttributeError: 'AnyAll' object has no attribute 'positive' + self.assertEqual(bool(regex.compile('(\\s|\\S)')), True) + + # Git PR 585: Fix AttributeError: 'AnyAll' object has no attribute '_key' + self.assertEqual(bool(regex.compile('(?:[\\S\\s]|[A-D][M-Z])')), True) + def test_fuzzy_ext(self): self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}', 'e')), True) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/setup.py new/regex-2025.9.1/setup.py --- old/regex-2024.11.6/setup.py 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/setup.py 2025-09-01 23:19:18.000000000 +0200 @@ -3,41 +3,7 @@ from setuptools import setup, Extension from os.path import join -with open('README.rst', encoding='utf-8') as file: - long_description = file.read() - setup( - name='regex', - version='2024.11.6', - description='Alternative regular expression module, to replace re.', - long_description=long_description, - long_description_content_type='text/x-rst', - author='Matthew Barnett', - author_email='re...@mrabarnett.plus.com', - url='https://github.com/mrabarnett/mrab-regex', - license='Apache Software License', - - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Programming Language :: Python :: 3.13', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Text Processing', - 'Topic :: Text Processing :: General', - ], - python_requires='>=3.8', - - package_dir={'regex': 'regex_3'}, - py_modules=['regex.__init__', 'regex.regex', 'regex._regex_core', - 'regex.test_regex'], ext_modules=[Extension('regex._regex', [join('regex_3', '_regex.c'), join('regex_3', '_regex_unicode.c')])], ) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/regex-2024.11.6/tools/build_regex_unicode.py new/regex-2025.9.1/tools/build_regex_unicode.py --- old/regex-2024.11.6/tools/build_regex_unicode.py 2024-11-06 20:49:51.000000000 +0100 +++ new/regex-2025.9.1/tools/build_regex_unicode.py 2025-09-01 23:19:18.000000000 +0200 @@ -793,9 +793,6 @@ raise ValueError('cannot determine C type for {}..{}'.format(lower, upper)) -def is_binary(property): - return sum(1 for val in val_list if val['id'] != 0) == 1 - def count_ranges(property): count = 0 default_id = property['values'][munge(property['default'])]['id']
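
Note: the behavioural changes above (the scoped ASCII/Unicode handling and the AnyAll fixes from Git issue 584 / PR 585) can be smoke-tested against the packaged module. The snippet below is a minimal sketch, not part of the package; it only restates expectations taken from the new upstream test cases in test_regex.py quoted above.

  import regex

  # Git issue 584 / PR 585: these patterns used to raise AttributeError
  # ('AnyAll' object has no attribute 'positive' / '_key') and must now compile.
  regex.compile('(\\s|\\S)')
  regex.compile('(?:[\\S\\s]|[A-D][M-Z])')

  # Git issues 572/575: the inline (?a:...) modifier now restricts \d to
  # ASCII digits; U+FF19 is FULLWIDTH DIGIT NINE.
  assert regex.match(r'\d', '\uFF19') is not None
  assert regex.match(r'(?a:\d)', '\uFF19') is None
  assert regex.findall(r'(?u:\d)', '9\uFF19', flags=regex.A) == ['9', '\uFF19']

  print('post-update smoke test passed')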