[Python-checkins] [3.13] gh-53203: Fix strptime() for %c, %x and %X formats on some locales (GH-135971) (GH-136020)

serhiy-storchaka Fri, 27 Jun 2025 01:31:08 -0700

https://github.com/python/cpython/commit/ea25f4a8ecc1fd32960ce4c3e8d27f85a1bf9477
commit: ea25f4a8ecc1fd32960ce4c3e8d27f85a1bf9477
branch: 3.13
author: Miss Islington (bot) <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-06-27T08:12:58Z
summary:


[3.13] gh-53203: Fix strptime() for %c, %x and %X formats on some locales 
(GH-135971) (GH-136020)

* Add detection of decimal non-ASCII alt digits.
* Add support of non-decimal alt digits on locale lzh_TW.
* Accept only numbers in correct range if alt digits are known.
* Fix bug in detecting the position of the week day name on locales byn_ER and 
wal_ET.
* Fix support of single-digit hour on locales ar_SA and bg_BG.
* Add support for %T, %R, %r, %C, %OC.
* Prepare code to use nl_langinfo().

(cherry picked from commit 07183ebce36462aaaea4d20e0502b20821dd2682)

Co-authored-by: Serhiy Storchaka <[email protected]>

files:
A Misc/NEWS.d/next/Library/2025-06-26-11-52-40.gh-issue-53203.TMigBr.rst
M Lib/_strptime.py
M Lib/test/test_strptime.py

diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index bf92a0b028c9ab..f570cf820d5054 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -14,6 +14,7 @@
 import time
 import locale
 import calendar
+import re
 from re import compile as re_compile
 from re import sub as re_sub
 from re import IGNORECASE
@@ -41,6 +42,21 @@ def _findall(haystack, needle):
         yield i
         i += len(needle)
 
+
+lzh_TW_alt_digits = (
+    # 〇:一:二:三:四:五:六:七:八:九
+    '\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
+    '\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
+    # 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
+    '\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
+    '\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', 
'\u5341\u4e5d',
+    # 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
+    '\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
+    '\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', 
'\u5eff\u4e5d',
+    # 卅:卅一
+    '\u5345', '\u5345\u4e00')
+
+
 class LocaleTime(object):
     """Stores and handles locale-specific information related to time.
 
@@ -84,6 +100,7 @@ def __init__(self):
         self.__calc_weekday()
         self.__calc_month()
         self.__calc_am_pm()
+        self.__calc_alt_digits()
         self.__calc_timezone()
         self.__calc_date_time()
         if _getlang() != self.lang:
@@ -119,9 +136,43 @@ def __calc_am_pm(self):
             am_pm.append(time.strftime("%p", time_tuple).lower().strip())
         self.am_pm = am_pm
 
+    def __calc_alt_digits(self):
+        # Set self.LC_alt_digits by using time.strftime().
+
+        # The magic data should contain all decimal digits.
+        time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
+        s = time.strftime("%x%X", time_tuple)
+        if s.isascii():
+            # Fast path -- all digits are ASCII.
+            self.LC_alt_digits = ()
+            return
+
+        digits = ''.join(sorted(set(re.findall(r'\d', s))))
+        if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
+            # All 10 decimal digits from the same set.
+            if digits.isascii():
+                # All digits are ASCII.
+                self.LC_alt_digits = ()
+                return
+
+            self.LC_alt_digits = [a + b for a in digits for b in digits]
+            # Test whether the numbers contain leading zero.
+            time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
+            if self.LC_alt_digits[1] not in time.strftime("%x %X", 
time_tuple2):
+                self.LC_alt_digits[:10] = digits
+            return
+
+        # Either non-Gregorian calendar or non-decimal numbers.
+        if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
+            # lzh_TW
+            self.LC_alt_digits = lzh_TW_alt_digits
+            return
+
+        self.LC_alt_digits = None
+
     def __calc_date_time(self):
-        # Set self.date_time, self.date, & self.time by using
-        # time.strftime().
+        # Set self.LC_date_time, self.LC_date, self.LC_time and
+        # self.LC_time_ampm by using time.strftime().
 
         # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
         # overloaded numbers is minimized.  The order in which searches for
@@ -129,26 +180,32 @@ def __calc_date_time(self):
         # possible ambiguity for what something represents.
         time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
         time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
-        replacement_pairs = [
+        replacement_pairs = []
+
+        # Non-ASCII digits
+        if self.LC_alt_digits or self.LC_alt_digits is None:
+            for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
+                         (44, '%OM'), (55, '%OS'), (17, '%Od'),
+                         (3, '%Om'), (2, '%Ow'), (10, '%OI')]:
+                if self.LC_alt_digits is None:
+                    s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
+                    replacement_pairs.append((s, d))
+                    if n < 10:
+                        replacement_pairs.append((s[1], d))
+                elif len(self.LC_alt_digits) > n:
+                    replacement_pairs.append((self.LC_alt_digits[n], d))
+                else:
+                    replacement_pairs.append((time.strftime(d, time_tuple), d))
+        replacement_pairs += [
             ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
             ('44', '%M'), ('55', '%S'), ('76', '%j'),
             ('17', '%d'), ('03', '%m'), ('3', '%m'),
             # '3' needed for when no leading zero.
             ('2', '%w'), ('10', '%I'),
-            # Non-ASCII digits
-            ('\u0661\u0669\u0669\u0669', '%Y'),
-            ('\u0669\u0669', '%Oy'),
-            ('\u0662\u0662', '%OH'),
-            ('\u0664\u0664', '%OM'),
-            ('\u0665\u0665', '%OS'),
-            ('\u0661\u0667', '%Od'),
-            ('\u0660\u0663', '%Om'),
-            ('\u0663', '%Om'),
-            ('\u0662', '%Ow'),
-            ('\u0661\u0660', '%OI'),
         ]
+
         date_time = []
-        for directive in ('%c', '%x', '%X'):
+        for directive in ('%c', '%x', '%X', '%r'):
             current_format = time.strftime(directive, time_tuple).lower()
             current_format = current_format.replace('%', '%%')
             # The month and the day of the week formats are treated specially
@@ -172,9 +229,10 @@ def __calc_date_time(self):
                     if tz:
                         current_format = current_format.replace(tz, "%Z")
             # Transform all non-ASCII digits to digits in range U+0660 to 
U+0669.
-            current_format = re_sub(r'\d(?<![0-9])',
-                                    lambda m: chr(0x0660 + int(m[0])),
-                                    current_format)
+            if not current_format.isascii() and self.LC_alt_digits is None:
+                current_format = re_sub(r'\d(?<![0-9])',
+                                        lambda m: chr(0x0660 + int(m[0])),
+                                        current_format)
             for old, new in replacement_pairs:
                 current_format = current_format.replace(old, new)
             # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@@ -189,6 +247,7 @@ def __calc_date_time(self):
         self.LC_date_time = date_time[0]
         self.LC_date = date_time[1]
         self.LC_time = date_time[2]
+        self.LC_time_ampm = date_time[3]
 
     def __find_month_format(self, directive):
         """Find the month format appropriate for the current locale.
@@ -213,7 +272,7 @@ def __find_month_format(self, directive):
                 full_indices &= indices
             indices = set(_findall(datetime, self.a_month[m]))
             if abbr_indices is None:
-                abbr_indices = indices
+                abbr_indices = set(indices)
             else:
                 abbr_indices &= indices
             if not full_indices and not abbr_indices:
@@ -241,7 +300,7 @@ def __find_weekday_format(self, directive):
             if self.f_weekday[wd] != self.a_weekday[wd]:
                 indices = set(_findall(datetime, self.a_weekday[wd]))
             if abbr_indices is None:
-                abbr_indices = indices
+                abbr_indices = set(indices)
             else:
                 abbr_indices &= indices
             if not full_indices and not abbr_indices:
@@ -288,8 +347,10 @@ def __init__(self, locale_time=None):
             # The " [1-9]" part of the regex is to make %c from ANSI C work
             'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
             'f': r"(?P<f>[0-9]{1,6})",
-            'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
+            'H': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
+            'k': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
             'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
+            'l': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
             'G': r"(?P<G>\d\d\d\d)",
             'j': 
r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
             'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
@@ -312,16 +373,49 @@ def __init__(self, locale_time=None):
                                         for tz in tz_names),
                                 'Z'),
             '%': '%'}
-        for d in 'dmyHIMS':
-            mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
-        mapping['Ow'] = r'(?P<w>\d)'
+        if self.locale_time.LC_alt_digits is None:
+            for d in 'dmyCHIMS':
+                mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
+            mapping['Ow'] = r'(?P<w>\d)'
+        else:
+            mapping.update({
+                'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
+                                     '3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
+                'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
+                                     '1[0-2]|0[1-9]|[1-9]'),
+                'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
+                                     '[0-6]'),
+                'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
+                                     '[0-9][0-9]'),
+                'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
+                                     '[0-9][0-9]'),
+                'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
+                                     '2[0-3]|[0-1][0-9]|[0-9]'),
+                'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
+                                     '1[0-2]|0[1-9]|[1-9]'),
+                'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
+                                     '[0-5][0-9]|[0-9]'),
+                'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
+                                     '6[0-1]|[0-5][0-9]|[0-9]'),
+            })
+        mapping.update({
+            'e': mapping['d'],
+            'Oe': mapping['Od'],
+            'P': mapping['p'],
+            'Op': mapping['p'],
+            'W': mapping['U'].replace('U', 'W'),
+        })
         mapping['W'] = mapping['U'].replace('U', 'W')
+
         base.__init__(mapping)
+        base.__setitem__('T', self.pattern('%H:%M:%S'))
+        base.__setitem__('R', self.pattern('%H:%M'))
+        base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
         base.__setitem__('X', self.pattern(self.locale_time.LC_time))
         base.__setitem__('x', self.pattern(self.locale_time.LC_date))
         base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
 
-    def __seqToRE(self, to_convert, directive):
+    def __seqToRE(self, to_convert, directive, altregex=None):
         """Convert a list to a regex string for matching a directive.
 
         Want possible matching values to be from longest to shortest.  This
@@ -337,8 +431,9 @@ def __seqToRE(self, to_convert, directive):
         else:
             return ''
         regex = '|'.join(re_escape(stuff) for stuff in to_convert)
-        regex = '(?P<%s>%s' % (directive, regex)
-        return '%s)' % regex
+        if altregex is not None:
+            regex += '|' + altregex
+        return '(?P<%s>%s)' % (directive, regex)
 
     def pattern(self, format):
         """Return regex pattern for the format string.
@@ -365,7 +460,7 @@ def repl(m):
                     nonlocal day_of_month_in_format
                     day_of_month_in_format = True
             return self[format_char]
-        format = re_sub(r'%([OE]?\\?.?)', repl, format)
+        format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
         if day_of_month_in_format and not year_in_format:
             import warnings
             warnings.warn("""\
@@ -467,6 +562,15 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
     # values
     weekday = julian = None
     found_dict = found.groupdict()
+    if locale_time.LC_alt_digits:
+        def parse_int(s):
+            try:
+                return locale_time.LC_alt_digits.index(s)
+            except ValueError:
+                return int(s)
+    else:
+        parse_int = int
+
     for group_key in found_dict.keys():
         # Directives not explicitly handled below:
         #   c, x, X
@@ -474,30 +578,34 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
         #   U, W
         #      worthless without day of the week
         if group_key == 'y':
-            year = int(found_dict['y'])
-            # Open Group specification for strptime() states that a %y
-            #value in the range of [00, 68] is in the century 2000, while
-            #[69,99] is in the century 1900
-            if year <= 68:
-                year += 2000
+            year = parse_int(found_dict['y'])
+            if 'C' in found_dict:
+                century = parse_int(found_dict['C'])
+                year += century * 100
             else:
-                year += 1900
+                # Open Group specification for strptime() states that a %y
+                #value in the range of [00, 68] is in the century 2000, while
+                #[69,99] is in the century 1900
+                if year <= 68:
+                    year += 2000
+                else:
+                    year += 1900
         elif group_key == 'Y':
             year = int(found_dict['Y'])
         elif group_key == 'G':
             iso_year = int(found_dict['G'])
         elif group_key == 'm':
-            month = int(found_dict['m'])
+            month = parse_int(found_dict['m'])
         elif group_key == 'B':
             month = locale_time.f_month.index(found_dict['B'].lower())
         elif group_key == 'b':
             month = locale_time.a_month.index(found_dict['b'].lower())
         elif group_key == 'd':
-            day = int(found_dict['d'])
+            day = parse_int(found_dict['d'])
         elif group_key == 'H':
-            hour = int(found_dict['H'])
+            hour = parse_int(found_dict['H'])
         elif group_key == 'I':
-            hour = int(found_dict['I'])
+            hour = parse_int(found_dict['I'])
             ampm = found_dict.get('p', '').lower()
             # If there was no AM/PM indicator, we'll treat this like AM
             if ampm in ('', locale_time.am_pm[0]):
@@ -513,9 +621,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
                 if hour != 12:
                     hour += 12
         elif group_key == 'M':
-            minute = int(found_dict['M'])
+            minute = parse_int(found_dict['M'])
         elif group_key == 'S':
-            second = int(found_dict['S'])
+            second = parse_int(found_dict['S'])
         elif group_key == 'f':
             s = found_dict['f']
             # Pad to always return microseconds.
diff --git a/Lib/test/test_strptime.py b/Lib/test/test_strptime.py
index 1a42de611ab0ab..bd9ff5c26ca93d 100644
--- a/Lib/test/test_strptime.py
+++ b/Lib/test/test_strptime.py
@@ -224,14 +224,16 @@ def test_ValueError(self):
         self.assertRaises(ValueError, _strptime._strptime_time, 
data_string="%d",
                           format="%A")
         for bad_format in ("%", "% ", "%\n"):
-            with self.assertRaisesRegex(ValueError, "stray % in format "):
+            with (self.subTest(format=bad_format),
+                  self.assertRaisesRegex(ValueError, "stray % in format ")):
                 _strptime._strptime_time("2005", bad_format)
-        for bad_format in ("%e", "%Oe", "%O", "%O ", "%Ee", "%E", "%E ",
-                           "%.", "%+", "%_", "%~", "%\\",
+        for bad_format in ("%i", "%Oi", "%O", "%O ", "%Ee", "%E", "%E ",
+                           "%.", "%+", "%~", "%\\",
                            "%O.", "%O+", "%O_", "%O~", "%O\\"):
             directive = bad_format[1:].rstrip()
-            with self.assertRaisesRegex(ValueError,
-                    f"'{re.escape(directive)}' is a bad directive in format "):
+            with (self.subTest(format=bad_format),
+                  self.assertRaisesRegex(ValueError,
+                    f"'{re.escape(directive)}' is a bad directive in format 
")):
                 _strptime._strptime_time("2005", bad_format)
 
         msg_week_no_year_or_weekday = r"ISO week directive '%V' must be used 
with " \
@@ -486,13 +488,11 @@ def test_bad_timezone(self):
     # * Year is not included: ha_NG.
     # * Use non-Gregorian calendar: lo_LA, thai, th_TH.
     #   On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
-    #
-    # BUG: Generates regexp that does not match the current date and time
-    # for lzh_TW.
     @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
                       'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
                       'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
-                      'my_MM', 'or_IN', 'shn_MM', 'az_IR')
+                      'my_MM', 'or_IN', 'shn_MM', 'az_IR',
+                      'byn_ER', 'wal_ET', 'lzh_TW')
     def test_date_time_locale(self):
         # Test %c directive
         loc = locale.getlocale(locale.LC_TIME)[0]
@@ -531,11 +531,9 @@ def test_date_time_locale2(self):
 
     # NB: Does not roundtrip because use non-Gregorian calendar:
     # lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
-    # BUG: Generates regexp that does not match the current date
-    # for lzh_TW.
     @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
                       'he_IL', 'eu_ES', 'ar_AE',
-                      'az_IR', 'my_MM', 'or_IN', 'shn_MM')
+                      'az_IR', 'my_MM', 'or_IN', 'shn_MM', 'lzh_TW')
     def test_date_locale(self):
         # Test %x directive
         now = time.time()
@@ -555,7 +553,7 @@ def test_date_locale(self):
         "musl libc issue on Emscripten, bpo-46390"
     )
     @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
-                      'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
+                      'eu_ES', 'ar_AE', 'my_MM', 'shn_MM', 'lzh_TW')
     def test_date_locale2(self):
         # Test %x directive
         loc = locale.getlocale(locale.LC_TIME)[0]
@@ -571,11 +569,11 @@ def test_date_locale2(self):
     #   norwegian, nynorsk.
     # * Hours are in 12-hour notation without AM/PM indication: hy_AM,
     #   ms_MY, sm_WS.
-    # BUG: Generates regexp that does not match the current time for lzh_TW.
     @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
                       'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
                       'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
-                      'ti_ET', 'tig_ER', 'wal_ET')
+                      'ti_ET', 'tig_ER', 'wal_ET', 'lzh_TW',
+                      'ar_SA', 'bg_BG')
     def test_time_locale(self):
         # Test %X directive
         loc = locale.getlocale(locale.LC_TIME)[0]
diff --git 
a/Misc/NEWS.d/next/Library/2025-06-26-11-52-40.gh-issue-53203.TMigBr.rst 
b/Misc/NEWS.d/next/Library/2025-06-26-11-52-40.gh-issue-53203.TMigBr.rst
new file mode 100644
index 00000000000000..ba2fae49fdc933
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-06-26-11-52-40.gh-issue-53203.TMigBr.rst
@@ -0,0 +1,2 @@
+Fix :func:`time.strptime` for ``%c`` and ``%x`` formats on locales byn_ER,
+wal_ET and lzh_TW, and for ``%X`` format on locales ar_SA, bg_BG and lzh_TW.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

[Python-checkins] [3.13] gh-53203: Fix strptime() for %c, %x and %X formats on some locales (GH-135971) (GH-136020)

Reply via email to