https://github.com/python/cpython/commit/e66f4a5a9c7ce744030d6352bf5575639b1096cc
commit: e66f4a5a9c7ce744030d6352bf5575639b1096cc
branch: main
author: James <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-02-12T18:50:40+02:00
summary:
gh-80667: Fix case-sensitivity of some Unicode literal escapes (GH-107281)
Lookup for CJK ideograms and Hangul syllables is now case-insensitive,
as is the case for other character names.
files:
A Misc/NEWS.d/next/Core_and_Builtins/2023-07-26-00-03-00.gh-issue-80667.N7Dh8B.rst
M Lib/test/test_ucn.py
M Modules/unicodedata.c
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index 0e2c25aaff2fe9..0c641a455c0747 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -88,6 +88,9 @@ def test_hangul_syllables(self):
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
+ self.checkletter("haNGul SYllABle WAe", '\uc65c')
+ self.checkletter("HAngUL syLLabLE waE", '\uc65c')
+
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
def test_cjk_unified_ideographs(self):
@@ -103,6 +106,11 @@ def test_cjk_unified_ideographs(self):
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")
+ self.checkletter("cjK UniFIeD idEogRAph-3aBc", "\u3abc")
+ self.checkletter("CJk uNIfiEd IDeOGraPH-3AbC", "\u3abc")
+ self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
+ self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
+
def test_bmp_characters(self):
for code in range(0x10000):
char = chr(code)
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2023-07-26-00-03-00.gh-issue-80667.N7Dh8B.rst b/Misc/NEWS.d/next/Core_and_Builtins/2023-07-26-00-03-00.gh-issue-80667.N7Dh8B.rst
new file mode 100644
index 00000000000000..db87a5ed9c7fc2
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2023-07-26-00-03-00.gh-issue-80667.N7Dh8B.rst
@@ -0,0 +1,2 @@
+Literals using the ``\N{name}`` escape syntax can now construct CJK
+ideographs and Hangul syllables using case-insensitive names.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 091e6bcb9f3f49..44ffedec3840fe 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1405,7 +1405,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
if (len1 <= *len)
continue;
- if (strncmp(str, s, len1) == 0) {
+ if (PyOS_strnicmp(str, s, len1) == 0) {
*len = len1;
*pos = i;
}
@@ -1437,7 +1437,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
* PUA */
/* Check for hangul syllables. */
- if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
+ if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
int len, L = -1, V = -1, T = -1;
const char *pos = name + 16;
find_syllable(pos, &len, &L, LCount, 0);
@@ -1455,7 +1455,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
}
/* Check for unified ideographs. */
- if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+ if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
unsigned int v;
v = 0;
@@ -1465,10 +1465,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
return 0;
while (namelen--) {
v *= 16;
- if (*name >= '0' && *name <= '9')
- v += *name - '0';
- else if (*name >= 'A' && *name <= 'F')
- v += *name - 'A' + 10;
+ Py_UCS1 c = Py_TOUPPER(*name);
+ if (c >= '0' && c <= '9')
+ v += c - '0';
+ else if (c >= 'A' && c <= 'F')
+ v += c - 'A' + 10;
else
return 0;
name++;
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]