* lib/localeinfo.c (CHAR32_T_IS_UNICODE): New macro. (case_folded_counterparts): In single-byte locales where char32_t is not known to be Unicode, check all 255 possibilities instead of hoping that char32_t is Unicode. * lib/localeinfo.h (CASE_FOLDED_BUFSIZE): Increase to 255. * modules/localeinfo (Depends-on): Add btoc32. --- ChangeLog | 8 ++++++++ lib/localeinfo.c | 50 +++++++++++++++++++++++++++++++++++++--------- lib/localeinfo.h | 6 +++--- modules/localeinfo | 1 + 4 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 3f1ddaa6f0..8853ded780 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2026-05-05 Paul Eggert <[email protected]> + localeinfo: port to single-byte non-__STDC_ISO_10646__ + * lib/localeinfo.c (CHAR32_T_IS_UNICODE): New macro. + (case_folded_counterparts): In single-byte locales where + char32_t is not known to be Unicode, check all 255 possibilities + instead of hoping that char32_t is Unicode. + * lib/localeinfo.h (CASE_FOLDED_BUFSIZE): Increase to 255. + * modules/localeinfo (Depends-on): Add btoc32. + localeinfo: don’t check U+03F2 in newer glibc * lib/localeinfo.c (lonesome_lower): Omit U+03F2 in recent Unicode. diff --git a/lib/localeinfo.c b/lib/localeinfo.c index 5a51d6105c..9e9bacad9f 100644 --- a/lib/localeinfo.c +++ b/lib/localeinfo.c @@ -127,6 +127,18 @@ static unsigned short int const lonesome_lower[] = c32tolower, and 1 for each entry in LONESOME_LOWER. */ verify (1 + 1 + countof (lonesome_lower) <= CASE_FOLDED_BUFSIZE); +/* Whether char32_t values are Unicode code points. + It is OK if only UTF-16 is supported, + since this file converts only single-byte encodings to char32_t + and in practice these encodings convert to characters in the BMP. */ +#ifdef GL_CHAR32_T_IS_UNICODE +# define CHAR32_T_IS_UNICODE GL_CHAR32_T_IS_UNICODE /* uchar-h-c23 */ +#elif defined __STDC_ISO_10646__ +# define CHAR32_T_IS_UNICODE 1 /* glibc, musl libc, Cygwin */ +#else +# define CHAR32_T_IS_UNICODE 0 +#endif + /* Find the characters equal to C after case-folding, other than C itself, and store them into FOLDED. Return the number of characters stored; this is zero if C is WEOF. */ @@ -136,16 +148,36 @@ case_folded_counterparts (wint_t c, char32_t folded[CASE_FOLDED_BUFSIZE]) { int n = 0; wint_t uc = c32toupper (c); - wint_t lc = c32tolower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && c32toupper (lc) == uc) - folded[n++] = lc; - for (int i = 0; i < countof (lonesome_lower); i++) + + if (CHAR32_T_IS_UNICODE || 1 < MB_CUR_MAX) { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && c32toupper (li) == uc) - folded[n++] = li; + /* char32_t is Unicode, or this is a multibyte locale where + it is impractical to look for all case-folded counterparts + and where guessing Unicode will not produce false positives + though it may miss some case-folded counterparts. */ + wint_t lc = c32tolower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && c32toupper (lc) == uc) + folded[n++] = lc; + for (int i = 0; i < countof (lonesome_lower); i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && c32toupper (li) == uc) + folded[n++] = li; + } } + else if (c != WEOF) + { + /* A single-byte locale where it is not known that char32_t is Unicode, + and C is not WEOF. Check all 255 possibilities for counterparts. */ + for (int i = 1; i <= UCHAR_MAX; i++) + { + wint_t li = btoc32 (i); + if (li != c && c32toupper (li) == uc) + folded[n++] = li; + } + } + return n; } diff --git a/lib/localeinfo.h b/lib/localeinfo.h index f8b3c970d9..6f8e0addfd 100644 --- a/lib/localeinfo.h +++ b/lib/localeinfo.h @@ -54,9 +54,9 @@ struct localeinfo extern void init_localeinfo (struct localeinfo *); /* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is a generous upper bound. */ -enum { CASE_FOLDED_BUFSIZE = 32 }; + counterparts of a single character, not counting the character itself. + Subtract from 256 one for U+0000. This is a generous upper bound. */ +enum { CASE_FOLDED_BUFSIZE = (unsigned char) -1 }; extern int case_folded_counterparts (wint_t, char32_t[CASE_FOLDED_BUFSIZE]); diff --git a/modules/localeinfo b/modules/localeinfo index 2fc871ee10..536a2a715b 100644 --- a/modules/localeinfo +++ b/modules/localeinfo @@ -7,6 +7,7 @@ lib/localeinfo.h Depends-on: bool +btoc32 c32tolower c32toupper c99 -- 2.54.0
