* lib/regcomp.c [!_LIBC]: Include localeinfo.h. (re_set_fastmap): Remove icase arg, since it was not right for initial bytes in multibyte locales. Change ch arg to be unsigned char so that callers need not cast. All callers changed. (re_set_fastmap_icase): New function. (re_compile_fastmap_iter): Use it. * modules/regex (Depends-on): Add localeinfo. * tests/test-regex.c (main): Test for the bug. --- ChangeLog | 10 ++++++++ lib/regcomp.c | 61 ++++++++++++++++++++++++++++++---------------- modules/regex | 1 + tests/test-regex.c | 48 ++++++++++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 21 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 8853ded780..d3c43b42a5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,15 @@ 2026-05-05 Paul Eggert <[email protected]> + regex: fix glibc bug 20381 + * lib/regcomp.c [!_LIBC]: Include localeinfo.h. + (re_set_fastmap): Remove icase arg, since it was not right + for initial bytes in multibyte locales. Change ch arg to + be unsigned char so that callers need not cast. All callers changed. + (re_set_fastmap_icase): New function. + (re_compile_fastmap_iter): Use it. + * modules/regex (Depends-on): Add localeinfo. + * tests/test-regex.c (main): Test for the bug. + localeinfo: port to single-byte non-__STDC_ISO_10646__ * lib/localeinfo.c (CHAR32_T_IS_UNICODE): New macro. (case_folded_counterparts): In single-byte locales where diff --git a/lib/regcomp.c b/lib/regcomp.c index 3580ea4aa0..ab8716991c 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -21,6 +21,12 @@ # include <locale/weight.h> #endif +/* The localeinfo-related code fixes glibc bug 20381. + Someday this fix should be merged into glibc. */ +#ifndef _LIBC +# include "localeinfo.h" +#endif + static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax); static void re_compile_fastmap_iter (regex_t *bufp, @@ -267,11 +273,31 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) weak_alias (__re_compile_fastmap, re_compile_fastmap) static __always_inline void -re_set_fastmap (char *fastmap, bool icase, int ch) +re_set_fastmap (char *fastmap, unsigned char ch) { fastmap[ch] = 1; - if (icase) - fastmap[tolower (ch)] = 1; +} + +/* Record in FASTMAP the initial byte of the representations of all + characters that match WC ignoring case, other than WC itself. + Use MBS as a scratch state. 
*/ + +static void +re_set_fastmap_icase (char *fastmap, wchar_t wc, mbstate_t *mbs) +{ +#ifdef _LIBC + wchar_t folded[1] = {__towlower (wc)}; + int nfolded = folded[0] != wc; +#else + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int nfolded = case_folded_counterparts (wc, folded); +#endif + for (int i = 0; i < nfolded; i++) + { + char buf[MB_LEN_MAX]; + if (__wcrtomb (buf, folded[i], mbs) != (size_t) -1) + re_set_fastmap (fastmap, buf[0]); + } } /* Helper function for re_compile_fastmap. @@ -283,7 +309,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, { re_dfa_t *dfa = bufp->buffer; Idx node_cnt; - bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) { Idx node = init_state->nodes.elems[node_cnt]; @@ -291,8 +316,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, if (type == CHARACTER) { - re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + re_set_fastmap (fastmap, dfa->nodes[node].opr.c); + if (bufp->syntax & RE_ICASE) { unsigned char buf[MB_LEN_MAX]; unsigned char *p; @@ -307,10 +332,8 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, *p++ = dfa->nodes[node].opr.c; memset (&state, '\0', sizeof (state)); if (__mbrtowc (&wc, (const char *) buf, p - buf, - &state) == p - buf - && (__wcrtomb ((char *) buf, __towlower (wc), &state) - != (size_t) -1)) - re_set_fastmap (fastmap, false, buf[0]); + &state) == p - buf) + re_set_fastmap_icase (fastmap, wc, &state); } } else if (type == SIMPLE_BRACKET) @@ -322,7 +345,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (w & ((bitset_word_t) 1 << j)) - re_set_fastmap (fastmap, icase, ch); + re_set_fastmap (fastmap, ch); } } else if (type == COMPLEX_BRACKET) @@ -344,7 +367,7 @@ 
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); for (i = 0; i < SBC_MAX; ++i) if (table[i] < 0) - re_set_fastmap (fastmap, icase, i); + re_set_fastmap (fastmap, i); } #endif /* _LIBC */ @@ -365,7 +388,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, mbstate_t mbs; memset (&mbs, 0, sizeof (mbs)); if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) - re_set_fastmap (fastmap, false, (int) c); + re_set_fastmap (fastmap, c); } while (++c != 0); } @@ -375,17 +398,13 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, /* ... Else catch all bytes which can start the mbchars. */ for (i = 0; i < cset->nmbchars; ++i) { - char buf[256]; + char buf[MB_LEN_MAX]; mbstate_t state; memset (&state, '\0', sizeof (state)); if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) - re_set_fastmap (fastmap, icase, *(unsigned char *) buf); - if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) - { - if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state) - != (size_t) -1) - re_set_fastmap (fastmap, false, *(unsigned char *) buf); - } + re_set_fastmap (fastmap, buf[0]); + if (bufp->syntax & RE_ICASE) + re_set_fastmap_icase (fastmap, cset->mbchars[i], &state); } } } diff --git a/modules/regex b/modules/regex index acc5d6c534..3026772b89 100644 --- a/modules/regex +++ b/modules/regex @@ -35,6 +35,7 @@ intprops [test $ac_use_included_regex = yes] langinfo-h [test $ac_use_included_regex = yes] libc-config [test $ac_use_included_regex = yes] limits-h [test $ac_use_included_regex = yes] +localeinfo [test $ac_use_included_regex = yes] lock [test $ac_use_included_regex = yes] malloc-gnu [test $ac_use_included_regex = yes] mbrtoc32-regular [test $ac_use_included_regex = yes] diff --git a/tests/test-regex.c b/tests/test-regex.c index 0ab97819d2..ae4b34a300 100644 --- a/tests/test-regex.c +++ b/tests/test-regex.c @@ -18,6 +18,7 @@ #include "regex.h" +#include 
<ctype.h> #include <locale.h> #include <limits.h> #include <stdarg.h> @@ -255,6 +256,53 @@ main (void) } } + /* Test for glibc bug 20381 + <https://sourceware.org/bugzilla/show_bug.cgi?id=20381>. */ + if (setlocale (LC_ALL, "el_GR.iso88597") + || setlocale (LC_ALL, "el_GR.ISO8859-7") + || setlocale (LC_ALL, "el_GR.iso8859-7")) + { + /* Check this only in Greek locales that seem to be working. + In macOS 26, for example, setlocale (LC_ALL, "el_GR.ISO8859-7") + succeeds but acts like the C locale. */ + if (toupper (0xf2) == 0xd3 && toupper (0xf3) == 0xd3) + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) + { + static char const str[3][2] = { "\xd3", "\xf2", "\xf3" }; + regex_t re; + int err = regcomp (&re, str[i], REG_ICASE | REG_NOSUB); + if (err) + { + char buf[500]; + regerror (err, &re, buf, sizeof buf); + report_error ("regcomp \\x%02x failed: %s", + (unsigned char) str[i][0], buf); + continue; + } + + int with = regexec (&re, str[j], 0, NULL, 0); + free (re.fastmap); + re.fastmap = NULL; + re.fastmap_accurate = 0; + int without = regexec (&re, str[j], 0, NULL, 0); + if (with != without) + report_error + ("fastmap mismatch: pattern = \\x%02x, string = \\x%02x," + " with = %d, without = %d", + (unsigned char) str[i][0], (unsigned char) str[j][0], + with, without); + + regfree (&re); + } + + if (! setlocale (LC_ALL, "C")) + { + report_error ("setlocale \"C\" failed"); + return exit_status; + } + } + if (setlocale (LC_ALL, "tr_TR.UTF-8")) { if (really_utf8 () && towupper (L'i') == 0x0130 /* U+0130; see below. */) -- 2.54.0
