From d63abe7ee990ee84dac366564e12d0c1b4102382 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 2 Mar 2014 23:02:22 -0800
Subject: [PATCH] grep: fix some unlikely bugs in trivial_case_ignore

* src/main.c (MBRTOWC, WCRTOMB): Reformat as per usual GNU style.
(trivial_case_ignore): Don't overrun buffer in the unusual case
when a character has both lowercase and uppercase counterparts.
Don't rely on undefined behavior when assigning out-of-range value
to an 'int'.  Simplify by avoiding unnecessary buffer copies.
Work even with shift encodings, by using mbsinit to disable the
optimization if we are not in the initial state when we replace B
by [BCD].
---
 src/main.c | 72 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 33 insertions(+), 39 deletions(-)

diff --git a/src/main.c b/src/main.c
index 2ee585a..14b7be2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1867,19 +1867,20 @@ parse_grep_colors (void)
   return;
 }
 
+#define MBRTOWC(pwc, s, n, ps) \
+  (MB_CUR_MAX == 1 \
+   ? (*(pwc) = btowc (*(unsigned char *) (s)), 1) \
+   : mbrtowc (pwc, s, n, ps))
+#define WCRTOMB(s, wc, ps) \
+  (MB_CUR_MAX == 1 \
+   ? (*(s) = wctob ((wint_t) (wc)), 1) \
+   : wcrtomb (s, wc, ps))
+
 /* If the newline-separated regular expressions, KEYS (with length, LEN
    and no trailing NUL byte), are amenable to transformation into
    otherwise equivalent case-ignoring ones, perform the transformation,
    put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
    and return true.  Otherwise, return false.  */
-#define MBRTOWC(pwc, s, n, ps) \
-  (MB_CUR_MAX == 1 ? \
-   (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
-   mbrtowc ((pwc), (s), (n), (ps)))
-#define WCRTOMB(s, wc, ps) \
-  (MB_CUR_MAX == 1 ? \
-   (*(s) = wctob ((wint_t) (wc)), 1) : \
-   wcrtomb ((s), (wc), (ps)))
 
 static bool
 trivial_case_ignore (size_t len, char const *keys,
@@ -1890,21 +1891,23 @@ trivial_case_ignore (size_t len, char const *keys,
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
     return false;
 
-  /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
-     other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
-     maps to [BC], which requires MB_CUR_MAX + 3 bytes.  */
-  *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
+  /* Worst case is that each byte B of KEYS is ASCII alphabetic and
+     the two two other_case(B) characters, C and D, each occupies
+     MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 *
+     MB_CUR_MAX + 3 bytes; this is bounded above by the constant
+     expression 2 * MB_LEN_MAX + 3.  */
+  *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3);
   char *p = *new_keys;
 
-  mbstate_t mb_state;
-  memset (&mb_state, 0, sizeof mb_state);
+  mbstate_t mb_state = { 0 };
   while (len)
     {
+      bool initial_state = mbsinit (&mb_state) != 0;
       wchar_t wc;
-      int n = MBRTOWC (&wc, keys, len, &mb_state);
+      size_t n = MBRTOWC (&wc, keys, len, &mb_state);
 
       /* For an invalid, incomplete or L'\0', skip this optimization.  */
-      if (n <= 0)
+      if ((size_t) -2 <= n)
         {
         skip_case_ignore_optimization:
           free (*new_keys);
@@ -1915,39 +1918,30 @@ trivial_case_ignore (size_t len, char const *keys,
       keys += n;
       len -= n;
 
-      if (!iswalpha (wc))
+      wint_t lc = towlower (wc);
+      wint_t uc = towupper (wc);
+      if (lc == wc && uc == wc)
         {
           memcpy (p, orig, n);
           p += n;
         }
+      else if (! initial_state)
+        goto skip_case_ignore_optimization;
       else
         {
           *p++ = '[';
           memcpy (p, orig, n);
           p += n;
 
-          wint_t folded = towlower (wc);
-          if (folded != wc)
-            {
-              char buf[MB_CUR_MAX];
-              int n2 = WCRTOMB (buf, folded, &mb_state);
-              if (n2 <= 0)
-                goto skip_case_ignore_optimization;
-              assert (n2 <= MB_CUR_MAX);
-              memcpy (p, buf, n2);
-              p += n2;
-            }
-          folded = towupper (wc);
-          if (folded != wc)
-            {
-              char buf[MB_CUR_MAX];
-              int n2 = WCRTOMB (buf, folded, &mb_state);
-              if (n2 <= 0)
-                goto skip_case_ignore_optimization;
-              assert (n2 <= MB_CUR_MAX);
-              memcpy (p, buf, n2);
-              p += n2;
-            }
+          size_t lcbytes = WCRTOMB (p, lc, &mb_state);
+          if (lcbytes == (size_t) -1)
+            goto skip_case_ignore_optimization;
+          p += lcbytes;
+
+          size_t ucbytes = WCRTOMB (p, uc, &mb_state);
+          if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state))
+            goto skip_case_ignore_optimization;
+          p += ucbytes;
 
           *p++ = ']';
         }
-- 
1.8.5.3
Thanks, I tweaked the ChangeLog entries a bit and pushed that. I also
pushed the attached patch, which fixes some new bugs and some bugs that
were reintroduced by the revival of trivial_case_ignore. I wish we
didn't need that function, as it is a bit of a kludge.
- bug#16912: [PATCH] no longer use CSET for non-UTF8 locale ... Norihiro Tanaka
- bug#16912: [PATCH] no longer use CSET for non-UTF8 lo... Paul Eggert
- bug#16912: [PATCH] no longer use CSET for non-UTF... Norihiro Tanaka
- bug#16912: [PATCH] no longer use CSET for non... Norihiro Tanaka
- bug#16912: [PATCH] no longer use CSET for... Paul Eggert
- bug#16912: [PATCH] no longer use CSET for non... Paul Eggert
- bug#16912: [PATCH] no longer use CSET for... Paolo Bonzini
- bug#16912: [PATCH] no longer use CSE... Norihiro Tanaka
- bug#16912: [PATCH] no longer use... Paolo Bonzini
- bug#16912: [PATCH] no longer... Norihiro Tanaka
- bug#16912: [PATCH] no longer... Paolo Bonzini
- bug#16912: [PATCH] no longer... Norihiro Tanaka
