bug#16912: [PATCH] no longer use CSET for non-UTF8 locale in DFA engine

Paul Eggert Sun, 02 Mar 2014 23:08:57 -0800
Thanks, I tweaked the ChangeLog entries a bit and pushed that. I also pushed the attached patch, which fixes some new bugs and some bugs that were reintroduced by the revival of trivial_case_ignore. I wish we didn't need that function, as it is a bit of a kludge.
From d63abe7ee990ee84dac366564e12d0c1b4102382 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 2 Mar 2014 23:02:22 -0800
Subject: [PATCH] grep: fix some unlikely bugs in trivial_case_ignore

* src/main.c (MBRTOWC, WCRTOMB): Reformat as per usual GNU style.
(trivial_case_ignore): Don't overrun buffer in the unusual case
when a character has both lowercase and uppercase counterparts.
Don't rely on undefined behavior when assigning out-of-range value
to an 'int'.  Simplify by avoiding unnecessary buffer copies.
Work even with shift encodings, by using mbsinit to
disable the optimization if we are not in the initial state
when we replace B by [BCD].
---
 src/main.c | 72 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 33 insertions(+), 39 deletions(-)

diff --git a/src/main.c b/src/main.c
index 2ee585a..14b7be2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1867,19 +1867,20 @@ parse_grep_colors (void)
       return;
 }
 
+#define MBRTOWC(pwc, s, n, ps) \
+  (MB_CUR_MAX == 1 \
+   ? (*(pwc) = btowc (*(unsigned char *) (s)), 1) \
+   : mbrtowc (pwc, s, n, ps))
+#define WCRTOMB(s, wc, ps) \
+  (MB_CUR_MAX == 1 \
+   ? (*(s) = wctob ((wint_t) (wc)), 1) \
+   : wcrtomb (s, wc, ps))
+
 /* If the newline-separated regular expressions, KEYS (with length, LEN
    and no trailing NUL byte), are amenable to transformation into
    otherwise equivalent case-ignoring ones, perform the transformation,
    put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
    and return true.  Otherwise, return false.  */
-#define MBRTOWC(pwc, s, n, ps) \
-  (MB_CUR_MAX == 1 ? \
-   (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
-   mbrtowc ((pwc), (s), (n), (ps)))
-#define WCRTOMB(s, wc, ps) \
-  (MB_CUR_MAX == 1 ? \
-   (*(s) = wctob ((wint_t) (wc)), 1) : \
-   wcrtomb ((s), (wc), (ps)))
 
 static bool
 trivial_case_ignore (size_t len, char const *keys,
@@ -1890,21 +1891,23 @@ trivial_case_ignore (size_t len, char const *keys,
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
     return false;
 
-  /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
-     other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
-     maps to [BC], which requires MB_CUR_MAX + 3 bytes.   */
-  *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
+  /* Worst case is that each byte B of KEYS is ASCII alphabetic and
+     the two two other_case(B) characters, C and D, each occupies
+     MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 *
+     MB_CUR_MAX + 3 bytes; this is bounded above by the constant
+     expression 2 * MB_LEN_MAX + 3.  */
+  *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3);
   char *p = *new_keys;
 
-  mbstate_t mb_state;
-  memset (&mb_state, 0, sizeof mb_state);
+  mbstate_t mb_state = { 0 };
   while (len)
     {
+      bool initial_state = mbsinit (&mb_state) != 0;
       wchar_t wc;
-      int n = MBRTOWC (&wc, keys, len, &mb_state);
+      size_t n = MBRTOWC (&wc, keys, len, &mb_state);
 
       /* For an invalid, incomplete or L'\0', skip this optimization.  */
-      if (n <= 0)
+      if ((size_t) -2 <= n)
         {
         skip_case_ignore_optimization:
           free (*new_keys);
@@ -1915,39 +1918,30 @@ trivial_case_ignore (size_t len, char const *keys,
       keys += n;
       len -= n;
 
-      if (!iswalpha (wc))
+      wint_t lc = towlower (wc);
+      wint_t uc = towupper (wc);
+      if (lc == wc && uc == wc)
         {
           memcpy (p, orig, n);
           p += n;
         }
+      else if (! initial_state)
+        goto skip_case_ignore_optimization;
       else
         {
           *p++ = '[';
           memcpy (p, orig, n);
           p += n;
 
-          wint_t folded = towlower (wc);
-          if (folded != wc)
-            {
-              char buf[MB_CUR_MAX];
-              int n2 = WCRTOMB (buf, folded, &mb_state);
-              if (n2 <= 0)
-                goto skip_case_ignore_optimization;
-              assert (n2 <= MB_CUR_MAX);
-              memcpy (p, buf, n2);
-              p += n2;
-            }
-          folded = towupper (wc);
-          if (folded != wc)
-            {
-              char buf[MB_CUR_MAX];
-              int n2 = WCRTOMB (buf, folded, &mb_state);
-              if (n2 <= 0)
-                goto skip_case_ignore_optimization;
-              assert (n2 <= MB_CUR_MAX);
-              memcpy (p, buf, n2);
-              p += n2;
-            }
+          size_t lcbytes = WCRTOMB (p, lc, &mb_state);
+          if (lcbytes == (size_t) -1)
+            goto skip_case_ignore_optimization;
+          p += lcbytes;
+
+          size_t ucbytes = WCRTOMB (p, uc, &mb_state);
+          if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state))
+            goto skip_case_ignore_optimization;
+          p += ucbytes;
 
           *p++ = ']';
         }
-- 
1.8.5.3
bug#16912: [PATCH] no longer use CSET for non-UTF8 locale in DFA engine

Reply via email to