From 9cf6688bce1e3aeb3ba61439dac41dd3dcc9b407 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sat, 1 Mar 2014 18:45:04 +0900
Subject: [PATCH 1/2] no longer use CSET for non-UTF8 locales in DFA engine

---
 src/dfa.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 80bb807..eed1020 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1045,7 +1045,8 @@ parse_bracket_exp (void)
                   if (!pred)
                     dfaerror (_("invalid character class"));
 
-                  if (MB_CUR_MAX > 1 && !pred->single_byte_only)
+                  if (MB_CUR_MAX > 1
+                      && (!using_utf8 () || !pred->single_byte_only))
                     {
                       /* Store the character class as wctype_t.  */
                       wctype_t wt = wctype (class);
@@ -1152,21 +1153,21 @@ parse_bracket_exp (void)
       if (case_fold)
         {
           wint_t folded = towlower (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
+          if (folded != wc && (!using_utf8 () || !setbit_wc (folded, ccl)))
             {
               REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
                                     work_mbc->nchars + 1);
               work_mbc->chars[work_mbc->nchars++] = folded;
             }
           folded = towupper (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
+          if (folded != wc && (!using_utf8 () || !setbit_wc (folded, ccl)))
             {
               REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
                                     work_mbc->nchars + 1);
               work_mbc->chars[work_mbc->nchars++] = folded;
             }
         }
-      if (!setbit_wc (wc, ccl))
+      if (!using_utf8 () || !setbit_wc (wc, ccl))
         {
           REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
                                 work_mbc->nchars + 1);
@@ -1600,7 +1601,6 @@ addtok (token t)
 
       /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
       if (work_mbc->invert
-          || (!using_utf8 () && work_mbc->cset != -1)
           || work_mbc->nchars != 0
           || work_mbc->nch_classes != 0
           || work_mbc->nranges != 0
-- 
1.8.5.2


From d9f9a5aedad9e81cf56b6c4e375e5f395e4e213a Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sat, 1 Mar 2014 18:45:18 +0900
Subject: [PATCH 2/2] revert: remove trivial_case_ignore

---
 src/main.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/src/main.c b/src/main.c
index 7c04d77..090fc4f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1867,6 +1867,83 @@ parse_grep_colors (void)
       return;
 }
 
+/* If the newline-separated regular expressions, KEYS (with length, LEN
+   and no trailing NUL byte), are amenable to transformation into
+   otherwise equivalent case-ignoring ones, perform the transformation,
+   put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
+   and return true.  Otherwise, return false.  */
+#define MBRTOWC(pwc, s, n, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
+   mbrtowc ((pwc), (s), (n), (ps)))
+#define WCRTOMB(s, wc, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(s) = wctob ((wint_t) (wc)), 1) : \
+   wcrtomb ((s), (wc), (ps)))
+
+static bool
+trivial_case_ignore (size_t len, char const *keys,
+                     size_t *new_len, char **new_keys)
+{
+  /* FIXME: consider removing the following restriction:
+     Reject if KEYS contain ASCII '\\' or '['.  */
+  if (memchr (keys, '\\', len) || memchr (keys, '[', len))
+    return false;
+
+  /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
+     other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
+     maps to [BC], which requires MB_CUR_MAX + 3 bytes.   */
+  *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
+  char *p = *new_keys;
+
+  mbstate_t mb_state;
+  memset (&mb_state, 0, sizeof mb_state);
+  while (len)
+    {
+      wchar_t wc;
+      int n = MBRTOWC (&wc, keys, len, &mb_state);
+
+      /* For an invalid, incomplete or L'\0', skip this optimization.  */
+      if (n <= 0)
+        {
+        skip_case_ignore_optimization:
+          free (*new_keys);
+          return false;
+        }
+
+      char const *orig = keys;
+      keys += n;
+      len -= n;
+
+      if (!iswalpha (wc))
+        {
+          memcpy (p, orig, n);
+          p += n;
+        }
+      else
+        {
+          *p++ = '[';
+          memcpy (p, orig, n);
+          p += n;
+
+          wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc);
+          char buf[MB_CUR_MAX];
+          int n2 = WCRTOMB (buf, wc2, &mb_state);
+          if (n2 <= 0)
+            goto skip_case_ignore_optimization;
+          assert (n2 <= MB_CUR_MAX);
+          memcpy (p, buf, n2);
+          p += n2;
+
+          *p++ = ']';
+        }
+    }
+
+  *new_len = p - *new_keys;
+
+  return true;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2261,6 +2338,35 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
+  /* As currently implemented, case-insensitive matching is expensive in
+     multi-byte locales because of a few outlier locales in which some
+     characters change size when converted to upper or lower case.  To
+     accommodate those, we revert to searching the input one line at a
+     time, rather than using the much more efficient buffer search.
+     However, if we have a regular expression, /foo/i, we can convert
+     it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
+     avoid the expensive read-and-process-a-line-at-a-time requirement.
+     Optimize-away the "-i" option, when possible, converting each
+     candidate alpha, C, in the regexp to [Cc].  */
+  if (match_icase)
+    {
+      size_t new_keycc;
+      char *new_keys;
+      /* It is not possible with -F, not useful with -P (pcre) and there is no
+         point when there is no regexp.  It also depends on which constructs
+         appear in the regexp.  See trivial_case_ignore for those details.  */
+      if (keycc
+          && ! (matcher
+                && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre")))
+          && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
+        {
+          match_icase = 0;
+          free (keys);
+          keys = new_keys;
+          keycc = new_keycc;
+        }
+    }
+
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1)
     build_mbclen_cache ();
-- 
1.8.5.2