From fc50d711dbcd7fcf83f111e61b1c8664d4e80403 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sun, 2 Mar 2014 17:08:36 +0900
Subject: [PATCH 1/2] grep: optimization of bracket expression for non-UTF8 locales

If MBCSET is non-inverted and doesn't include neither character classes
including multibyte characters, range expressions, equivalence classes
nor collating elements, replace it to a simple CSET.

* src/dfa.c (addtok): Apply replacement of MBCSET to simple CSET for not
only UTF8 locale but non-UTF8 locales.
---
 src/dfa.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 80bb807..585a599 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1598,10 +1598,11 @@ addtok (token t)
           work_mbc->nchars = 0;
         }
 
-      /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
+      /* If the MBCSET is non-inverted and doesn't include neither
+         character classes including multibyte characters, range
+         expressions, equivalence classes nor collating elements,
+         it can be replaced to a simple CSET. */
       if (work_mbc->invert
-          || (!using_utf8 () && work_mbc->cset != -1)
-          || work_mbc->nchars != 0
           || work_mbc->nch_classes != 0
           || work_mbc->nranges != 0
           || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
@@ -1616,7 +1617,6 @@ addtok (token t)
              that the mbcset is empty now.  Do nothing in that case.  */
           if (work_mbc->cset != -1)
             {
-              assert (using_utf8 ());
               addtok (CSET + work_mbc->cset);
               if (need_or)
                 addtok (OR);
-- 
1.8.5.2


From 9f294713850fc0bdf748b160f2fa27995c1ff5f1 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sun, 2 Mar 2014 17:19:47 +0900
Subject: [PATCH 2/2] grep: revert removal of trivial_case_ignore.

Revive trivial_case_ignore function in order to be able to use kwset.

* src/main.c (trivial_case_ignore): New function.
---
 src/main.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/src/main.c b/src/main.c
index 7c04d77..2ee585a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1867,6 +1867,97 @@ parse_grep_colors (void)
       return;
 }
 
+/* If the newline-separated regular expressions, KEYS (with length, LEN
+   and no trailing NUL byte), are amenable to transformation into
+   otherwise equivalent case-ignoring ones, perform the transformation,
+   put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
+   and return true.  Otherwise, return false.  */
+#define MBRTOWC(pwc, s, n, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
+   mbrtowc ((pwc), (s), (n), (ps)))
+#define WCRTOMB(s, wc, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(s) = wctob ((wint_t) (wc)), 1) : \
+   wcrtomb ((s), (wc), (ps)))
+
+static bool
+trivial_case_ignore (size_t len, char const *keys,
+                     size_t *new_len, char **new_keys)
+{
+  /* FIXME: consider removing the following restriction:
+     Reject if KEYS contain ASCII '\\' or '['.  */
+  if (memchr (keys, '\\', len) || memchr (keys, '[', len))
+    return false;
+
+  /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
+     other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
+     maps to [BC], which requires MB_CUR_MAX + 3 bytes.   */
+  *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
+  char *p = *new_keys;
+
+  mbstate_t mb_state;
+  memset (&mb_state, 0, sizeof mb_state);
+  while (len)
+    {
+      wchar_t wc;
+      int n = MBRTOWC (&wc, keys, len, &mb_state);
+
+      /* For an invalid, incomplete or L'\0', skip this optimization.  */
+      if (n <= 0)
+        {
+        skip_case_ignore_optimization:
+          free (*new_keys);
+          return false;
+        }
+
+      char const *orig = keys;
+      keys += n;
+      len -= n;
+
+      if (!iswalpha (wc))
+        {
+          memcpy (p, orig, n);
+          p += n;
+        }
+      else
+        {
+          *p++ = '[';
+          memcpy (p, orig, n);
+          p += n;
+
+          wint_t folded = towlower (wc);
+          if (folded != wc)
+            {
+              char buf[MB_CUR_MAX];
+              int n2 = WCRTOMB (buf, folded, &mb_state);
+              if (n2 <= 0)
+                goto skip_case_ignore_optimization;
+              assert (n2 <= MB_CUR_MAX);
+              memcpy (p, buf, n2);
+              p += n2;
+            }
+          folded = towupper (wc);
+          if (folded != wc)
+            {
+              char buf[MB_CUR_MAX];
+              int n2 = WCRTOMB (buf, folded, &mb_state);
+              if (n2 <= 0)
+                goto skip_case_ignore_optimization;
+              assert (n2 <= MB_CUR_MAX);
+              memcpy (p, buf, n2);
+              p += n2;
+            }
+
+          *p++ = ']';
+        }
+    }
+
+  *new_len = p - *new_keys;
+
+  return true;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2261,6 +2352,35 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
+  /* As currently implemented, case-insensitive matching is expensive in
+     multi-byte locales because of a few outlier locales in which some
+     characters change size when converted to upper or lower case.  To
+     accommodate those, we revert to searching the input one line at a
+     time, rather than using the much more efficient buffer search.
+     However, if we have a regular expression, /foo/i, we can convert
+     it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
+     avoid the expensive read-and-process-a-line-at-a-time requirement.
+     Optimize-away the "-i" option, when possible, converting each
+     candidate alpha, C, in the regexp to [Cc].  */
+  if (match_icase)
+    {
+      size_t new_keycc;
+      char *new_keys;
+      /* It is not possible with -F, not useful with -P (pcre) and there is no
+         point when there is no regexp.  It also depends on which constructs
+         appear in the regexp.  See trivial_case_ignore for those details.  */
+      if (keycc
+          && ! (matcher
+                && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre")))
+          && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
+        {
+          match_icase = 0;
+          free (keys);
+          keys = new_keys;
+          keycc = new_keycc;
+        }
+    }
+
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1)
     build_mbclen_cache ();
-- 
1.8.5.2