Hmm... it's not as clear-cut as I first thought.
(I built 2.17+ the above patch and put it in a directory named grep-2.18)

The following times 2.16, 2.17 and 2.17+patch two ways:

$ yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k
$ for i in 16 17 18; do echo $i; env LC_ALL=en_US.UTF-8 time
/p/p/grep-2.$i/bin/grep -i foobar k; done
16
       15.96 real        14.57 user         0.12 sys
17
        1.13 real         1.07 user         0.06 sys
18
        1.96 real         1.89 user         0.06 sys

The above search takes more than 70% longer with the proposed patch.

Contrast that with performance in the non-UTF8 ja_JP.eucJP locale:

$ yes $(printf '%078dm' 0)|head -10000 > in
$ for i in 16 17 18; do echo $i; env LC_ALL=ja_JP.eucJP time
/p/p/grep-2.$i/bin/grep -i n in; done
16
        0.03 real         0.02 user         0.00 sys
17
        2.98 real         2.96 user         0.00 sys
18
        0.02 real         0.02 user         0.00 sys

Using the jjj+foobar example, but with only 100k lines, we see there
was a 200x performance regression going from grep-2.16 to 2.17:

$ yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -100000 > k
$ for i in 16 17 18; do echo $i; env LC_ALL=ja_JP.eucJP time
/p/p/grep-2.$i/bin/grep -i foobar k; done
16
        0.15 real         0.14 user         0.00 sys
17
       27.74 real        27.72 user         0.01 sys
18
        0.11 real         0.11 user         0.00 sys

Obviously, I want to retain all of 2.17's performance gain in UTF-8 locales,
while avoiding the 200x penalty in multi-byte non-UTF8 locales like ja_JP.eucJP.
So I have prepared a better patch.
With the two attached commits (on top of 2.17), I get these timings,
i.e., the same 200x improvement with ja_JP.eucJP, and no regression
with en_US.UTF8)

$ for i in 16 17 18; do printf "$i: "; env LC_ALL=ja_JP.eucJP time
/p/p/grep-2.$i/bin/grep -i foobar k; done
16:         0.14 real         0.14 user         0.00 sys
17:        27.97 real        27.95 user         0.01 sys
18:         0.12 real         0.12 user         0.00 sys

$ for i in 16 17 18; do printf "$i: "; env LC_ALL=en_US.UTF-8 time
/p/p/grep-2.$i/bin/grep -i foobar k; done
16:         0.13 real         0.12 user         0.00 sys
17:         0.01 real         0.01 user         0.00 sys
18:         0.01 real         0.01 user         0.00 sys
From 07a4f69da701abfdee047f26c603002c20d4c7d4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:22:24 -0800
Subject: [PATCH 1/2] maint: factor out using_utf8 function for use in main.c

* src/searchutils.c (is_mb_middle): Use using_utf8 rather than
rolling our own.
(using_utf8): New function (copy of the one in dfa.c).
* src/search.h (using_utf8): Declare it.
---
 src/search.h      |  2 ++
 src/searchutils.c | 26 +++++++++++++++++++-------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/search.h b/src/search.h
index 12d0822..167e0e7 100644
--- a/src/search.h
+++ b/src/search.h
@@ -80,4 +80,6 @@ mb_case_map_apply (mb_len_map_t const *map, size_t *off, 
size_t *len)
     }
 }

+int using_utf8 (void);
+
 #endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index 3478417..51bba59 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -234,13 +234,8 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
-#if HAVE_LANGINFO_CODESET
-  static int is_utf8 = -1;
-
-  if (is_utf8 == -1)
-    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");

-  if (is_utf8 && buf - p > MB_CUR_MAX)
+  if (using_utf8 () && buf - p > MB_CUR_MAX)
     {
       for (p = buf; buf - p > MB_CUR_MAX; p--)
         if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
@@ -249,7 +244,6 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
       if (buf - p == MB_CUR_MAX)
         p = buf;
     }
-#endif

   memset (&cur_state, 0, sizeof cur_state);

@@ -283,3 +277,21 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
 }
 #endif /* MBS_SUPPORT */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding.  */
+int
+using_utf8 (void)
+{
+  static int utf8 = -1;
+  if (utf8 == -1)
+    {
+#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
+      utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
+#else
+      utf8 = 0;
+#endif
+    }
+
+  return utf8;
+}
-- 
1.9.0


From 24c1c313e4bfa17ab1275a5d70e0cc18c4aa1b35 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:31:43 -0800
Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte
 non-UTF8 locales

* src/main.c (trivial_case_ignore): Perform this optimization
only for UTF8 locales.  This rectifies a 200x performance
regression in multi-byte non-UTF8 locales like ja_JP.eucJP.
Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50
* NEWS (Bug fixes): Mention it.
---
 NEWS       | 5 +++++
 src/main.c | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/NEWS b/NEWS
index 6785a96..49a17b0 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline 
-*-

 * Noteworthy changes in release ?.? (????-??-??) [?]

+** Bug fixes
+
+  grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower
+  than in 2.16.  [bug introduced in grep-2.17]
+

 * Noteworthy changes in release 2.17 (2014-02-17) [stable]

diff --git a/src/main.c b/src/main.c
index bd20297..d63c133 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1888,6 +1888,9 @@ trivial_case_ignore (size_t len, char const *keys,
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
     return false;

+  if ( ! using_utf8 ())
+    return false;
+
   /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
      other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
      maps to [BC], which requires MB_CUR_MAX + 3 bytes.   */
-- 
1.9.0

Reply via email to