On Thu, Feb 20, 2014 at 12:42 PM, Jim Meyering <[email protected]> wrote: > Revised commits attached.
One more revision. This is a big enough deal (and subtle enough) that I thought I really should add a test for it. Did that, so now it's 3 commits: I'm going to push these in an hour or so, then see if I have time to make the release this evening.
From 11ce80861109361570cbedda6a966264367f7c76 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Wed, 19 Feb 2014 19:22:24 -0800 Subject: [PATCH 1/3] maint: give dfa.c's using_utf8 function external scope * src/dfa.c (using_utf8): Remove "static inline". * src/dfa.h (using_utf8): Declare it. * src/searchutils.c (is_mb_middle): Use using_utf8 rather than rolling our own. --- src/dfa.c | 2 +- src/dfa.h | 2 ++ src/searchutils.c | 9 ++------- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 9266f6f..8906ed3 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c) /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ -static inline int +int using_utf8 (void) { static int utf8 = -1; diff --git a/src/dfa.h b/src/dfa.h index bacd489..7e0674f 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -99,3 +99,5 @@ extern void dfawarn (const char *); takes a single argument, a NUL-terminated string describing the error. The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); + +extern int using_utf8 (void); diff --git a/src/searchutils.c b/src/searchutils.c index 3478417..7363701 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,6 +19,7 @@ #include <config.h> #include <assert.h> #include "search.h" +#include "dfa.h" #if HAVE_LANGINFO_CODESET # include <langinfo.h> #endif @@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; -#if HAVE_LANGINFO_CODESET - static int is_utf8 = -1; - - if (is_utf8 == -1) - is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); - if (is_utf8 && buf - p > MB_CUR_MAX) + if (using_utf8 () && buf - p > MB_CUR_MAX) { for (p = buf; buf - p > MB_CUR_MAX; p--) if (mbclen_cache[to_uchar (*p)] != (size_t) -1) @@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const char *end, if (buf - p == MB_CUR_MAX) p = buf; } -#endif memset (&cur_state, 0, sizeof cur_state); -- 1.9.0 From c7c8bcdefe7be5f59a242eea63df7f64eacb6a09 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Wed, 19 Feb 2014 19:31:43 -0800 Subject: [PATCH 2/3] grep -i: avoid a performance regression in multibyte non-UTF8 locales * src/main.c: Include dfa.h. (trivial_case_ignore): Perform this optimization only for UTF8 locales. This rectifies a 100-200x performance regression in non-UTF8 multi-byte locales like ja_JP.eucJP. The regression was introduced by the 10x UTF8/grep-i speedup, commit v2.16-4-g97318f5. * NEWS (Bug fixes): Mention it. Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50 --- NEWS | 3 +++ src/main.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/NEWS b/NEWS index 6cbbc46..df5632b 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,9 @@ GNU grep NEWS -*- outline -*- grep no longer mishandles patterns like [^^-~] in unibyte locales. [bug introduced in grep-2.8] + grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower + than in 2.16. [bug introduced in grep-2.17] + * Noteworthy changes in release 2.17 (2014-02-17) [stable] diff --git a/src/main.c b/src/main.c index bd20297..56ec6b3 100644 --- a/src/main.c +++ b/src/main.c @@ -34,6 +34,7 @@ #include "c-ctype.h" #include "closeout.h" #include "colorize.h" +#include "dfa.h" #include "error.h" #include "exclude.h" #include "exitfail.h" @@ -1883,6 +1884,11 @@ static bool trivial_case_ignore (size_t len, char const *keys, size_t *new_len, char **new_keys) { + /* Perform this translation only for UTF-8. Otherwise, this would induce + a 100-200x performance penalty for non-UTF8 multibyte locales. */ + if ( ! using_utf8 ()) + return false; + /* FIXME: consider removing the following restriction: Reject if KEYS contain ASCII '\\' or '['. */ if (memchr (keys, '\\', len) || memchr (keys, '[', len)) -- 1.9.0 From d5bfcc2daa123fa0e8660909052f7ca2ec6b7649 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Thu, 20 Feb 2014 16:06:13 -0800 Subject: [PATCH 3/3] tests: test for the non-UTF8 multi-byte performance regression Test for the just-fixed performance regression. With a 100-200x differential, it is reasonable to expect that a very slow system will be able to complete the designated task in a few seconds, while with the bug, even a very fast system would exceed the timeout. * tests/mb-non-UTF8-performance: New file. * tests/Makefile.am (TESTS): Add it. * tests/init.cfg (require_JP_EUC_locale_): New function. --- tests/Makefile.am | 1 + tests/init.cfg | 16 ++++++++++++++++ tests/mb-non-UTF8-performance | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100755 tests/mb-non-UTF8-performance diff --git a/tests/Makefile.am b/tests/Makefile.am index 331467a..4ffea85 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -72,6 +72,7 @@ TESTS = \ khadafy \ long-line-vs-2GiB-read \ max-count-vs-context \ + mb-non-UTF8-performance \ multibyte-white-space \ empty-line-mb \ unibyte-bracket-expr \ diff --git a/tests/init.cfg b/tests/init.cfg index ee5d537..5555e2d 100644 --- a/tests/init.cfg +++ b/tests/init.cfg @@ -103,6 +103,22 @@ require_unibyte_locale() skip_ 'no unibyte locale found' } +require_JP_EUC_locale_() +{ + local locale=ja_JP.eucJP + path_prepend_ . + case $(get-mb-cur-max $locale) in + [23]) + LC_ALL=$locale && + export LC_ALL && + return + ;; + *) ;; + esac + + skip_ "$loc locale not found" +} + expensive_() { if test "$RUN_EXPENSIVE_TESTS" != yes; then diff --git a/tests/mb-non-UTF8-performance b/tests/mb-non-UTF8-performance new file mode 100755 index 0000000..282f0c4 --- /dev/null +++ b/tests/mb-non-UTF8-performance @@ -0,0 +1,32 @@ +#!/bin/sh +# grep-2.17 would take nearly 200x longer to run the command below. + +# Copyright 2014 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_timeout_ + +fail=0 + +require_JP_EUC_locale_ + +yes $(printf '%078d' 0) | head -50000 > in || framework_failure_ + +# Expect no match, i.e., exit status of 1. Anything else is an error. +timeout 4 grep -i foobar in; st=$? +test $st = 1 || fail=1 + +Exit $fail -- 1.9.0
