From fc956b20ebacf2f8cedad196ba38797d8ec3724e Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 9 Mar 2014 23:43:34 +0900 Subject: [PATCH] grep: optimization for fgrep with changing the macher to grep macher. fgrep macher is only use kwset engine. However, it's very slow for case-insensitive matching in multibyte locales. And so, if the matcher is fgrep and case-insensitive and keys including any alphabets, change it into grep matcher by escape of keys. OTOH, if keys include no alphabet, turn match_icase flag off. I prepare following string to measure the performance. yes $(printf '%078dm' 0)| head -1000000 | tr 0 a > in A=`printf '\xef\xbc\xa1'` # FULLWIDTH LATIN CAPITAL LETTER A I run three tests with this patch (best-of-5 trials): env LC_ALL=en_US.UTF-8 time -p src/fgrep -i "$A" in real 8.54 user 7.13 sys 1.16 Back out that commit (temporarily), recompile, and rerun the experiment: env LC_ALL=en_US.UTF-8 time -p src/fgrep -i "$A" in real 0.07 user 0.02 sys 0.05 * src/fgrep.c (Gcompile) New function. * src/main.c (check_any_alphabets) New function. (fgrep_to_grep_pattern) New function. (main) Use them. --- src/fgrep.c | 7 +++++ src/main.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/src/fgrep.c b/src/fgrep.c index a0940cc..1ceb274 100644 --- a/src/fgrep.c +++ b/src/fgrep.c @@ -1,8 +1,15 @@ #include #include "search.h" +static void +Gcompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES); +} + struct matcher const matchers[] = { { "fgrep", Fcompile, Fexecute }, + { "grep", Gcompile, EGexecute }, { NULL, NULL, NULL }, }; diff --git a/src/main.c b/src/main.c index 4f8f1ef..b80e29e 100644 --- a/src/main.c +++ b/src/main.c @@ -1876,6 +1876,79 @@ parse_grep_colors (void) ? (*(s) = wctob ((wint_t) (wc)), 1) \ : wcrtomb (s, wc, ps)) +/* Check wether a string has any alphabets. */ +static bool +check_any_alphabets (size_t len, char const *keys) +{ + while (len) + { + mbstate_t mb_state = { 0 }; + wchar_t wc; + size_t n = MBRTOWC (&wc, keys, len, &mb_state); + + if ((size_t) -2 <= n) + { + keys++; + len--; + continue; + } + + wchar_t folded[CASE_FOLDED_BUFSIZE]; + int nfolded = case_folded_counterparts (wc, folded); + if (nfolded > 0) + return true; + keys += n; + len -= n; + } + + return false; +} + +/* Change a pattern for fgrep into grep. */ +static void +fgrep_to_grep_pattern (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + *new_keys = xnmalloc (len + 1, 2); + char *p = *new_keys; + + while (len) + { + mbstate_t mb_state = { 0 }; + wchar_t wc; + size_t n = MBRTOWC (&wc, keys, len, &mb_state); + + if ((size_t) -2 <= n) + *p++ = *keys++; + else if (n == 1) + { + switch (*keys) + { + case '\\': + case '[': + case ']': + case '^': + case '$': + case '.': + case '*': + *p++ = '\\'; + default: + *p++ = *keys; + } + } + else + { + memcpy (p, keys, n); + p += n; + } + + keys += n; + len -= n; + } + + *new_len = p - *new_keys; +} + /* If the newline-separated regular expressions, KEYS (with length, LEN and no trailing NUL byte), are amenable to transformation into otherwise equivalent case-ignoring ones, perform the transformation, @@ -2350,6 +2423,28 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); + /* If the matcher is fgrep and case-insensitive and keys including any + alphabets, change it into grep matcher by escape of keys. If keys + include no alphabet, can turn off match_icase flag. */ + if (match_icase && MB_CUR_MAX > 1 && keycc + && ((!matcher && STREQ (matchers[0].name, "fgrep")) + || (matcher && STREQ (matcher, "fgrep")))) + { + if (check_any_alphabets (keycc, keys)) + { + size_t new_keycc; + char *new_keys; + fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys); + free (keys); + keys = new_keys; + keycc = new_keycc; + matcher = NULL; + setmatcher ("grep"); + } + else + match_icase = 0; + } + /* Case-insensitive matching is expensive in multibyte locales because a few characters may change size when converted to upper or lower case. To accommodate those, search the input one line -- 1.8.5.2