From fc50d711dbcd7fcf83f111e61b1c8664d4e80403 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 2 Mar 2014 17:08:36 +0900 Subject: [PATCH 1/2] grep: optimization of bracket expression for non-UTF8 locales If MBCSET is non-inverted and doesn't include neither character classes including multibyte characters, range expressions, equivalence classes nor collating elements, replace it to a simple CSET. * src/dfa.c (addtok): Apply replacement of MBCSET to simple CSET for not only UTF8 locale but non-UTF8 locales. --- src/dfa.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 80bb807..585a599 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1598,10 +1598,11 @@ addtok (token t) work_mbc->nchars = 0; } - /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */ + /* If the MBCSET is non-inverted and doesn't include neither + character classes including multibyte characters, range + expressions, equivalence classes nor collating elements, + it can be replaced to a simple CSET. */ if (work_mbc->invert - || (!using_utf8 () && work_mbc->cset != -1) - || work_mbc->nchars != 0 || work_mbc->nch_classes != 0 || work_mbc->nranges != 0 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0) @@ -1616,7 +1617,6 @@ addtok (token t) that the mbcset is empty now. Do nothing in that case. */ if (work_mbc->cset != -1) { - assert (using_utf8 ()); addtok (CSET + work_mbc->cset); if (need_or) addtok (OR); -- 1.8.5.2 From 9f294713850fc0bdf748b160f2fa27995c1ff5f1 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sun, 2 Mar 2014 17:19:47 +0900 Subject: [PATCH 2/2] grep: revert removal of trivial_case_ignore. Revive trivial_case_ignore function in order to be able to use kwset. * src/main.c (trivial_case_ignore): New function. --- src/main.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/main.c b/src/main.c index 7c04d77..2ee585a 100644 --- a/src/main.c +++ b/src/main.c @@ -1867,6 +1867,97 @@ parse_grep_colors (void) return; } +/* If the newline-separated regular expressions, KEYS (with length, LEN + and no trailing NUL byte), are amenable to transformation into + otherwise equivalent case-ignoring ones, perform the transformation, + put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN, + and return true. Otherwise, return false. */ +#define MBRTOWC(pwc, s, n, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \ + mbrtowc ((pwc), (s), (n), (ps))) +#define WCRTOMB(s, wc, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(s) = wctob ((wint_t) (wc)), 1) : \ + wcrtomb ((s), (wc), (ps))) + +static bool +trivial_case_ignore (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + /* FIXME: consider removing the following restriction: + Reject if KEYS contain ASCII '\\' or '['. */ + if (memchr (keys, '\\', len) || memchr (keys, '[', len)) + return false; + + /* Worst case is that each byte B of KEYS is ASCII alphabetic and each + other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B + maps to [BC], which requires MB_CUR_MAX + 3 bytes. */ + *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1); + char *p = *new_keys; + + mbstate_t mb_state; + memset (&mb_state, 0, sizeof mb_state); + while (len) + { + wchar_t wc; + int n = MBRTOWC (&wc, keys, len, &mb_state); + + /* For an invalid, incomplete or L'\0', skip this optimization. */ + if (n <= 0) + { + skip_case_ignore_optimization: + free (*new_keys); + return false; + } + + char const *orig = keys; + keys += n; + len -= n; + + if (!iswalpha (wc)) + { + memcpy (p, orig, n); + p += n; + } + else + { + *p++ = '['; + memcpy (p, orig, n); + p += n; + + wint_t folded = towlower (wc); + if (folded != wc) + { + char buf[MB_CUR_MAX]; + int n2 = WCRTOMB (buf, folded, &mb_state); + if (n2 <= 0) + goto skip_case_ignore_optimization; + assert (n2 <= MB_CUR_MAX); + memcpy (p, buf, n2); + p += n2; + } + folded = towupper (wc); + if (folded != wc) + { + char buf[MB_CUR_MAX]; + int n2 = WCRTOMB (buf, folded, &mb_state); + if (n2 <= 0) + goto skip_case_ignore_optimization; + assert (n2 <= MB_CUR_MAX); + memcpy (p, buf, n2); + p += n2; + } + + *p++ = ']'; + } + } + + *new_len = p - *new_keys; + + return true; +} + int main (int argc, char **argv) { @@ -2261,6 +2352,35 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); + /* As currently implemented, case-insensitive matching is expensive in + multi-byte locales because of a few outlier locales in which some + characters change size when converted to upper or lower case. To + accommodate those, we revert to searching the input one line at a + time, rather than using the much more efficient buffer search. + However, if we have a regular expression, /foo/i, we can convert + it to an equivalent case-insensitive /[fF][oO][oO]/, and thus + avoid the expensive read-and-process-a-line-at-a-time requirement. + Optimize-away the "-i" option, when possible, converting each + candidate alpha, C, in the regexp to [Cc]. */ + if (match_icase) + { + size_t new_keycc; + char *new_keys; + /* It is not possible with -F, not useful with -P (pcre) and there is no + point when there is no regexp. It also depends on which constructs + appear in the regexp. See trivial_case_ignore for those details. */ + if (keycc + && ! (matcher + && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre"))) + && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys)) + { + match_icase = 0; + free (keys); + keys = new_keys; + keycc = new_keycc; + } + } + #if MBS_SUPPORT if (MB_CUR_MAX > 1) build_mbclen_cache (); -- 1.8.5.2