From 9cf6688bce1e3aeb3ba61439dac41dd3dcc9b407 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 1 Mar 2014 18:45:04 +0900 Subject: [PATCH 1/2] no longer use CSET for non-UTF8 locales in DFA engine --- src/dfa.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 80bb807..eed1020 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1045,7 +1045,8 @@ parse_bracket_exp (void) if (!pred) dfaerror (_("invalid character class")); - if (MB_CUR_MAX > 1 && !pred->single_byte_only) + if (MB_CUR_MAX > 1 + && (!using_utf8 () || !pred->single_byte_only)) { /* Store the character class as wctype_t. */ wctype_t wt = wctype (class); @@ -1152,21 +1153,21 @@ parse_bracket_exp (void) if (case_fold) { wint_t folded = towlower (wc); - if (folded != wc && !setbit_wc (folded, ccl)) + if (folded != wc && (!using_utf8 () || !setbit_wc (folded, ccl))) { REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = folded; } folded = towupper (wc); - if (folded != wc && !setbit_wc (folded, ccl)) + if (folded != wc && (!using_utf8 () || !setbit_wc (folded, ccl))) { REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = folded; } } - if (!setbit_wc (wc, ccl)) + if (!using_utf8 () || !setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, work_mbc->nchars + 1); @@ -1600,7 +1601,6 @@ addtok (token t) /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */ if (work_mbc->invert - || (!using_utf8 () && work_mbc->cset != -1) || work_mbc->nchars != 0 || work_mbc->nch_classes != 0 || work_mbc->nranges != 0 -- 1.8.5.2 From d9f9a5aedad9e81cf56b6c4e375e5f395e4e213a Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 1 Mar 2014 18:45:18 +0900 Subject: [PATCH 2/2] revert: remove trivial_case_ignore --- src/main.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/main.c b/src/main.c index 7c04d77..090fc4f 100644 --- a/src/main.c +++ b/src/main.c @@ -1867,6 +1867,83 @@ parse_grep_colors (void) return; } +/* If the newline-separated regular expressions, KEYS (with length, LEN + and no trailing NUL byte), are amenable to transformation into + otherwise equivalent case-ignoring ones, perform the transformation, + put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN, + and return true. Otherwise, return false. */ +#define MBRTOWC(pwc, s, n, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \ + mbrtowc ((pwc), (s), (n), (ps))) +#define WCRTOMB(s, wc, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(s) = wctob ((wint_t) (wc)), 1) : \ + wcrtomb ((s), (wc), (ps))) + +static bool +trivial_case_ignore (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + /* FIXME: consider removing the following restriction: + Reject if KEYS contain ASCII '\\' or '['. */ + if (memchr (keys, '\\', len) || memchr (keys, '[', len)) + return false; + + /* Worst case is that each byte B of KEYS is ASCII alphabetic and each + other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B + maps to [BC], which requires MB_CUR_MAX + 3 bytes. */ + *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1); + char *p = *new_keys; + + mbstate_t mb_state; + memset (&mb_state, 0, sizeof mb_state); + while (len) + { + wchar_t wc; + int n = MBRTOWC (&wc, keys, len, &mb_state); + + /* For an invalid, incomplete or L'\0', skip this optimization. */ + if (n <= 0) + { + skip_case_ignore_optimization: + free (*new_keys); + return false; + } + + char const *orig = keys; + keys += n; + len -= n; + + if (!iswalpha (wc)) + { + memcpy (p, orig, n); + p += n; + } + else + { + *p++ = '['; + memcpy (p, orig, n); + p += n; + + wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc); + char buf[MB_CUR_MAX]; + int n2 = WCRTOMB (buf, wc2, &mb_state); + if (n2 <= 0) + goto skip_case_ignore_optimization; + assert (n2 <= MB_CUR_MAX); + memcpy (p, buf, n2); + p += n2; + + *p++ = ']'; + } + } + + *new_len = p - *new_keys; + + return true; +} + int main (int argc, char **argv) { @@ -2261,6 +2338,35 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); + /* As currently implemented, case-insensitive matching is expensive in + multi-byte locales because of a few outlier locales in which some + characters change size when converted to upper or lower case. To + accommodate those, we revert to searching the input one line at a + time, rather than using the much more efficient buffer search. + However, if we have a regular expression, /foo/i, we can convert + it to an equivalent case-insensitive /[fF][oO][oO]/, and thus + avoid the expensive read-and-process-a-line-at-a-time requirement. + Optimize-away the "-i" option, when possible, converting each + candidate alpha, C, in the regexp to [Cc]. */ + if (match_icase) + { + size_t new_keycc; + char *new_keys; + /* It is not possible with -F, not useful with -P (pcre) and there is no + point when there is no regexp. It also depends on which constructs + appear in the regexp. See trivial_case_ignore for those details. */ + if (keycc + && ! (matcher + && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre"))) + && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys)) + { + match_icase = 0; + free (keys); + keys = new_keys; + keycc = new_keycc; + } + } + #if MBS_SUPPORT if (MB_CUR_MAX > 1) build_mbclen_cache (); -- 1.8.5.2