From 6b752a604ec85d27d5551384ae6153a924189ec4 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 20 Feb 2014 22:10:00 +0900 Subject: [PATCH] Use DFA engine on fgrep matcher. --- src/fgrep.c | 7 ++++ src/main.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/src/fgrep.c b/src/fgrep.c index a0940cc..1ceb274 100644 --- a/src/fgrep.c +++ b/src/fgrep.c @@ -1,8 +1,15 @@ #include #include "search.h" +static void +Gcompile (char const *pattern, size_t size) +{ + GEAcompile (pattern, size, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES); +} + struct matcher const matchers[] = { { "fgrep", Fcompile, Fexecute }, + { "grep", Gcompile, EGexecute }, { NULL, NULL, NULL }, }; diff --git a/src/main.c b/src/main.c index bd20297..efa8e2a 100644 --- a/src/main.c +++ b/src/main.c @@ -1874,6 +1874,83 @@ parse_grep_colors (void) (*(s) = wctob ((wint_t) (wc)), 1) : \ wcrtomb ((s), (wc), (ps))) +/* Check wether any alphabets are included in string */ +static bool +check_any_alphabets (size_t len, char const *keys) +{ + mbstate_t mb_state; + memset (&mb_state, 0, sizeof mb_state); + while (len) + { + wchar_t wc, wc2; + int n = MBRTOWC (&wc, keys, len, &mb_state); + + if (n <= 0) + return true; + + wc2 = iswalpha (wc) ? (iswupper (wc) ? towlower (wc) : towupper (wc)) : wc; + + if (wc2 != wc) + return true; + + keys += n; + len -= n; + } + + return false; +} + +static bool +fgrep_to_grep (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + *new_keys = xmalloc (2 * len + 1); + char *p = *new_keys; + + mbstate_t mb_state; + memset (&mb_state, 0, sizeof mb_state); + while (len) + { + wchar_t wc; + int n = MBRTOWC (&wc, keys, len, &mb_state); + + if (n <= 0) + { + free (*new_keys); + return false; + } + + if (n == 1) + { + switch (*keys) + { + case '\\': + case '[': + case ']': + case '^': + case '$': + case '.': + case '*': + *p++ = '\\'; + default: + *p++ = *keys; + } + } + else + { + memcpy (p, keys, n); + p += n; + } + + keys += n; + len -= n; + } + + *new_len = p - *new_keys; + + return true; +} + /* If the newline-separated regular expressions, KEYS (with length, LEN and no trailing NUL byte), are amenable to transformation into otherwise equivalent case-ignoring ones, perform the transformation, @@ -1898,7 +1975,7 @@ trivial_case_ignore (size_t len, char const *keys, memset (&mb_state, 0, sizeof mb_state); while (len) { - wchar_t wc; + wchar_t wc, wc2; int n = MBRTOWC (&wc, keys, len, &mb_state); /* For an invalid, incomplete or L'\0', skip this optimization. */ @@ -1913,7 +1990,9 @@ trivial_case_ignore (size_t len, char const *keys, keys += n; len -= n; - if (!iswalpha (wc)) + wc2 = iswalpha (wc) ? (iswupper (wc) ? towlower (wc) : towupper (wc)) : wc; + + if (wc2 == wc) { memcpy (p, orig, n); p += n; @@ -1924,7 +2003,6 @@ trivial_case_ignore (size_t len, char const *keys, memcpy (p, orig, n); p += n; - wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc); char buf[MB_CUR_MAX]; int n2 = WCRTOMB (buf, wc2, &mb_state); if (n2 <= 0) @@ -2336,6 +2414,29 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); + /* If Matcher is fgrep and case-insensitive and keys include any alphabets, + escape keys and change matcher into "grep" to use DFA. */ + if (match_icase && MB_CUR_MAX > 1 && keycc + && ((!matcher && STREQ (matchers[0].name, "fgrep")) + || (matcher && STREQ (matcher, "fgrep")))) + { + if (check_any_alphabets (keycc, keys)) + { + size_t new_keycc; + char *new_keys; + if (fgrep_to_grep (keycc, keys, &new_keycc, &new_keys)) + { + matcher = NULL; + setmatcher ("grep"); + free (keys); + keys = new_keys; + keycc = new_keycc; + } + } + else + match_icase = 0; + } + /* As currently implemented, case-insensitive matching is expensive in multi-byte locales because of a few outlier locales in which some characters change size when converted to upper or lower case. To -- 1.8.5.2