Under UTF-8, use a plain bitset (CSET) for a bracket expression whenever it can be represented without involving an MBCSET. Testcase: yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]
Before: 51ms (best of three runs); after: 16ms (best of three runs). * src/dfa.c (check_utf8, using_utf8): New. (parse_bracket_exp): For simple bracket expressions under UTF-8, use a CSET. (dfacomp): Call check_utf8. --- src/dfa.c | 33 ++++++++++++++++++++++++++++++++- 1 files changed, 32 insertions(+), 1 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index add6ebd..f17f550 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -84,6 +84,7 @@ /* We can handle multibyte strings. */ # include <wchar.h> # include <wctype.h> +# include <langinfo.h> #endif #include "regex.h" @@ -296,8 +297,27 @@ static wchar_t *inputwcs; /* Wide character representation of input And inputwcs[i] is the codepoint. */ static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */ static unsigned char const *buf_end; /* reference to end in dfaexec(). */ + +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +static int using_utf8; + +void +check_utf8 (void) +{ +#ifdef HAVE_LANGINFO_CODESET + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) + using_utf8 = 1; +#endif +} +#else +void +check_utf8 (void) +{ +} #endif /* MBS_SUPPORT */ + #ifdef MBS_SUPPORT /* Note that characters become unsigned here. 
*/ # define FETCH_WC(c, wc, eoferr) \ @@ -688,7 +708,14 @@ parse_bracket_exp (void) while ((wc = wc1, (c = c1) != L']')); #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) + if (MB_CUR_MAX > 1 + && (!using_utf8 + || invert + || work_mbc->nchars != 0 + || work_mbc->nch_classes != 0 + || work_mbc->nranges != 0 + || work_mbc->nequivs != 0 + || work_mbc->ncoll_elems != 0)) { static charclass zeroclass; work_mbc->invert = invert; @@ -699,6 +726,9 @@ parse_bracket_exp (void) if (invert) { +#ifdef MBS_SUPPORT + assert(MB_CUR_MAX == 1); +#endif notset(ccl); if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit(eolbyte, ccl); @@ -2916,6 +2946,7 @@ dfainit (struct dfa *d) void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { + check_utf8(); dfainit(d); dfaparse(s, len, d); dfamust(d); -- 1.6.6