From 16537826e829941383c61b7765179d12b0d281eb Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 8 Mar 2014 16:05:19 +0900 Subject: [PATCH] grep: optimization with the superset of DFA. By the patch, DFA may be also build the superset of itself, which is the same as the itself expect ANYCHAR, MBCSET and BACKREF are replaced CSET set full bits followed by STAR, and mb_cur_max is equal to 1. For example, if given the pattern `a\(b\)c\1', the tokens of original DFA and its superset is below. original: a b CAT c CAT BACKREF CAT superset: a b CAT c CAT CSET STAR CAT (Full bits are set to CSET.) If a string doesn't matches the superset of original DFA for a pattern, the string will also never match original DFA. By the way, matching with the superset is very fast because it never has ANYCHAR, MBCSET and BACKREF, which are very expensive, and its mb_cur_max is always equal to 1. Therefore, the perfomance for matching with DFA may be dramatically improved without overhead by using the superset. I prepare following string to measure the performance. yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k I run three tests with this patch (best-of-5 trials): env LC_ALL=ja_JP.eucJP time -p src/grep -i foobar k real 1.77 user 1.23 sys 0.47 env LC_ALL=en_US.UTF-8 time -p src/grep -i 'j[a-c]d' k real 1.86 user 1.35 sys 0.45 env LC_ALL=C time -p src/grep '\(j\)\1d' k real 1.92 user 1.40 sys 0.48 Back out that commit (temporarily), recompile, and rerun the experiment: env LC_ALL=ja_JP.eucJP time -p src/grep -i foobar k real 27.21 user 21.15 sys 5.36 env LC_ALL=en_US.UTF-8 time -p src/grep -i 'j[a-c]d' k real 96.35 user 429.80 sys 57.37 env LC_ALL=C time -p src/grep '\(j\)\1d' k real 502.32 user 429.80 sys 57.37 * src/dfa.c (struct dfa) New member `superset'. (dfahint): New function. (dfaoptimize): If succeed in optimization for UTF-8 locale, don't use the superset. (dfasuperset): New function. (dfacomp): Add call of dfasuperset and dfaanalyze functions. (dfafree): Run free only when the value of the pointer isn't NULL. (dfaalloc): Set NULL to member `superset' of DFA clearly. * src/dfa.h: Define prototype for dfahint function. * dfasearch.c: (EGexecute): Use dfahint. --- src/dfa.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++------ src/dfa.h | 3 ++ src/dfasearch.c | 7 +++- 3 files changed, 121 insertions(+), 14 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 5910268..772e350 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -377,6 +377,9 @@ struct dfa size_t nmbcsets; size_t mbcsets_alloc; + /* Fields filled by the superset. */ + struct dfa *superset; /* Hint of the dfa. */ + /* Fields filled by the state builder. */ dfa_state *states; /* States of the dfa. */ state_num sindex; /* Index for adding new states. */ @@ -3516,6 +3519,14 @@ dfaexec (struct dfa *d, char const *begin, char *end, } } +char const * +dfahint (struct dfa *d, char const *begin, char *end, int allow_nl) +{ + if (d->superset == NULL) + return begin; + return dfaexec (d->superset, begin, end, allow_nl, NULL, NULL); +} + static void free_mbdata (struct dfa *d) { @@ -3596,6 +3607,75 @@ dfaoptimize (struct dfa *d) free_mbdata (d); d->mb_cur_max = 1; + + if (d->superset != NULL) + { + dfafree (d->superset); + d->superset = NULL; + } +} + +static void +dfasuperset (struct dfa *d) +{ + size_t i, j; + charclass ccl; + bool have_achar = false; + bool have_nchar = false; + + d->superset = xmalloc (sizeof (struct dfa)); + dfa = d->superset; + + *d->superset = *d; + MALLOC (d->superset->charclasses, d->superset->calloc); + memcpy (d->superset->charclasses, d->charclasses, sizeof (charclass) * d->cindex); + d->superset->tokens = NULL; + d->superset->multibyte_prop = NULL; + d->superset->mbcsets = NULL; + d->superset->superset = NULL; + d->superset->states = NULL; + d->superset->follows = NULL; + d->superset->trans = NULL; + d->superset->realtrans = NULL; + d->superset->fails = NULL; + d->superset->success = NULL; + d->superset->newlines = NULL; + d->superset->musts = NULL; + + d->superset->talloc = d->tindex * 2; + MALLOC (d->superset->tokens, d->superset->talloc); + + for (i = j = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case ANYCHAR: + case MBCSET: + case BACKREF: + zeroset (ccl); + notset (ccl); + d->superset->tokens[j++] = CSET + charclass_index (ccl); + if (d->tokens[++i] != STAR && d->tokens[i] != QMARK + && d->tokens[i] != PLUS) + d->superset->tokens[j++] = STAR; + have_achar = true; + break; + default: + d->superset->tokens[j++] = d->tokens[i]; + have_nchar = true; + break; + } + } + + if (d->mb_cur_max == 1 && (!have_achar || !have_nchar)) + { + dfafree (d->superset); + d->superset = NULL; + return; + } + + d->superset->tindex = j; + d->superset->mb_cur_max = 1; } /* Parse and analyze a single string of the given length. */ @@ -3605,8 +3685,11 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfainit (d); dfaparse (s, len, d); dfamust (d); + dfasuperset (d); dfaoptimize (d); dfaanalyze (d, searchflag); + if (d->superset != NULL) + dfaanalyze (d->superset, searchflag); } /* Free the storage held by the components of a dfa. */ @@ -3622,31 +3705,47 @@ dfafree (struct dfa *d) if (d->mb_cur_max > 1) free_mbdata (d); - for (i = 0; i < d->sindex; ++i) + if (d->states != NULL) + { + for (i = 0; i < d->sindex; ++i) + { + free (d->states[i].elems.elems); + if (MBS_SUPPORT) + free (d->states[i].mbps.elems); + } + free (d->states); + } + if (d->follows != NULL) { - free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + for (i = 0; i < d->tindex; ++i) + free (d->follows[i].elems); + free (d->follows); } - free (d->states); - for (i = 0; i < d->tindex; ++i) - free (d->follows[i].elems); - free (d->follows); - for (i = 0; i < d->tralloc; ++i) + if (d->trans != NULL) { - free (d->trans[i]); - free (d->fails[i]); + for (i = 0; i < d->tralloc; ++i) + free (d->trans[i]); } + if (d->fails != NULL) + { + for (i = 0; i < d->tralloc; ++i) + free (d->fails[i]); + } + free (d->realtrans); free (d->fails); free (d->newlines); free (d->success); + for (dm = d->musts; dm; dm = ndm) { ndm = dm->next; free (dm->must); free (dm); } + + if (d->superset != NULL) + dfafree (d->superset); } /* Having found the postfix representation of the regular expression, @@ -4154,7 +4253,9 @@ done: struct dfa * dfaalloc (void) { - return xmalloc (sizeof (struct dfa)); + struct dfa *d = xmalloc (sizeof (struct dfa)); + d->superset = NULL; + return d; } struct dfamust *_GL_ATTRIBUTE_PURE diff --git a/src/dfa.h b/src/dfa.h index ad2b854..778ef21 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -67,6 +67,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); to decide whether to fall back on a backtracking matcher. */ extern char *dfaexec (struct dfa *d, char const *begin, char *end, int newline, size_t *count, int *backref); +extern char const *dfahint (struct dfa *d, char const *begin, char *end, + int newline); + /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); diff --git a/src/dfasearch.c b/src/dfasearch.c index 0b56960..874dca7 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -248,14 +248,15 @@ EGexecute (char const *buf, size_t size, size_t *match_size, kwsm.size[0])) goto success; } + if (dfahint (dfa, beg, (char *) end, 0) == NULL) + continue; if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) continue; } else { /* No good fixed strings; start with DFA. */ - char const *next_beg = dfaexec (dfa, beg, (char *) buflim, - 0, NULL, &backref); + char const *next_beg = dfahint (dfa, beg, (char *) buflim, 0); /* If there's no match, or if we've matched the sentinel, we're done. */ if (next_beg == NULL || next_beg == buflim) @@ -268,6 +269,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size, end = buflim; while (beg > buf && beg[-1] != eol) --beg; + if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) + continue; } /* Successful, no backreferences encountered! */ if (!backref) -- 1.8.5.2