Hi, I propose three patches intended to be applied to Gawk for dfa.
First, it counts reset of dfa states. If the dfa state is reset many times, an enormous number of dfa states are considered necessary for the pattern, and it causes slowdown. Especially Gawk tends to be specified with complicated patterns as a case exemplified in http://lists.gnu.org/archive/html/bug-gawk/2017-07/msg00002.html. I intend dfa is invalidated dynamically for such pattern. Second, take dfacopysyntax() which only exists dfa in Gawk into dfa in gnulib. Third, suppress warning by compiler. Thanks, Norihiro
From 09a76ca1ee331a566cb1097f4b3dd2b8c4b13639 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Sun, 4 Nov 2018 15:10:51 +0900 Subject: [PATCH 1/3] dfa: count the number of states reset Count the number of states reset. It may be used to determine whether dfa is enough fast or not. * lib/dfa.c (struct dfa): New member 'reset_count'. (dfaexec_main): If dfa states are reset, count up it. (dfaresetcount): New function. * lib/dfa.h (dfaresetcount): Add prototype. --- lib/dfa.c | 10 ++++++++++ 1 files changed, 10 insertions(+), 0 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index ad2716b..4b5c9bb 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -559,6 +559,8 @@ struct dfa ANYCHAR. */ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + size_t reset_count; /* Number of times which the states was reset + in dfaexec. */ /* Information derived from the locale. This is at the end so that a quick memset need not clear it specially. */ @@ -3379,6 +3381,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, d->states[s].mb_trindex = -1; d->mb_trcount = 0; } + + d->reset_count++; } if (!d->tralloc) @@ -3574,6 +3578,12 @@ dfaisfast (struct dfa const *d) return d->fast; } +size_t +dfaresetcount (struct dfa const *d) +{ + return d->reset_count; +} + static void free_mbdata (struct dfa *d) { -- 1.7.1
From 26432ac18342662bb9c793d1a64fa40e00c4b909 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Sat, 10 Nov 2018 08:25:28 +0900 Subject: [PATCH 2/3] dfa: clearly copy the members of struct DFA When superset of a dfa is build, all members of original DFA are copied, and values which are different from original are changed only. This change clearfies members should be copied. * lib/dfa.c (maybe_disable_superset_dfa): Do it. (dfasyntax): No longer initialize here. (dfacopysyntax): Copy syntax from struct DFA to another. (dfassbuild): Clearly copy the members of struct DFA (dfaalloc): Initialize generated struct. * lib/dfa.h (dfacopysyntax): Add prototype. --- lib/dfa.c | 59 ++++++++++++++++++++++++++++++++++++++--------------------- lib/dfa.h | 5 +++++ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index 4b5c9bb..a761a26 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -886,6 +886,23 @@ char_context (struct dfa const *dfa, unsigned char c) return CTX_NONE; } +/* Copy the syntax settings from one dfa instance to another. + Saves considerable computation time if compiling many regular expressions + based on the same setting. */ +void +dfacopysyntax (struct dfa *to, const struct dfa *from) +{ + to->dfaexec = from->dfaexec; + to->simple_locale = from->simple_locale; + to->localeinfo = from->localeinfo; + + to->fast = from->fast; + + to->canychar = from->canychar; + to->lex.cur_mb_len = from->lex.cur_mb_len; + to->syntax = from->syntax; +} + /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, this may happen when folding case in weird Turkish locales where @@ -3669,31 +3686,32 @@ dfassbuild (struct dfa *d) { struct dfa *sup = dfaalloc (); - *sup = *d; + dfacopysyntax (sup, d); + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; - sup->multibyte_prop = NULL; - sup->superset = NULL; - sup->states = NULL; - sup->sindex = 0; - sup->constraints = NULL; - sup->separates = NULL; - sup->follows = NULL; - sup->tralloc = 0; - sup->trans = NULL; - sup->fails = NULL; - sup->success = NULL; - sup->newlines = NULL; - - sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); - if (d->cindex) + + sup->depth = d->depth; + sup->nleaves = d->nleaves; + sup->nregexps = d->nregexps; + + if (d->cindex > 0) { - memcpy (sup->charclasses, d->charclasses, - d->cindex * sizeof *sup->charclasses); + sup->cindex = d->cindex; + sup->calloc = d->cindex; + + sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses); + + memcpy (sup->charclasses, d->charclasses, sup->cindex * sizeof *sup->charclasses); + + for (size_t i = 0; i < 5; i++) + sup->utf8_anychar_classes[i] = d->utf8_anychar_classes[i]; + + sup->canychar = d->canychar; } - sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); sup->talloc = d->tindex * 2; + sup->tokens = xnmalloc (sup->talloc, sizeof *sup->tokens); bool have_achar = false; bool have_nchar = false; @@ -4301,7 +4319,7 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - return xmalloc (sizeof (struct dfa)); + return xzalloc (sizeof (struct dfa)); } /* Initialize DFA. */ @@ -4309,7 +4327,6 @@ void dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, reg_syntax_t bits, int dfaopts) { - memset (dfa, 0, offsetof (struct dfa, dfaexec)); dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; dfa->simple_locale = using_simple_locale (linfo->multibyte); dfa->localeinfo = *linfo; diff --git a/lib/dfa.h b/lib/dfa.h index fea815d..cb9d8dc 100644 --- a/lib/dfa.h +++ b/lib/dfa.h @@ -100,6 +100,11 @@ extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE; /* The DFA is likely to be fast. */ extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE; +/* Copy the syntax settings from one dfa instance to another. + Saves considerable computation time if compiling many regular expressions + based on the same setting. */ +extern void dfacopysyntax (struct dfa *to, const struct dfa *from); + /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); -- 1.7.1
From a86e5011ea9304be5a930c38b29ec05eef191d2b Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <[email protected]> Date: Sat, 10 Nov 2018 10:29:47 +0900 Subject: [PATCH 3/3] dfa: removal of unused function * lib/dfa.c (charclass_context): Remove it. --- lib/dfa.c | 21 --------------------- 1 files changed, 0 insertions(+), 21 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index a761a26..f5b9802 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -2319,27 +2319,6 @@ epsclosure (struct dfa const *d) free (tmp.elems); } -/* Returns the set of contexts for which there is at least one - character included in C. */ - -static int -charclass_context (struct dfa const *dfa, charclass const *c) -{ - int context = 0; - - for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j) - { - if (c->w[j] & dfa->syntax.newline.w[j]) - context |= CTX_NEWLINE; - if (c->w[j] & dfa->syntax.letters.w[j]) - context |= CTX_LETTER; - if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j])) - context |= CTX_NONE; - } - - return context; -} - /* Returns the contexts on which the position set S depends. Each context in the set of returned contexts (let's call it SC) may have a different follow set than other contexts in SC, and also different from the -- 1.7.1
