Two patches. First, introduce regex caching, discussed here[1]; on my machine, this shaved another half second off 'autoconf -f' in the latest git coreutils repository using the latest git autoconf (and even better savings with autoconf 2.61). Second, fix a regression in regexp(a,,\\) introduced here[2].
[1] http://lists.gnu.org/archive/html/m4-discuss/2007-09/msg00001.html [2] http://lists.gnu.org/archive/html/m4-discuss/2007-09/msg00003.html I will shortly apply a similar patch to head (there, the regex caching must also account for the flavor of regex it was compiled with). 2007-10-09 Eric Blake <[EMAIL PROTECTED]> Fix regexp regression of 2007-09-29. * src/builtin.c (substitute): Allow NULL regs when no subexpressions were present. (m4_regexp): Handle \ escapes even with empty regex. * doc/m4.texinfo (Regexp, Patsubst): Catch this bug. Cache regex compilation for another autoconf speedup. * src/m4.h (free_macro_sequence): Rename... (free_regex): ...to this. * src/m4.c (main): Update caller. * src/builtin.c (REGEX_CACHE_SIZE, m4_regex, regex_cache): New declarations. (compile_pattern): New function; cache recent regexes. (free_regex): Rename, and clean up additional memory. (m4_regexp, m4_patsubst): Use new function. >From 10f18604f16b6dc60803230289a02151abf9c778 Mon Sep 17 00:00:00 2001 From: Eric Blake <[EMAIL PROTECTED]> Date: Tue, 9 Oct 2007 08:31:33 -0600 Subject: [PATCH] Cache regex compilation for another autoconf speedup. * src/m4.h (free_macro_sequence): Rename... (free_regex): ...to this. * src/m4.c (main): Update caller. * src/builtin.c (REGEX_CACHE_SIZE, m4_regex, regex_cache): New declarations. (compile_pattern): New function; cache recent regexes. (free_regex): Rename, and clean up additional memory. (m4_regexp, m4_patsubst): Use new function. Signed-off-by: Eric Blake <[EMAIL PROTECTED]> --- ChangeLog | 12 +++++ src/builtin.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++----------- src/m4.c | 2 +- src/m4.h | 2 +- 4 files changed, 136 insertions(+), 31 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7c9755a..ae7e0e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2007-10-09 Eric Blake <[EMAIL PROTECTED]> + + Cache regex compilation for another autoconf speedup. + * src/m4.h (free_macro_sequence): Rename... + (free_regex): ...to this. + * src/m4.c (main): Update caller. + * src/builtin.c (REGEX_CACHE_SIZE, m4_regex, regex_cache): New + declarations. + (compile_pattern): New function; cache recent regexes. + (free_regex): Rename, and clean up additional memory. + (m4_regexp, m4_patsubst): Use new function. + 2007-10-02 Eric Blake <[EMAIL PROTECTED]> Document quoting pitfalls in capitalize. diff --git a/src/builtin.c b/src/builtin.c index 8974b1a..10dd38e 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -231,6 +231,102 @@ static struct re_registers macro_sequence_regs; /* True if --warn-macro-sequence is in effect. */ static bool macro_sequence_inuse; +/* Maybe this is worth making runtime tunable. Too small, and nothing + gets cached because the working set of active regex is larger than + the cache, and we are always swapping out entries. Too large, and + the time spent searching the cache for a match overtakes the time + saved by caching. For now, this size proved reasonable for the + typical working set of Autoconf 2.62. */ +#define REGEX_CACHE_SIZE 16 + +/* Structure for caching compiled regex. */ +struct m4_regex { + unsigned count; /* usage counter */ + size_t len; /* length of string */ + char *str; /* copy of compiled string */ + struct re_pattern_buffer *buf; /* compiled regex, allocated */ + struct re_registers regs; /* match registers, reused */ +}; +typedef struct m4_regex m4_regex; + +/* Storage for the cache of regular expressions. */ +static m4_regex regex_cache[REGEX_CACHE_SIZE]; + +/*------------------------------------------------------------------. +| Compile STR, with length LEN, into a regex. On success, set BUF | +| and REGS to the compiled regex. Compilation is cached, so do not | +| free the results here; rather, use free_regex at the end of the | +| program. Return NULL on success, or an error message. | +`------------------------------------------------------------------*/ +static const char * +compile_pattern (const char *str, size_t len, struct re_pattern_buffer **buf, + struct re_registers **regs) +{ + int i; + m4_regex *victim; + unsigned victim_count; + struct re_pattern_buffer *new_buf; + struct re_registers *new_regs; + const char *msg; + + /* First, check if STR is already cached. If so, increase its use + count and return it. */ + for (i = 0; i < REGEX_CACHE_SIZE; i++) + if (len == regex_cache[i].len && regex_cache[i].str + && memcmp (str, regex_cache[i].str, len) == 0) + { + *buf = regex_cache[i].buf; + *regs = ®ex_cache[i].regs; + regex_cache[i].count++; + return NULL; + } + + /* Next, check if STR can be compiled. */ + new_buf = xzalloc (sizeof *new_buf); + msg = re_compile_pattern (str, len, new_buf); + if (msg) + { + regfree (new_buf); + free (new_buf); + return msg; + } + + /* Now, find a victim slot. Decrease the count of all entries, then + prime the count of the victim slot at REGEX_CACHE_SIZE. This + way, frequently used entries and newly created entries are least + likely to be victims next time we have a cache miss. */ + victim = regex_cache; + victim_count = victim->count; + if (victim_count) + victim->count--; + for (i = 1; i < REGEX_CACHE_SIZE; i++) + { + if (regex_cache[i].count < victim_count) + { + victim_count = regex_cache[i].count; + victim = ®ex_cache[i]; + } + if (regex_cache[i].count) + regex_cache[i].count--; + } + victim->count = REGEX_CACHE_SIZE; + victim->len = len; + if (victim->str) + { + free (victim->str); + regfree (victim->buf); + free (victim->buf); + } + victim->str = xstrdup (str); + victim->buf = new_buf; + new_regs = &victim->regs; + re_set_registers (new_buf, new_regs, new_regs->num_regs, + new_regs->start, new_regs->end); + *buf = new_buf; + *regs = new_regs; + return NULL; +} + /*----------------------------------------. | Clean up regular expression variables. | `----------------------------------------*/ @@ -273,14 +369,21 @@ set_macro_sequence (const char *regexp) macro_sequence_inuse = true; } -/*------------------------------------------------------------. -| Free dynamic memory utilized by the define sequence regular | -| expression. | -`------------------------------------------------------------*/ +/*------------------------------------------------------. +| Free dynamic memory utilized by regular expressions. | +`------------------------------------------------------*/ void -free_macro_sequence (void) +free_regex (void) { + int i; free_pattern_buffer (¯o_sequence_buf, ¯o_sequence_regs); + for (i = 0; i < REGEX_CACHE_SIZE; i++) + if (regex_cache[i].str) + { + free (regex_cache[i].str); + free_pattern_buffer (regex_cache[i].buf, ®ex_cache[i].regs); + free (regex_cache[i].buf); + } } /*-------------------------------------------------------------------------. @@ -1965,8 +2068,8 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv) const char *regexp; /* regular expression */ const char *repl; /* replacement string */ - struct re_pattern_buffer buf; /* compiled regular expression */ - struct re_registers regs; /* for subexpression matches */ + struct re_pattern_buffer *buf;/* compiled regular expression */ + struct re_registers *regs; /* for subexpression matches */ const char *msg; /* error message from re_compile_pattern */ int startpos; /* start position of match */ int length; /* length of first argument */ @@ -1993,21 +2096,18 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv) return; } - init_pattern_buffer (&buf, ®s); - msg = re_compile_pattern (regexp, strlen (regexp), &buf); - + msg = compile_pattern (regexp, strlen (regexp), &buf, ®s); if (msg != NULL) { M4ERROR ((warning_status, 0, "bad regular expression: `%s': %s", regexp, msg)); - free_pattern_buffer (&buf, ®s); return; } length = strlen (victim); /* Avoid overhead of allocating regs if we won't use it. */ - startpos = re_search (&buf, victim, length, 0, length, - argc == 3 ? NULL : ®s); + startpos = re_search (buf, victim, length, 0, length, + argc == 3 ? NULL : regs); if (startpos == -2) M4ERROR ((warning_status, 0, @@ -2015,9 +2115,7 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv) else if (argc == 3) shipout_int (obs, startpos); else if (startpos >= 0) - substitute (obs, victim, repl, ®s); - - free_pattern_buffer (&buf, ®s); + substitute (obs, victim, repl, regs); } /*--------------------------------------------------------------------------. @@ -2034,8 +2132,8 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv) const char *regexp; /* regular expression */ const char *repl; - struct re_pattern_buffer buf; /* compiled regular expression */ - struct re_registers regs; /* for subexpression matches */ + struct re_pattern_buffer *buf;/* compiled regular expression */ + struct re_registers *regs; /* for subexpression matches */ const char *msg; /* error message from re_compile_pattern */ int matchpos; /* start position of match */ int offset; /* current match offset */ @@ -2061,14 +2159,11 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv) return; } - init_pattern_buffer (&buf, ®s); - msg = re_compile_pattern (regexp, strlen (regexp), &buf); - + msg = compile_pattern (regexp, strlen (regexp), &buf, ®s); if (msg != NULL) { M4ERROR ((warning_status, 0, "bad regular expression `%s': %s", regexp, msg)); - free (buf.buffer); return; } @@ -2078,8 +2173,8 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv) matchpos = 0; while (offset <= length) { - matchpos = re_search (&buf, victim, length, - offset, length - offset, ®s); + matchpos = re_search (buf, victim, length, + offset, length - offset, regs); if (matchpos < 0) { @@ -2102,19 +2197,17 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv) /* Handle the part of the string that was covered by the match. */ - substitute (obs, victim, repl, ®s); + substitute (obs, victim, repl, regs); /* Update the offset to the end of the match. If the regexp matched a null string, advance offset one more, to avoid infinite loops. */ - offset = regs.end[0]; - if (regs.start[0] == regs.end[0]) + offset = regs->end[0]; + if (regs->start[0] == regs->end[0]) obstack_1grow (obs, victim[offset++]); } obstack_1grow (obs, '\0'); - - free_pattern_buffer (&buf, ®s); } /* Finally, a placeholder builtin. This builtin is not installed by diff --git a/src/m4.c b/src/m4.c index 2d5ced0..daa15a8 100644 --- a/src/m4.c +++ b/src/m4.c @@ -590,6 +590,6 @@ main (int argc, char *const *argv, char *const *envp) undivert_all (); } output_exit (); - free_macro_sequence (); + free_regex (); exit (retcode); } diff --git a/src/m4.h b/src/m4.h index 3d1178f..38f9d09 100644 --- a/src/m4.h +++ b/src/m4.h @@ -416,7 +416,7 @@ struct re_registers; void builtin_init (void); void define_builtin (const char *, const builtin *, symbol_lookup); void set_macro_sequence (const char *); -void free_macro_sequence (void); +void free_regex (void); void define_user_macro (const char *, const char *, symbol_lookup); void undivert_all (void); void expand_user_macro (struct obstack *, symbol *, int, token_data **); -- 1.5.3.2 >From 4f74b635626f8628301a77b6a4926b1b0738ef2b Mon Sep 17 00:00:00 2001 From: Eric Blake <[EMAIL PROTECTED]> Date: Tue, 9 Oct 2007 10:25:37 -0600 Subject: [PATCH] Fix regexp regression of 2007-09-29. * src/builtin.c (substitute): Allow NULL regs when no subexpressions were present. (m4_regexp): Handle \ escapes even with empty regex. * doc/m4.texinfo (Regexp, Patsubst): Catch this bug. Signed-off-by: Eric Blake <[EMAIL PROTECTED]> --- ChangeLog | 6 ++++++ doc/m4.texinfo | 8 ++++---- src/builtin.c | 34 ++++++++++++++++++---------------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index ae7e0e0..42bbaeb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2007-10-09 Eric Blake <[EMAIL PROTECTED]> + Fix regexp regression of 2007-09-29. + * src/builtin.c (substitute): Allow NULL regs when no + subexpressions were present. + (m4_regexp): Handle \ escapes even with empty regex. + * doc/m4.texinfo (Regexp, Patsubst): Catch this bug. + Cache regex compilation for another autoconf speedup. * src/m4.h (free_macro_sequence): Rename... (free_regex): ...to this. diff --git a/doc/m4.texinfo b/doc/m4.texinfo index 1bc5be9..64e37ba 100644 --- a/doc/m4.texinfo +++ b/doc/m4.texinfo @@ -4703,8 +4703,8 @@ regexp(`abc') @result{}0 regexp(`abc', `') @result{}0 -regexp(`abc', `', `def') [EMAIL PROTECTED] +regexp(`abc', `', `\\def') [EMAIL PROTECTED] @end example @node Substr @@ -4951,8 +4951,8 @@ patsubst(`abc') @result{}abc patsubst(`abc', `') @result{}abc -patsubst(`abc', `', `-') [EMAIL PROTECTED] +patsubst(`abc', `', `\\-') [EMAIL PROTECTED] @end example @node Format diff --git a/src/builtin.c b/src/builtin.c index 10dd38e..75859bd 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -2009,14 +2009,15 @@ Warning: \\0 will disappear, use \\& instead in replacements")); /* Fall through. */ case '&': - obstack_grow (obs, victim + regs->start[0], - regs->end[0] - regs->start[0]); + if (regs) + obstack_grow (obs, victim + regs->start[0], + regs->end[0] - regs->start[0]); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': ch -= '0'; - if (regs->num_regs - 1 <= ch) + if (!regs || regs->num_regs - 1 <= ch) M4ERROR ((warning_status, 0, "Warning: sub-expression %d not present", ch)); else if (regs->end[ch] > 0) @@ -2054,12 +2055,12 @@ init_pattern_buffer (struct re_pattern_buffer *buf, struct re_registers *regs) } } -/*--------------------------------------------------------------------------. -| Regular expression version of index. Given two arguments, expand to the | -| index of the first match of the second argument (a regexp) in the first. | -| Expand to -1 if here is no match. Given a third argument, is changes | -| the expansion to this argument. | -`--------------------------------------------------------------------------*/ +/*------------------------------------------------------------------. +| Regular expression version of index. Given two arguments, expand | +| to the index of the first match of the second argument (a regexp) | +| in the first. Expand to -1 if there is no match. Given a third | +| argument, a match is substituted according to this argument. | +`------------------------------------------------------------------*/ static void m4_regexp (struct obstack *obs, int argc, token_data **argv) @@ -2092,7 +2093,7 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv) if (argc == 3) shipout_int (obs, 0); else - obstack_grow (obs, repl, strlen (repl)); + substitute (obs, victim, repl, NULL); return; } @@ -2118,12 +2119,13 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv) substitute (obs, victim, repl, regs); } -/*--------------------------------------------------------------------------. -| Substitute all matches of a regexp occuring in a string. Each match of | -| the second argument (a regexp) in the first argument is changed to the | -| third argument, with \& substituted by the matched text, and \N | -| substituted by the text matched by the Nth parenthesized sub-expression. | -`--------------------------------------------------------------------------*/ +/*------------------------------------------------------------------. +| Substitute all matches of a regexp occurring in a string. Each | +| match of the second argument (a regexp) in the first argument is | +| changed to the third argument, with \& substituted by the matched | +| text, and \N substituted by the text matched by the Nth | +| parenthesized sub-expression. | +`------------------------------------------------------------------*/ static void m4_patsubst (struct obstack *obs, int argc, token_data **argv) -- 1.5.3.2 _______________________________________________ M4-patches mailing list [email protected] http://lists.gnu.org/mailman/listinfo/m4-patches
