On 03/16/2012 05:14 AM, Jim Meyering wrote: > A patch would be most welcome.
OK, here's a proposed patch. It assumes syncing from gnulib (to pick up the gnulib regex fix). >From 0dd8ab2b726e6f7486db25cf10ac24f4b961faa0 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Sat, 17 Mar 2012 18:04:50 -0700 Subject: [PATCH] grep: report overflow for ERE a{1000000000} * NEWS: Document this. * src/dfa.c (MIN): New macro. (lex): Lexically analyze the repeat-count operator once, not twice; the double-scan complicated the code and made it harder to understand and fix. Adjust the repeat-count parsing so that it better matches the behavior of the regex code, in three ways: 1. Diagnose too-large repeat counts rather than treating them as literal characters. 2. Use RE_INVALID_INTERVAL_ORD, not RE_NO_BK_BRACES, to decide whether to treat invalid-syntax {...}s as literals. 3. Use the same wording for {...}-related diagnostics that the regex code uses. * tests/bre.tests, tests/ere.tests, tests/repetition-overflow: Adjust to match new behavior, and add a few tests. --- NEWS | 3 + src/dfa.c | 127 +++++++++++++++++---------------------------- tests/bre.tests | 6 ++- tests/ere.tests | 2 + tests/repetition-overflow | 4 +- 5 files changed, 58 insertions(+), 84 deletions(-) diff --git a/NEWS b/NEWS index 6dad608..b219b65 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,9 @@ GNU grep NEWS -*- outline -*- name too long", and it can run much faster when dealing with large directory hierarchies. + grep -E 'a{1000000000}' now reports an overflow error rather than + silently acting like grep -E 'a\{1000000000}'. + ** New features The -R option now has a long-option alias --dereference-recursive. diff --git a/src/dfa.c b/src/dfa.c index 6eb4a11..613f548 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -861,6 +861,10 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */ #endif /* MBS_SUPPORT */ +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif + typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1328,90 +1332,53 @@ lex (void) if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) goto normal_char; - if (syntax_bits & RE_NO_BK_BRACES) - { - /* Scan ahead for a valid interval; if it's not valid, - treat it as a literal '{'. */ - int lo = -1, hi = -1; - char const *p = lexptr; - char const *lim = p + lexleft; - for (; p != lim && ISASCIIDIGIT (*p); p++) - { - if (lo < 0) - lo = *p - '0'; - else - { - lo = lo * 10 + *p - '0'; - if (RE_DUP_MAX < lo) - goto normal_char; - } - } - if (p != lim && *p == ',') - while (++p != lim && ISASCIIDIGIT (*p)) - { - if (hi < 0) - hi = *p - '0'; - else - { - hi = hi * 10 + *p - '0'; - if (RE_DUP_MAX < hi) - goto normal_char; - } - } - else - hi = lo; - if (p == lim || *p != '}' || lo < 0 || (0 <= hi && hi < lo)) - goto normal_char; - } - - minrep = 0; /* Cases: {M} - exact count {M,} - minimum count, maximum is infinity + {,N} - 0 through N + {,} - 0 to infinity (same as '*') {M,N} - M through N */ - FETCH (c, _("unfinished repeat count")); - if (ISASCIIDIGIT (c)) - { - minrep = c - '0'; - for (;;) - { - FETCH (c, _("unfinished repeat count")); - if (!ISASCIIDIGIT (c)) - break; - minrep = 10 * minrep + c - '0'; - } - } - else - dfaerror (_("malformed repeat count")); - if (c == ',') - { - FETCH (c, _("unfinished repeat count")); - if (!ISASCIIDIGIT (c)) - maxrep = -1; - else - { - maxrep = c - '0'; - for (;;) - { - FETCH (c, _("unfinished repeat count")); - if (!ISASCIIDIGIT (c)) - break; - maxrep = 10 * maxrep + c - '0'; - } - if (0 <= maxrep && maxrep < minrep) - dfaerror (_("malformed repeat count")); - } - } - else - maxrep = minrep; - if (!(syntax_bits & RE_NO_BK_BRACES)) - { - if (c != '\\') - dfaerror (_("malformed repeat count")); - FETCH (c, _("unfinished repeat count")); - } - if (c != '}') - dfaerror (_("malformed repeat count")); + { + char const *p = lexptr; + char const *lim = p + lexleft; + minrep = maxrep = -1; + for (; p != lim && ISASCIIDIGIT (*p); p++) + { + if (minrep < 0) + minrep = *p - '0'; + else + minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0'); + } + if (p != lim) + { + if (*p != ',') + maxrep = minrep; + else + { + if (minrep < 0) + minrep = 0; + while (++p != lim && ISASCIIDIGIT (*p)) + { + if (maxrep < 0) + maxrep = *p - '0'; + else + maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0'); + } + } + } + if (! ((! backslash || (p != lim && *p++ == '\\')) + && p != lim && *p++ == '}' + && 0 <= minrep && (maxrep < 0 || minrep <= maxrep))) + { + if (syntax_bits & RE_INVALID_INTERVAL_ORD) + goto normal_char; + dfaerror (_("Invalid content of \\{\\}")); + } + if (RE_DUP_MAX < maxrep) + dfaerror (_("Regular expression too big")); + lexptr = p; + lexleft = lim - p; + } laststart = 0; return lasttok = REPMN; diff --git a/tests/bre.tests b/tests/bre.tests index 60ff1b5..9d01a3c 100644 --- a/tests/bre.tests +++ b/tests/bre.tests @@ -42,8 +42,9 @@ 2@a\{1@EBRACE 2@a\{1a@EBRACE 2@a\{1a\}@BADBR -2@a\{,2\}@BADBR -2@a\{,\}@BADBR +0@a\{,2\}@a\{,2\} +0@a\{,\}@a\{,\} +2@a\{\}@BADBR 2@a\{1,x\}@BADBR 2@a\{1,x@EBRACE 2@a\{32768\}@BADBR @@ -60,3 +61,4 @@ 2@a\{1\}*@BADRPT@TO CORRECT 1@a\(b\)?c\1d@acd 0@-\{0,1\}[0-9]*$@-5 +2@b\{1000000000\}@ESIZE diff --git a/tests/ere.tests b/tests/ere.tests index 08b3dba..e0aad2a 100644 --- a/tests/ere.tests +++ b/tests/ere.tests @@ -76,6 +76,7 @@ 0@a{1a}@a{1a} 0@a{,2}@a{,2} 0@a{,}@a{,} +2@a{}@BADBR 0@a{1,*}@a{1,,,} 2@a{1,x@EBRACE@TO CORRECT 2@a{300}@BADBR@TO CORRECT @@ -213,3 +214,4 @@ 0@abcdefghijklmnopqrstuv@abcdefghijklmnopqrstuv 0@CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a@CC11 0@a?b@ab +2@b{1000000000}@ESIZE diff --git a/tests/repetition-overflow b/tests/repetition-overflow index c92de23..66a44a6 100755 --- a/tests/repetition-overflow +++ b/tests/repetition-overflow @@ -11,9 +11,9 @@ fail=0 # range of "unsigned int" would silently wrap around. Hence, 2^32+1 # would be treated just like "1", and both of these would mistakenly match. -echo abc | grep -E "b{$xp1}" > out 2>&1; test $? = 1 || fail=1 +echo abc | grep -E "b{$xp1}" > out 2> /dev/null; test $? = 2 || fail=1 compare out /dev/null || fail=1 -echo abbc | grep -E "b{1,$xp2}" > out 2>&1; test $? = 1 || fail=1 +echo abbc | grep -E "b{1,$xp2}" > out 2> /dev/null; test $? = 2 || fail=1 compare out /dev/null || fail=1 Exit $fail -- 1.7.6.5
