Re: [PATCH] tests: exercise two recently-fixed bugs

Paul Eggert Sat, 17 Mar 2012 18:06:45 -0700

On 03/16/2012 05:14 AM, Jim Meyering wrote:
> A patch would be most welcome.


OK, here's a proposed patch.  It assumes syncing from gnulib
(to pick up the gnulib regex fix).

>From 0dd8ab2b726e6f7486db25cf10ac24f4b961faa0 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 17 Mar 2012 18:04:50 -0700
Subject: [PATCH] grep: report overflow for ERE a{1000000000}

* NEWS: Document this.
* src/dfa.c (MIN): New macro.
(lex): Lexically analyze the repeat-count operator once, not
twice; the double-scan complicated the code and made it harder to
understand and fix.  Adjust the repeat-count parsing so that it
better matches the behavior of the regex code, in three ways:
1. Diagnose too-large repeat counts rather than treating them as
literal characters.  2. Use RE_INVALID_INTERVAL_ORD, not
RE_NO_BK_BRACES, to decide whether to treat invalid-syntax {...}s
as literals.  3. Use the same wording for {...}-related
diagnostics that the regex code uses.
* tests/bre.tests, tests/ere.tests, tests/repetition-overflow:
Adjust to match new behavior, and add a few tests.
---
 NEWS                      |    3 +
 src/dfa.c                 |  127 +++++++++++++++++----------------------------
 tests/bre.tests           |    6 ++-
 tests/ere.tests           |    2 +
 tests/repetition-overflow |    4 +-
 5 files changed, 58 insertions(+), 84 deletions(-)

diff --git a/NEWS b/NEWS
index 6dad608..b219b65 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ GNU grep NEWS                                    -*- outline 
-*-
   name too long", and it can run much faster when dealing with large
   directory hierarchies.
 
+  grep -E 'a{1000000000}' now reports an overflow error rather than
+  silently acting like grep -E 'a\{1000000000}'.
+
 ** New features
 
   The -R option now has a long-option alias --dereference-recursive.
diff --git a/src/dfa.c b/src/dfa.c
index 6eb4a11..613f548 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -861,6 +861,10 @@ static unsigned char const *buf_end;    /* reference to 
end in dfaexec().  */
 
 #endif /* MBS_SUPPORT */
 
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1328,90 +1332,53 @@ lex (void)
           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
             goto normal_char;
 
-          if (syntax_bits & RE_NO_BK_BRACES)
-            {
-              /* Scan ahead for a valid interval; if it's not valid,
-                 treat it as a literal '{'.  */
-              int lo = -1, hi = -1;
-              char const *p = lexptr;
-              char const *lim = p + lexleft;
-              for (; p != lim && ISASCIIDIGIT (*p); p++)
-                {
-                  if (lo < 0)
-                    lo = *p - '0';
-                  else
-                    {
-                      lo = lo * 10 + *p - '0';
-                      if (RE_DUP_MAX < lo)
-                        goto normal_char;
-                    }
-                }
-              if (p != lim && *p == ',')
-                while (++p != lim && ISASCIIDIGIT (*p))
-                  {
-                    if (hi < 0)
-                      hi = *p - '0';
-                    else
-                      {
-                        hi = hi * 10 + *p - '0';
-                        if (RE_DUP_MAX < hi)
-                          goto normal_char;
-                      }
-                  }
-              else
-                hi = lo;
-              if (p == lim || *p != '}' || lo < 0 || (0 <= hi && hi < lo))
-                goto normal_char;
-            }
-
-          minrep = 0;
           /* Cases:
              {M} - exact count
              {M,} - minimum count, maximum is infinity
+             {,N} - 0 through N
+             {,} - 0 to infinity (same as '*')
              {M,N} - M through N */
-          FETCH (c, _("unfinished repeat count"));
-          if (ISASCIIDIGIT (c))
-            {
-              minrep = c - '0';
-              for (;;)
-                {
-                  FETCH (c, _("unfinished repeat count"));
-                  if (!ISASCIIDIGIT (c))
-                    break;
-                  minrep = 10 * minrep + c - '0';
-                }
-            }
-          else
-            dfaerror (_("malformed repeat count"));
-          if (c == ',')
-            {
-              FETCH (c, _("unfinished repeat count"));
-              if (!ISASCIIDIGIT (c))
-                maxrep = -1;
-              else
-                {
-                  maxrep = c - '0';
-                  for (;;)
-                    {
-                      FETCH (c, _("unfinished repeat count"));
-                      if (!ISASCIIDIGIT (c))
-                        break;
-                      maxrep = 10 * maxrep + c - '0';
-                    }
-                  if (0 <= maxrep && maxrep < minrep)
-                    dfaerror (_("malformed repeat count"));
-                }
-            }
-          else
-            maxrep = minrep;
-          if (!(syntax_bits & RE_NO_BK_BRACES))
-            {
-              if (c != '\\')
-                dfaerror (_("malformed repeat count"));
-              FETCH (c, _("unfinished repeat count"));
-            }
-          if (c != '}')
-            dfaerror (_("malformed repeat count"));
+          {
+            char const *p = lexptr;
+            char const *lim = p + lexleft;
+            minrep = maxrep = -1;
+            for (; p != lim && ISASCIIDIGIT (*p); p++)
+              {
+                if (minrep < 0)
+                  minrep = *p - '0';
+                else
+                  minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
+              }
+            if (p != lim)
+              {
+                if (*p != ',')
+                  maxrep = minrep;
+                else
+                  {
+                    if (minrep < 0)
+                      minrep = 0;
+                    while (++p != lim && ISASCIIDIGIT (*p))
+                      {
+                        if (maxrep < 0)
+                          maxrep = *p - '0';
+                        else
+                          maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - 
'0');
+                      }
+                  }
+              }
+            if (! ((! backslash || (p != lim && *p++ == '\\'))
+                   && p != lim && *p++ == '}'
+                   && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
+              {
+                if (syntax_bits & RE_INVALID_INTERVAL_ORD)
+                  goto normal_char;
+                dfaerror (_("Invalid content of \\{\\}"));
+              }
+            if (RE_DUP_MAX < maxrep)
+              dfaerror (_("Regular expression too big"));
+            lexptr = p;
+            lexleft = lim - p;
+          }
           laststart = 0;
           return lasttok = REPMN;
 
diff --git a/tests/bre.tests b/tests/bre.tests
index 60ff1b5..9d01a3c 100644
--- a/tests/bre.tests
+++ b/tests/bre.tests
@@ -42,8 +42,9 @@
 2@a\{1@EBRACE
 2@a\{1a@EBRACE
 2@a\{1a\}@BADBR
-2@a\{,2\}@BADBR
-2@a\{,\}@BADBR
+0@a\{,2\}@a\{,2\}
+0@a\{,\}@a\{,\}
+2@a\{\}@BADBR
 2@a\{1,x\}@BADBR
 2@a\{1,x@EBRACE
 2@a\{32768\}@BADBR
@@ -60,3 +61,4 @@
 2@a\{1\}*@BADRPT@TO CORRECT
 1@a\(b\)?c\1d@acd
 0@-\{0,1\}[0-9]*$@-5
+2@b\{1000000000\}@ESIZE
diff --git a/tests/ere.tests b/tests/ere.tests
index 08b3dba..e0aad2a 100644
--- a/tests/ere.tests
+++ b/tests/ere.tests
@@ -76,6 +76,7 @@
 0@a{1a}@a{1a}
 0@a{,2}@a{,2}
 0@a{,}@a{,}
+2@a{}@BADBR
 0@a{1,*}@a{1,,,}
 2@a{1,x@EBRACE@TO CORRECT
 2@a{300}@BADBR@TO CORRECT
@@ -213,3 +214,4 @@
 0@abcdefghijklmnopqrstuv@abcdefghijklmnopqrstuv
 0@CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a@CC11
 0@a?b@ab
+2@b{1000000000}@ESIZE
diff --git a/tests/repetition-overflow b/tests/repetition-overflow
index c92de23..66a44a6 100755
--- a/tests/repetition-overflow
+++ b/tests/repetition-overflow
@@ -11,9 +11,9 @@ fail=0
 # range of "unsigned int" would silently wrap around.   Hence, 2^32+1
 # would be treated just like "1", and both of these would mistakenly match.
 
-echo abc  | grep -E "b{$xp1}"   > out 2>&1; test $? = 1 || fail=1
+echo abc  | grep -E "b{$xp1}"   > out 2> /dev/null; test $? = 2 || fail=1
 compare out /dev/null || fail=1
-echo abbc | grep -E "b{1,$xp2}" > out 2>&1; test $? = 1 || fail=1
+echo abbc | grep -E "b{1,$xp2}" > out 2> /dev/null; test $? = 2 || fail=1
 compare out /dev/null || fail=1
 
 Exit $fail
-- 
1.7.6.5

Re: [PATCH] tests: exercise two recently-fixed bugs

Reply via email to