On 04/16/2014 05:13 AM, Norihiro Tanaka wrote:
> http://bugs.exim.org/show_bug.cgi?id=1468

Thanks. The response there makes it clear that if grep passes arbitrary binary data to PCRE, and if grep uses PCRE_NO_UTF8_CHECK, undefined behavior will result (possibly an infinite loop, a core dump, etc.). We can't have undefined behavior in grep. A simple fix is to avoid using PCRE_NO_UTF8_CHECK, so I installed the attached patch to do that. Perhaps we can think of a better way at some point. In the meantime I'm taking the liberty of closing Bug#17245 and Bug#16586.
From b9a691aa9b7aaa43e07841f11095d779b210448d Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Mon, 21 Apr 2014 10:51:16 -0700
Subject: [PATCH] grep: -P now rejects invalid input sequences in UTF-8 locales

See <http://bugs.gnu.org/17245> and <http://bugs.exim.org/1468>.
* NEWS: Document this.
* src/pcresearch.c (Pexecute): Do not use PCRE_NO_UTF8_CHECK,
as this leads to undefined behavior when the input is not UTF-8.
* tests/pcre-infloop, tests/pcre-invalid-utf8-input:
Exit status is now 2, not 1, when grep -P is given invalid UTF-8
data in a UTF-8 locale.
---
 NEWS                          |  4 ++++
 src/pcresearch.c              | 17 ++++-------------
 tests/pcre-infloop            |  2 +-
 tests/pcre-invalid-utf8-input |  5 ++---
 4 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index fbb782b..2d3e12a 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,10 @@ GNU grep NEWS                                    -*- outline -*-
   grep -f no longer mishandles patterns containing NUL bytes.
   [bug introduced in grep-2.11]
 
+  grep -P now reports an error and exits when given invalid UTF-8 data.
+  Previously it was unreliable, and sometimes crashed or looped.
+  [bug introduced in grep-2.16]
+
   grep -P now works with -w and -x and backreferences. Before,
   echo aa|grep -Pw '(.)\1' would fail to match, yet
   echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a5e953f..9f63f37 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -52,19 +52,14 @@ Pcompile (char const *pattern, size_t size)
   int e;
   char const *ep;
   char *re = xnmalloc (4, size + 7);
-  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  int flags = (PCRE_MULTILINE
+               | (match_icase ? PCRE_CASELESS : 0)
+               | (using_utf8 () ? PCRE_UTF8 : 0));
   char const *patlim = pattern + size;
   char *n = re;
   char const *p;
   char const *pnul;
 
-  if (using_utf8 ())
-    {
-      /* Enable PCRE's UTF-8 matching.  Note also the use of
-         PCRE_NO_UTF8_CHECK when calling pcre_extra, below.   */
-      flags |= PCRE_UTF8;
-    }
-
   /* FIXME: Remove these restrictions.  */
   if (memchr (pattern, '\n', size))
     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
@@ -154,10 +149,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
        e == PCRE_ERROR_NOMATCH && line_next < buf + size;
        start_ofs -= line_next - line_buf)
     {
-      /* Disable the check that would make an invalid byte
-         seqence *in the input* trigger a failure.   */
-      int options = PCRE_NO_UTF8_CHECK;
-
       line_buf = line_next;
       line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
       if (line_end == NULL)
@@ -172,7 +163,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
-                     start_ofs < 0 ? 0 : start_ofs, options,
+                     start_ofs < 0 ? 0 : start_ofs, 0,
                      sub, sizeof sub / sizeof *sub);
     }
 
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 57b67ae..febf356 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
 fail=0
 
 LC_ALL=en_US.utf8 timeout 3 grep -P 'a.?..b' in
-test $? = 1 || fail_ "libpcre's match function appears to infloop"
+test $? = 2 || fail_ "libpcre's match function appears to infloop"
 
 Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index ccf3caf..913e8ee 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -15,8 +15,7 @@ fail=0
 
 printf 'j\202\nj\n' > in || framework_failure_
 
-LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1
-compare in out || fail=1
-compare /dev/null err || fail=1
+LC_ALL=en_US.UTF-8 grep -P j in
+test $? -eq 2 || fail=1
 
 Exit $fail
-- 
1.9.0

Reply via email to