Thanks for looking into this. I added that test case, but took a
more-conservative approach to fixing the bug, by disabling the
optimization that's causing this problem; please see attached patches.
The optimization was a hack anyway, and these bugs suggest that it's not
a hack worth keeping.
>From c4d6d577820f6d40cc0f34be56fc8d8795381fef Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 24 Oct 2014 13:27:29 -0700
Subject: [PATCH 1/2] grep: fix grep -P crash
Reported by Shlomi Fish in: http://bugs.gnu.org/18806
Commit 9fa500407137f49f6edc3c6b4ee6c7096f0190c5 (2014-09-16) is a
hack that I put in to speed up 'grep -P'. Unfortunately, not only
is it violation of modularity, it's also a bug magnet, as we have
found out with Bug#18738 and Bug#18806. Remove the optimization
instead of applying more bandaids. Perhaps we can think of a
better way of doing the optimization, or perhaps we can just live
with a slower grep -P (as -P is inherently slower anyway...).
* src/grep.c, src/grep.h (validated_boundary):
Remove. All uses removed.
* src/pcresearch.c (Pexecute): Do not worry about validated_boundary.
---
src/grep.c | 3 ---
src/grep.h | 4 ----
src/pcresearch.c | 37 ++++++++++++++-----------------------
3 files changed, 14 insertions(+), 30 deletions(-)
diff --git a/src/grep.c b/src/grep.c
index a0f2620..0a4ac27 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -352,7 +352,6 @@ bool match_words;
bool match_lines;
char eolbyte;
enum textbin input_textbin;
-char const *validated_boundary;
static char const *matcher;
@@ -1226,7 +1225,6 @@ grepbuf (char const *beg, char const *lim)
intmax_t outleft0 = outleft;
char const *p;
char const *endp;
- validated_boundary = beg;
for (p = beg; p < lim; p = endp)
{
@@ -2516,7 +2514,6 @@ main (int argc, char **argv)
/* We need one byte prior and one after. */
char eolbytes[3] = { 0, eolbyte, 0 };
size_t match_size;
- validated_boundary = eolbytes + 1;
skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0)
== out_invert);
diff --git a/src/grep.h b/src/grep.h
index 86259fb..02052b4 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -47,8 +47,4 @@ enum textbin
/* Input file type. */
extern enum textbin input_textbin;
-/* Validation boundary. Earlier bytes have already been validated by
- the PCRE matcher, which cares about this sort of thing. */
-extern char const *validated_boundary;
-
#endif
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1fd5bde..5451029 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -156,7 +156,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
char const *line_start = buf;
int e = PCRE_ERROR_NOMATCH;
char const *line_end;
- char const *validated = validated_boundary;
/* If the input type is unknown, the caller is still testing the
input, which means the current buffer cannot contain encoding
@@ -210,34 +209,28 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
int options = 0;
if (!bol)
options |= PCRE_NOTBOL;
- if (multiline || p + search_bytes <= validated)
+ if (multiline)
options |= PCRE_NO_UTF8_CHECK;
- int valid_bytes = validated - p;
- if (valid_bytes <= 0)
+ e = pcre_exec (cre, extra, p, search_bytes, 0,
+ options, sub, NSUB);
+ if (e != PCRE_ERROR_BADUTF8)
{
- e = pcre_exec (cre, extra, p, search_bytes, 0,
- options, sub, NSUB);
- if (e != PCRE_ERROR_BADUTF8)
+ if (0 < e && multiline && sub[1] - sub[0] != 0)
{
- validated = p + search_bytes;
- if (0 < e && multiline && sub[1] - sub[0] != 0)
+ char const *nl = memchr (p + sub[0], eolbyte,
+ sub[1] - sub[0]);
+ if (nl)
{
- char const *nl = memchr (p + sub[0], eolbyte,
- sub[1] - sub[0]);
- if (nl)
- {
- /* This match crosses a line boundary; reject it. */
- p += sub[0];
- line_end = nl;
- continue;
- }
+ /* This match crosses a line boundary; reject it. */
+ p += sub[0];
+ line_end = nl;
+ continue;
}
- break;
}
- valid_bytes = sub[0];
- validated = p + valid_bytes;
+ break;
}
+ int valid_bytes = sub[0];
/* Try to match the string before the encoding error.
Again, handle the empty-match case specially, for speed. */
@@ -263,8 +256,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
bol = true;
}
- validated_boundary = validated;
-
if (e <= 0)
{
switch (e)
--
1.9.3
>From f79cd1bbd135f7c1751059be9d1d785598a7759b Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Fri, 24 Oct 2014 13:33:04 -0700
Subject: [PATCH 2/2] tests: add test for grep -P fix
* tests/pcre-o: New test for this change.
* tests/Makefile.am (TESTS): Add it.
---
tests/Makefile.am | 1 +
tests/pcre-o | 17 +++++++++++++++++
2 files changed, 18 insertions(+)
create mode 100755 tests/pcre-o
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c298835..f6f051c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -90,6 +90,7 @@ TESTS = \
pcre-abort \
pcre-infloop \
pcre-invalid-utf8-input \
+ pcre-o \
pcre-utf8 \
pcre-w \
pcre-wx-backref \
diff --git a/tests/pcre-o b/tests/pcre-o
new file mode 100755
index 0000000..3d0677c
--- /dev/null
+++ b/tests/pcre-o
@@ -0,0 +1,17 @@
+#! /bin/sh
+# Ensure that grep -oP doesn't cause internal error at match.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_pcre_
+
+fail=0
+
+echo ab | grep -oP 'a' || fail=1
+
+Exit $fail
--
1.9.3