Thanks for catching that bug and for the fix. I found some ways to
simplify it, and to fix a glitch with CRLF handling that I discovered
while reviewing it, and installed your patch along with the attached two
fixup patches.
From 2e4192963c6ffc34756bfa603fcd03ff7aa3a297 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 11 Apr 2014 10:57:08 -0700
Subject: [PATCH 1/2] grep: cleanup for empty-string fix
* NEWS: Document it.
* src/dfasearch.c (GEAcompile):
* src/kwsearch.c (Fcompile):
Use C99-style decls to simplify. Avoid duplicate code.
* tests/empty-line: Add some more tests like this.
---
NEWS | 3 +++
src/dfasearch.c | 32 ++++++++++----------------------
src/kwsearch.c | 12 ++++--------
tests/empty-line | 38 +++++++++++++++++++++++++++++++-------
4 files changed, 48 insertions(+), 37 deletions(-)
diff --git a/NEWS b/NEWS
index 2a62e7b..92ce95e 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,9 @@ GNU grep NEWS -*- outline -*-
mishandles patterns like [^a] in locales that have multicharacter
collating sequences so that [^a] can match a string of two characters.
+ grep no longer mishandles an empty pattern at the end of a pattern list.
+ [bug introduced in grep-2.5]
+
grep -P now works with -w and -x and backreferences. Before,
echo aa|grep -Pw '(.)\1' would fail to match, yet
echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 39ea442..1266c80 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -110,8 +110,6 @@ kwsmusts (void)
void
GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
{
- const char *err;
- const char *p, *sep;
size_t total = size;
char *motif;
@@ -120,15 +118,15 @@ GEAcompile (char const *pattern, size_t size,
reg_syntax_t syntax_bits)
re_set_syntax (syntax_bits);
dfasyntax (syntax_bits, match_icase, eolbyte);
- /* For GNU regex compiler we have to pass the patterns separately to detect
- errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
- GNU regex should have raise a syntax error. The same for backref, where
- the backref should have been local to each pattern. */
- p = pattern;
+ /* For GNU regex, pass the patterns separately to detect errors like
+ "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
+ this should be a syntax error. The same for backref, where the
+ backref should be local to each pattern. */
+ char const *p = pattern;
do
{
size_t len;
- sep = memchr (p, '\n', total);
+ char const *sep = memchr (p, '\n', total);
if (sep)
{
len = sep - p;
@@ -144,24 +142,14 @@ GEAcompile (char const *pattern, size_t size,
reg_syntax_t syntax_bits)
patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
patterns[pcount] = patterns0;
- if ((err = re_compile_pattern (p, len,
- &(patterns[pcount].regexbuf))) != NULL)
+ char const *err = re_compile_pattern (p, len,
+ &(patterns[pcount].regexbuf));
+ if (err)
error (EXIT_TROUBLE, 0, "%s", err);
pcount++;
-
p = sep;
- } while (sep && total != 0);
-
- if (sep)
- {
- patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
- patterns[pcount] = patterns0;
-
- if ((err = re_compile_pattern ("", 0,
- &(patterns[pcount].regexbuf))) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- pcount++;
}
+ while (p);
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7fe8e48..cf8df3c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -32,7 +32,6 @@ static kwset_t kwset;
void
Fcompile (char const *pattern, size_t size)
{
- char const *p, *sep;
size_t total = size;
mb_len_map_t *map = NULL;
char const *pat = (match_icase && MB_CUR_MAX > 1
@@ -41,11 +40,11 @@ Fcompile (char const *pattern, size_t size)
kwsinit (&kwset);
- p = pat;
+ char const *p = pat;
do
{
size_t len;
- sep = memchr (p, '\n', total);
+ char const *sep = memchr (p, '\n', total);
if (sep)
{
len = sep - p;
@@ -63,12 +62,9 @@ Fcompile (char const *pattern, size_t size)
}
kwsincr (kwset, p, len);
-
p = sep;
- } while (sep && total != 0);
-
- if (sep)
- kwsincr (kwset, "", 0);
+ }
+ while (p);
kwsprep (kwset);
}
diff --git a/tests/empty-line b/tests/empty-line
index aeaa6ca..25e9509 100755
--- a/tests/empty-line
+++ b/tests/empty-line
@@ -1,17 +1,41 @@
#! /bin/sh
-# This would fail for grep-2.18
+# Test that the empty pattern matches everything.
+# Some of these tests failed in grep 2.18.
. "${srcdir=.}/init.sh"; path_prepend_ ../src
fail=0
printf 'abc\n' >in || framework_failure_
+nl='
+'
-printf 'foo\n\n' >pat || framework_failure_
-grep -F -f pat in >out || fail=1
-compare in out || fail=1
+for opt in '' -E -F; do
+ case $opt in
+ '') prefix='\(\)\1';;
+ -E) prefix='()\1';;
+ -F) prefix="foo$nl";;
+ esac
-printf '\(\)\\1foo\n\n' >pat || framework_failure_
-grep -f pat in >out || fail=1
-compare in out || fail=1
+ for pattern in "" "$nl" "---$nl" "${nl}foo"; do
+ for pat in "$pattern" "$prefix$pattern"; do
+ grep $opt -e "$pat" in >out || fail=1
+ compare in out || fail=1
+
+ printf -- '%s\n' "$pat" >pat || framework_failure_
+ grep $opt -f pat in >out || fail=1
+ compare in out || fail=1
+
+ # Check that pattern files that end in non-newlines
+ # are treated as if a newline were appended.
+ case $pattern in
+ '' | *"$nl") ;;
+ *)
+ printf -- '%s' "$pat" >pat || framework_failure_
+ grep $opt -f pat in >out || fail=1
+ compare in out || fail=1;;
+ esac
+ done
+ done
+done
Exit $fail
--
1.9.0
From 1614968f1e6429cf2a55b58cdb4f7f3f353431db Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 11 Apr 2014 12:48:48 -0700
Subject: [PATCH 2/2] grep: cleanup for HAS_DOS_FILE_CONTENTS issue
While cleaning up the empty-string fix, I noticed that one part of
the code worried about CRLF in pattern files whereas another part
did not. Fix this by using the same approach in both places,
and make the CRLF code more modular in the process.
* src/dosbuf.c (dos_binary, dos_unix_byte_offsets): New functions.
(undossify_input, dossified_pos): Do nothing if ! O_BINARY.
* src/grep.c: Always include dosbuf.c so that the code is
checked statically even on non-DOS hosts.
(dos_binary, dos_unix_byte_offsets): New decls.
(undossify_input): Declare unconditionally.
* src/grep.c (fillbuf, print_line_head, main):
* src/kwsearch.c (Fcompile):
Simplify by not worrying about HAVE_DOS_FILE_CONTENTS.
* src/grep.c (main): fopen with "rt" if O_TEXT; this is simpler
than worrying about HAVE_DOS_FILE_CONTENTS elsewhere.
* src/system.h (HAVE_DOS_FILE_CONTENTS): Remove.
---
src/dosbuf.c | 22 ++++++++++++++++++++++
src/grep.c | 26 ++++++++------------------
src/kwsearch.c | 4 ----
src/system.h | 4 ----
4 files changed, 30 insertions(+), 26 deletions(-)
diff --git a/src/dosbuf.c b/src/dosbuf.c
index 3648371..9ac2d13 100644
--- a/src/dosbuf.c
+++ b/src/dosbuf.c
@@ -49,6 +49,22 @@ static int dos_pos_map_size = 0;
static int dos_pos_map_used = 0;
static int inp_map_idx = 0, out_map_idx = 1;
+/* Set default DOS file type to binary. */
+static void
+dos_binary (void)
+{
+ if (O_BINARY)
+ dos_use_file_type = DOS_BINARY;
+}
+
+/* Tell DOS routines to report Unix offset. */
+static void
+dos_unix_byte_offsets (void)
+{
+ if (O_BINARY)
+ dos_report_unix_offset = 1;
+}
+
/* Guess DOS file type by looking at its contents. */
static File_type
guess_type (char *buf, size_t buflen)
@@ -79,6 +95,9 @@ guess_type (char *buf, size_t buflen)
static int
undossify_input (char *buf, size_t buflen)
{
+ if (! O_BINARY)
+ return buflen;
+
int chars_left = 0;
if (totalcc == 0)
@@ -167,6 +186,9 @@ undossify_input (char *buf, size_t buflen)
static off_t
dossified_pos (off_t byteno)
{
+ if (! O_BINARY)
+ return byteno;
+
off_t pos_lo;
off_t pos_hi;
diff --git a/src/grep.c b/src/grep.c
index 8bd6c49..d58ef03 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -395,9 +395,10 @@ static enum
static int grepfile (int, char const *, int, int);
static int grepdesc (int, int);
-#if defined HAVE_DOS_FILE_CONTENTS
+
+static void dos_binary (void);
+static void dos_unix_byte_offsets (void);
static int undossify_input (char *, size_t);
-#endif
static int
is_device_mode (mode_t m)
@@ -648,10 +649,7 @@ fillbuf (size_t save, struct stat const *st)
if (fillsize < 0)
fillsize = cc = 0;
bufoffset += fillsize;
-#if defined HAVE_DOS_FILE_CONTENTS
- if (fillsize)
- fillsize = undossify_input (readbuf, fillsize);
-#endif
+ fillsize = undossify_input (readbuf, fillsize);
buflim = readbuf + fillsize;
return cc;
}
@@ -695,9 +693,7 @@ static intmax_t pending; /* Pending lines of output.
static int done_on_match; /* Stop scanning file on first match. */
static int exit_on_match; /* Exit on first match. */
-#if defined HAVE_DOS_FILE_CONTENTS
-# include "dosbuf.c"
-#endif
+#include "dosbuf.c"
/* Add two numbers that count input bytes or lines, and report an
error if the addition overflows. */
@@ -803,9 +799,7 @@ print_line_head (char const *beg, char const *lim, int sep)
if (out_byte)
{
uintmax_t pos = add_count (totalcc, beg - bufbeg);
-#if defined HAVE_DOS_FILE_CONTENTS
pos = dossified_pos (pos);
-#endif
if (pending_sep)
print_sep (sep);
print_offset (pos, 6, byte_num_color);
@@ -2043,15 +2037,11 @@ main (int argc, char **argv)
break;
case 'U':
-#if defined HAVE_DOS_FILE_CONTENTS
- dos_use_file_type = DOS_BINARY;
-#endif
+ dos_binary ();
break;
case 'u':
-#if defined HAVE_DOS_FILE_CONTENTS
- dos_report_unix_offset = 1;
-#endif
+ dos_unix_byte_offsets ();
break;
case 'V':
@@ -2086,7 +2076,7 @@ main (int argc, char **argv)
break;
case 'f':
- fp = STREQ (optarg, "-") ? stdin : fopen (optarg, "r");
+ fp = STREQ (optarg, "-") ? stdin : fopen (optarg, O_TEXT ? "rt" : "r");
if (!fp)
error (EXIT_TROUBLE, errno, "%s", optarg);
for (keyalloc = 1; keyalloc <= keycc + 1; keyalloc *= 2)
diff --git a/src/kwsearch.c b/src/kwsearch.c
index cf8df3c..5f3f233 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -50,10 +50,6 @@ Fcompile (char const *pattern, size_t size)
len = sep - p;
sep++;
total -= (len + 1);
-#if HAVE_DOS_FILE_CONTENTS
- if (sep[-1] == '\r')
- --len;
-#endif
}
else
{
diff --git a/src/system.h b/src/system.h
index 4c85409..7da1d8d 100644
--- a/src/system.h
+++ b/src/system.h
@@ -29,10 +29,6 @@
#include "minmax.h"
#include "same-inode.h"
-#if O_BINARY
-# define HAVE_DOS_FILE_CONTENTS 1
-#endif
-
#include <stdlib.h>
#include <stddef.h>
#include <limits.h>
--
1.9.0