I've installed the attached patch, which fixes the bug for me, and am
marking this bug report as done.
From 08f2702d2c3db8ebe37f0cb586b45462f4e28d38 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 10 May 2014 16:26:21 -0700
Subject: [PATCH] dfa: fix bug with \< etc in multibyte locales
Problem reported by Stephane Chazelas in: http://bugs.gnu.org/16867
* NEWS: Document the fix.
* src/dfa.c (dfaoptimize): Remove any superset if changing from
UTF-8 to unibyte, and if the pattern has no backreferences.
(dfassbuild): In multibyte locales, treat \< \> \b \B as
backreferences in the DFA, since the DFA relies on unibyte
tests to check them.
(dfacomp): Optimize after building the superset, so that
dfassbuild can depend on d->multibyte. A downside is that
dfaoptimize must remove supersets that are likely slower than the
DFA after optimization, but that's been done in the
above-described change.
* tests/Makefile.am (XFAIL_TESTS): Remove word-delim-multibyte,
since the test works now.
---
NEWS | 2 ++
src/dfa.c | 19 +++++++++++++++++--
tests/Makefile.am | 4 +---
3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/NEWS b/NEWS
index 685ce9b..64539c0 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ GNU grep NEWS -*- outline -*-
grep -w no longer mishandles a potential match adjacent to a letter that
takes up two or more bytes in a multibyte encoding.
+ Similarly, the patterns '\<', '\>', '\b', and '\B' no longer
+ mishandle word-boundary matches in multibyte locales.
[bug present since "the beginning"]
grep -P now reports an error and exits when given invalid UTF-8 data.
diff --git a/src/dfa.c b/src/dfa.c
index 0a221f7..ba19a72 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3484,6 +3484,7 @@ static void
dfaoptimize (struct dfa *d)
{
size_t i;
+ bool have_backref = false;
if (!using_utf8 ())
return;
@@ -3495,6 +3496,9 @@ dfaoptimize (struct dfa *d)
case ANYCHAR:
/* Lowered. */
abort ();
+ case BACKREF:
+ have_backref = true;
+ break;
case MBCSET:
/* Requires multi-byte algorithm. */
return;
@@ -3503,6 +3507,14 @@ dfaoptimize (struct dfa *d)
}
}
+ if (!have_backref && d->superset)
+ {
+ /* The superset DFA is not likely to be much faster, so remove it. */
+ dfafree (d->superset);
+ free (d->superset);
+ d->superset = NULL;
+ }
+
free_mbdata (d);
d->multibyte = false;
}
@@ -3560,8 +3572,11 @@ dfassbuild (struct dfa *d)
case NOTLIMWORD:
if (d->multibyte)
{
- /* Ignore these constraints. */
+ /* These constraints aren't supported in a multibyte locale.
+ Ignore them in the superset DFA, and treat them as
+ backreferences in the main DFA. */
sup->tokens[j++] = EMPTY;
+ d->tokens[i] = BACKREF;
break;
}
default:
@@ -3591,8 +3606,8 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfambcache (d);
dfaparse (s, len, d);
dfamust (d);
- dfaoptimize (d);
dfassbuild (d);
+ dfaoptimize (d);
dfaanalyze (d, searchflag);
if (d->superset)
{
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f3450f3..626b25a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,9 +22,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
-# Remove this definition once the failing test passes.
-XFAIL_TESTS = \
- word-delim-multibyte
+XFAIL_TESTS =
# Equivalence classes are only supported when using the system
# matcher (which means only with glibc).
--
1.9.0