I've installed the attached patch, which fixes the bug for me, and am marking this bug report as done.
From 08f2702d2c3db8ebe37f0cb586b45462f4e28d38 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 10 May 2014 16:26:21 -0700
Subject: [PATCH] dfa: fix bug with \< etc in multibyte locales

Problem reported by Stephane Chazelas in: http://bugs.gnu.org/16867
* NEWS: Document the fix.
* src/dfa.c (dfaoptimize): Remove any superset if changing from
UTF-8 to unibyte, and if the pattern has no backreferences.
(dfassbuild): In multibyte locales, treat \< \> \b \B as
backreferences in the DFA, since the DFA relies on unibyte
tests to check them.
(dfacomp): Optimize after building the superset, so that
dfassbuild can depend on d->multibyte.  A downside is that
dfaoptimize must remove supersets that are likely slower than the
DFA after optimization, but that's been done in the
above-described change.
* tests/Makefile.am (XFAIL_TESTS): Remove word-delim-multibyte,
since the test works now.
---
 NEWS              |  2 ++
 src/dfa.c         | 19 +++++++++++++++++--
 tests/Makefile.am |  4 +---
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index 685ce9b..64539c0 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ GNU grep NEWS                                    -*- outline -*-
 
   grep -w no longer mishandles a potential match adjacent to a letter that
   takes up two or more bytes in a multibyte encoding.
+  Similarly, the patterns '\<', '\>', '\b', and '\B' no longer
+  mishandle word-boundary matches in multibyte locales.
   [bug present since "the beginning"]
 
   grep -P now reports an error and exits when given invalid UTF-8 data.
diff --git a/src/dfa.c b/src/dfa.c
index 0a221f7..ba19a72 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3484,6 +3484,7 @@ static void
 dfaoptimize (struct dfa *d)
 {
   size_t i;
+  bool have_backref = false;
 
   if (!using_utf8 ())
     return;
@@ -3495,6 +3496,9 @@ dfaoptimize (struct dfa *d)
         case ANYCHAR:
           /* Lowered.  */
           abort ();
+        case BACKREF:
+          have_backref = true;
+          break;
         case MBCSET:
           /* Requires multi-byte algorithm.  */
           return;
@@ -3503,6 +3507,14 @@ dfaoptimize (struct dfa *d)
         }
     }
 
+  if (!have_backref && d->superset)
+    {
+      /* The superset DFA is not likely to be much faster, so remove it.  */
+      dfafree (d->superset);
+      free (d->superset);
+      d->superset = NULL;
+    }
+
   free_mbdata (d);
   d->multibyte = false;
 }
@@ -3560,8 +3572,11 @@ dfassbuild (struct dfa *d)
         case NOTLIMWORD:
           if (d->multibyte)
             {
-              /* Ignore these constraints.  */
+              /* These constraints aren't supported in a multibyte locale.
+                 Ignore them in the superset DFA, and treat them as
+                 backreferences in the main DFA.  */
               sup->tokens[j++] = EMPTY;
+              d->tokens[i] = BACKREF;
               break;
             }
         default:
@@ -3591,8 +3606,8 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
   dfambcache (d);
   dfaparse (s, len, d);
   dfamust (d);
-  dfaoptimize (d);
   dfassbuild (d);
+  dfaoptimize (d);
   dfaanalyze (d, searchflag);
   if (d->superset)
     {
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f3450f3..626b25a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,9 +22,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
 AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
 LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
 
-# Remove this definition once the failing test passes.
-XFAIL_TESTS = \
-  word-delim-multibyte
+XFAIL_TESTS =
 
 # Equivalence classes are only supported when using the system
 # matcher (which means only with glibc).
-- 
1.9.0

Reply via email to