bug#17376: [PATCH] grep: fix the different behaviour for a invalid sequence between KWset and DFA

Paul Eggert Sun, 04 May 2014 19:43:16 -0700

Thanks for that fix. I found a few glitches, mostly having to do withstoring WEOF in a wchar_t, which is not portable. There were also someopportunities for simplification and clarification. I installed thepatch, and followed it up with the attached three patches. The secondpatch is the only one that fixes any bugs.

From 87a84a103685be58b1cc218410156df7eff15662 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 4 May 2014 18:46:40 -0700
Subject: [PATCH 1/3] tests: improve coverage for prefix-of-multibyte


* tests/prefix-of-multibyte: Also test the regex version.
---
 tests/prefix-of-multibyte | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/prefix-of-multibyte b/tests/prefix-of-multibyte
index 70a924e..2ab9a99 100755
--- a/tests/prefix-of-multibyte
+++ b/tests/prefix-of-multibyte
@@ -13,11 +13,16 @@ encode aABC >exp2
 fail=0
 
 for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do
-  for opt in '' '-F'; do
-    out=out-$opt-$LOC
-    LC_ALL=$LOC grep $opt "$(encode A)" exp1 >$out || fail=1
+  for type in dfa fgrep regex; do
+    case $type in
+      dfa) opt= prefix= ;;
+      fgrep) opt=-F prefix= ;;
+      regex) opt= prefix='\(\)\1' ;;
+    esac
+    out=out-$type-$LOC
+    LC_ALL=$LOC grep $opt "$prefix$(encode A)" exp1 >$out || fail=1
     compare exp1 $out || fail=1
-    LC_ALL=$LOC grep $opt "$(encode aA)" exp2 >$out || fail=1
+    LC_ALL=$LOC grep $opt "$prefix$(encode aA)" exp2 >$out || fail=1
     compare exp2 $out || fail=1
   done
 done
-- 
1.9.0

From 3ba30d3ecc809fca6148d09b30edcb7d59636e9a Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 4 May 2014 18:59:51 -0700
Subject: [PATCH 2/3] grep: simplify and fix problems with KWset-DFA agreement
 patch

* src/dfa.c (dfambcache, parse_bracket_exp): Simplify.
(mbs_to_wchar, wctok, FETCH_WC, match_anychar, match_mb_charset)
(check_matching_with_multibyte_ops, transit_state_consume_1char)
(transit_state, dfaexec): Use wint_t, not wchar_t, so that
WEOF is treated correctly on platforms where WEOF is not a valid
wchar_t value.
(ctok, lex): Use int, not unsigned int, for characters,
so that EOF is treated more naturally.
(parse_bracket_exp): Use NOTCHAR to mark uninitialized char, since
FETCH_WC can now set the char to EOF.
(lex): Remove unnecessary test for EOF.
(parse_bracket_exp, atom): Swap then and else parts, to put
the small one first; this is more readable here.
* src/searchutils.c (is_mb_middle): Simplify.
---
 src/dfa.c         | 117 ++++++++++++++++++++++++++----------------------------
 src/searchutils.c |  12 ++----
 2 files changed, 60 insertions(+), 69 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 1c220e2..9b7d115 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -363,9 +363,8 @@ struct dfa
   int *multibyte_prop;
 
   /* A table indexed by byte values that contains the corresponding wide
-     character (if any) for that byte.  WEOF means the byte is the
-     leading byte of a multibyte character.  Invalid and null bytes are
-     mapped to themselves.  */
+     character (if any) for that byte.  WEOF means the byte is not a
+     valid single-byte character.  */
   wint_t mbrtowc_cache[NOTCHAR];
 
   /* Array of the bracket expression in the DFA.  */
@@ -453,29 +452,19 @@ dfambcache (struct dfa *d)
       unsigned char uc = i;
       mbstate_t s = { 0 };
       wchar_t wc;
-      wint_t wi;
-      switch (mbrtowc (&wc, &c, 1, &s))
-        {
-        default:
-          wi = wc;
-          break;
-        case (size_t) -1:
-        case (size_t) -2:
-          wi = WEOF;
-          break;
-        }
-      d->mbrtowc_cache[uc] = wi;
+      d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
     }
 }
 
 /* Store into *PWC the result of converting the leading bytes of the
    multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
    and updating the conversion state in *D.  On conversion error,
-   convert just a single byte as-is.  Return the number of bytes
+   convert just a single byte, to WEOF.  Return the number of bytes
    converted.
 
    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 
+   * PWC points to wint_t, not to wchar_t.
    * The last arg is a dfa *D instead of merely a multibyte conversion
      state D->mbs.  D also contains an mbrtowc_cache for speed.
    * N must be at least 1.
@@ -485,16 +474,20 @@ dfambcache (struct dfa *d)
    * D->mbs is always valid afterwards.
    * *PWC is always set to something.  */
 static size_t
-mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d)
+mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
   wint_t wc = d->mbrtowc_cache[uc];
 
   if (wc == WEOF)
     {
-      size_t nbytes = mbrtowc (pwc, s, n, &d->mbs);
+      wchar_t wch;
+      size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
       if (0 < nbytes && nbytes < (size_t) -2)
-        return nbytes;
+        {
+          *pwc = wch;
+          return nbytes;
+        }
       memset (&d->mbs, 0, sizeof d->mbs);
     }
 
@@ -848,13 +841,21 @@ static int minrep, maxrep;      /* Repeat counts for 
{m,n}.  */
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
 /* These variables are used only if (MB_CUR_MAX > 1).  */
-static wchar_t wctok;           /* Wide character representation of the current
-                                   multibyte character.  */
-static unsigned int ctok;       /* Single character representation of the 
current
-                                   multibyte character.  */
-
-
-/* Note that characters become unsigned here.  */
+static wint_t wctok;           /* Wide character representation of the current
+                                  multibyte character, or WEOF if there was
+                                   an encoding error.  */
+static int ctok;               /* Current input byte if it is an entire
+                                   character or is an encoding error,
+                                   EOF otherwise.  */
+
+
+/* Fetch the next lexical input character.  Set C (of type int) to the
+   next input byte, except set C to EOF if the input is a multibyte
+   character of length greater than 1.  Set WC (of type wint_t) to the
+   value of the input if it is a valid multibyte character (possibly
+   of length 1); otherwise set WC to WEOF.  If there is no more input,
+   report EOFERR if EOFERR is not null, and return lasttok = END
+   otherwise.  */
 # define FETCH_WC(c, wc, eoferr)               \
   do {                                         \
     if (! lexleft)                             \
@@ -866,7 +867,7 @@ static unsigned int ctok;       /* Single character 
representation of the curren
       }                                                \
     else                                       \
       {                                                \
-        wchar_t _wc;                           \
+        wint_t _wc;                            \
         size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
         cur_mb_len = nbytes;                   \
         (wc) = _wc;                            \
@@ -1027,7 +1028,7 @@ parse_bracket_exp (void)
   colon_warning_state = (c == ':');
   do
     {
-      c1 = EOF;                 /* mark c1 is not initialized".  */
+      c1 = NOTCHAR;    /* Mark c1 as not initialized.  */
       colon_warning_state &= ~2;
 
       /* Note that if we're looking at some other [:...:] construct,
@@ -1105,7 +1106,7 @@ parse_bracket_exp (void)
       if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
         FETCH_WC (c, wc, _("unbalanced ["));
 
-      if (c1 == EOF)
+      if (c1 == NOTCHAR)
         FETCH_WC (c1, wc1, _("unbalanced ["));
 
       if (c1 == '-')
@@ -1192,28 +1193,24 @@ parse_bracket_exp (void)
           continue;
         }
 
-      if (wc != WEOF)
+      if (wc == WEOF)
+        setbit (c, ccl);
+      else
         {
-          if (case_fold)
-            {
-              wchar_t folded[CASE_FOLDED_BUFSIZE];
-              int i, n = case_folded_counterparts (wc, folded);
-              work_mbc->chars = maybe_realloc (work_mbc->chars,
-                                               work_mbc->nchars + n, &chars_al,
-                                               sizeof *work_mbc->chars);
-              for (i = 0; i < n; i++)
-                if (!setbit_wc (folded[i], ccl))
-                  work_mbc->chars[work_mbc->nchars++] = folded[i];
-            }
-          else if (!setbit_wc (wc, ccl))
-            {
-              work_mbc->chars = maybe_realloc (work_mbc->chars, 
work_mbc->nchars,
-                                               &chars_al, sizeof 
*work_mbc->chars);
-              work_mbc->chars[work_mbc->nchars++] = wc;
-            }
+          wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
+          int i;
+          int n = (case_fold ? case_folded_counterparts (wc, folded + 1) + 1
+                   : 1);
+          folded[0] = wc;
+          for (i = 0; i < n; i++)
+            if (!setbit_wc (folded[i], ccl))
+              {
+                work_mbc->chars
+                  = maybe_realloc (work_mbc->chars, work_mbc->nchars,
+                                   &chars_al, sizeof *work_mbc->chars);
+                work_mbc->chars[work_mbc->nchars++] = folded[i];
+              }
         }
-      else
-        setbit (c, ccl);
     }
   while ((wc = wc1, (c = c1) != ']'));
 
@@ -1245,7 +1242,7 @@ parse_bracket_exp (void)
 static token
 lex (void)
 {
-  unsigned int c, c2;
+  int c, c2;
   bool backslash = false;
   charclass ccl;
   int i;
@@ -1260,8 +1257,6 @@ lex (void)
     {
       FETCH_WC (ctok, wctok, NULL);
       c = ctok;
-      if (c == (unsigned int) EOF)
-        goto normal_char;
 
       switch (c)
         {
@@ -1790,7 +1785,9 @@ atom (void)
 {
   if (tok == WCHAR)
     {
-      if (wctok != WEOF)
+      if (wctok == WEOF)
+        addtok_mb (ctok, 3);
+      else
         {
           addtok_wc (wctok);
 
@@ -1805,8 +1802,6 @@ atom (void)
                 }
             }
         }
-      else
-        addtok_mb (ctok, 3);
 
       tok = lex ();
     }
@@ -2957,7 +2952,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, 
unsigned char const *p,
    match, in bytes.  POS is the position of the ".".  */
 static int
 match_anychar (struct dfa *d, state_num s, position pos,
-               wchar_t wc, size_t mbclen)
+               wint_t wc, size_t mbclen)
 {
   int context;
 
@@ -2987,7 +2982,7 @@ match_anychar (struct dfa *d, state_num s, position pos,
    POS is the position of the bracket expression.  */
 static int
 match_mb_charset (struct dfa *d, state_num s, position pos,
-                  char const *p, wchar_t wc, size_t match_len)
+                  char const *p, wint_t wc, size_t match_len)
 {
   size_t i;
   bool match;              /* Matching succeeded.  */
@@ -3090,7 +3085,7 @@ charset_matched:
    The caller MUST free the array which this function return.  */
 static int *
 check_matching_with_multibyte_ops (struct dfa *d, state_num s,
-                                   char const *p, wchar_t wc, size_t mbclen)
+                                   char const *p, wint_t wc, size_t mbclen)
 {
   size_t i;
   int *rarray;
@@ -3125,7 +3120,7 @@ check_matching_with_multibyte_ops (struct dfa *d, 
state_num s,
 static status_transit_state
 transit_state_consume_1char (struct dfa *d, state_num s,
                              unsigned char const **pp,
-                             wchar_t wc, size_t mbclen,
+                             wint_t wc, size_t mbclen,
                              int *match_lens)
 {
   size_t i, j;
@@ -3176,7 +3171,7 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp,
   int *match_lens = NULL;
   size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
   unsigned char const *p1 = *pp;
-  wchar_t wc;
+  wint_t wc;
 
   if (nelem > 0)
     /* This state has (a) multibyte operator(s).
@@ -3305,7 +3300,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
                      state must skip the bytes that are not a single
                      byte character nor the first byte of a multibyte
                      character.  */
-                  wchar_t wc;
+                  wint_t wc;
                   while (mbp < p)
                     mbp += mbs_to_wchar (&wc, (char const *) mbp,
                                          end - (char const *) mbp, d);
diff --git a/src/searchutils.c b/src/searchutils.c
index 3c78f31..76005bf 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -249,10 +249,10 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
       if (mbclen == (size_t) -2)
         mbclen = mbrlen (p, end - p, &cur_state);
 
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+      if (! (0 < mbclen && mbclen < (size_t) -2))
         {
-          /* An invalid sequence, or a truncated multibyte character.
-             We treat it as a single byte character.  */
+          /* An invalid sequence, or a truncated multibyte character, or
+             a null wide character.  Treat it as a single byte character.  */
           mbclen = 1;
           memset (&cur_state, 0, sizeof cur_state);
         }
@@ -261,9 +261,5 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
 
   *good = p;
 
-  if (p > buf)
-    return true;
-
-  /* P == BUF here.  */
-  return false;
+  return buf < p;
 }
-- 
1.9.0

From 17683df11fbea7aa01c9d60f1b45874c9ea5e26a Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 4 May 2014 19:34:30 -0700
Subject: [PATCH 3/3] dfa: minor simplification

* src/dfa.c (parse_bracket_exp): Use enum, not macro, and move var
to just the scope it's needed.
---
 src/dfa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 9b7d115..5211087 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1037,13 +1037,13 @@ parse_bracket_exp (void)
          dfa is ever called.  */
       if (c == '[')
         {
-#define MAX_BRACKET_STRING_LEN 32
-          char str[MAX_BRACKET_STRING_LEN + 1];
           FETCH_WC (c1, wc1, _("unbalanced ["));
 
           if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
               || c1 == '.' || c1 == '=')
             {
+              enum { MAX_BRACKET_STRING_LEN = 32 };
+              char str[MAX_BRACKET_STRING_LEN + 1];
               size_t len = 0;
               for (;;)
                 {
-- 
1.9.0

bug#17376: [PATCH] grep: fix the different behaviour for a invalid sequence between KWset and DFA

Reply via email to