Norihiro Tanaka wrote:
I rebased this patch, and add a bug fix to it.

Thanks. Paolo wrote it up in <http://bugs.gnu.org/17156#11>, and I just now tweaked its ChangeLog and merged the code and installed it (patch attached). I followed up with minor cleanups (2nd patch attached).
From f66e89b3358d7de7dfb5c51e7a253df04fdf08a9 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 5 Apr 2014 18:52:01 -0700
Subject: [PATCH 1/2] grep: reuse multibyte DFA buffers in non-UTF8 locales

* src/dfa.c (struct dfa): New members 'mblen_buf', 'nmblen_buf',
'inputwcs', 'ninputwcs', 'mb_follows' and 'mb_match_lens'.
(mblen_buf, inputwcs): Remove static vars.
(SKIP_REMAINS_MB_IF_INITIAL_STATE, match_anychar, match_mb_charset)
(transit_state_consume_1char, transit_state, prepare_wc_buf):
Use new members instead of global variables.
(check_matching_with_multibyte_ops): Use new members
instead of new allocation.
(dfaexec): Initialize new members.
(free_mbdata): Free new members.
---
 src/dfa.c | 134 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 69 insertions(+), 65 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 0d7eab5..96fbcb3 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -420,6 +420,30 @@ struct dfa
   struct dfamust *musts;        /* List of strings, at least one of which
                                    is known to appear in any r.e. matching
                                    the dfa.  */
+  unsigned char *mblen_buf;     /* Correspond to the input buffer in dfaexec.
+                                   Each element stores the number of remaining
+                                   bytes of the corresponding multibyte
+                                   character in the input string.  A element's
+                                   value is 0 if the corresponding character is
+                                   single-byte.
+                                   e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
+                                   mblen_buf   :  0,       3,       2,       1
+                                 */
+  size_t nmblen_buf;            /* Length of the mblen buffer currently
+                                   allocated.  */
+  wchar_t *inputwcs;            /* Wide character representation of the input
+                                   string in dfaexec.
+                                   The length of this array is the same as
+                                   the length of input string (char array).
+                                   inputstring[i] is a single-byte char,
+                                   or the first byte of a multibyte char;
+                                   inputwcs[i] is the codepoint.  */
+  size_t ninputwcs;             /* Length of the input wide characters
+                                   currently allocated.  */
+  position_set *mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
+                                   on demand.  */
+  int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
+                                   MBCSET.  */
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -852,22 +876,6 @@ static int cur_mb_len = 1;      /* Length of the multibyte 
representation of
 static mbstate_t mbs;           /* mbstate for mbrtowc.  */
 static wchar_t wctok;           /* Wide character representation of the current
                                    multibyte character.  */
-static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
-                                   Each element stores the number of remaining
-                                   bytes of the corresponding multibyte
-                                   character in the input string.  A element's
-                                   value is 0 if the corresponding character is
-                                   single-byte.
-                                   e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
-                                   mblen_buf   :  0,       3,       2,       1
-                                 */
-static wchar_t *inputwcs;       /* Wide character representation of the input
-                                   string in dfaexec.
-                                   The length of this array is the same as
-                                   the length of input string (char array).
-                                   inputstring[i] is a single-byte char,
-                                   or the first byte of a multibyte char;
-                                   inputwcs[i] is the codepoint.  */
 static unsigned char const *buf_begin;  /* reference to begin in dfaexec.  */
 static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
 
@@ -2899,14 +2907,12 @@ build_state_zero (struct dfa *d)
 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)         \
   if (s == 0)                                          \
     {                                                  \
-      while (inputwcs[p - buf_begin] == 0              \
-            && mblen_buf[p - buf_begin] > 0            \
+      while (d->inputwcs[p - buf_begin] == 0           \
+            && d->mblen_buf[p - buf_begin] > 0         \
             && (unsigned char const *) p < buf_end)    \
         ++p;                                           \
       if ((char *) p >= end)                           \
         {                                              \
-          free (mblen_buf);                            \
-          free (inputwcs);                             \
           *end = saved_end;                            \
           return NULL;                                 \
         }                                              \
@@ -3000,8 +3006,8 @@ match_anychar (struct dfa *d, state_num s, position pos, 
size_t idx)
   wchar_t wc;
   int mbclen;
 
-  wc = inputwcs[idx];
-  mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+  wc = d->inputwcs[idx];
+  mbclen = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
 
   /* Check syntax bits.  */
   if (wc == (wchar_t) eolbyte)
@@ -3042,7 +3048,7 @@ match_mb_charset (struct dfa *d, state_num s, position 
pos, size_t idx)
   int context;
   wchar_t wc;                   /* Current referring character.  */
 
-  wc = inputwcs[idx];
+  wc = d->inputwcs[idx];
 
   /* Check syntax bits.  */
   if (wc == (wchar_t) eolbyte)
@@ -3063,7 +3069,7 @@ match_mb_charset (struct dfa *d, state_num s, position 
pos, size_t idx)
   /* Assign the current referring operator to work_mbc.  */
   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
   match = !work_mbc->invert;
-  match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
+  match_len = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
 
   /* Match in range 0-255?  */
   if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3141,7 +3147,7 @@ check_matching_with_multibyte_ops (struct dfa *d, 
state_num s, size_t idx)
   size_t i;
   int *rarray;
 
-  MALLOC (rarray, d->states[s].mbps.nelem);
+  rarray = d->mb_match_lens;
   for (i = 0; i < d->states[s].mbps.nelem; ++i)
     {
       position pos = d->states[s].mbps.elems[i];
@@ -3171,7 +3177,7 @@ check_matching_with_multibyte_ops (struct dfa *d, 
state_num s, size_t idx)
 static status_transit_state
 transit_state_consume_1char (struct dfa *d, state_num s,
                              unsigned char const **pp,
-                             int *match_lens, int *mbclen, position_set * pps)
+                             int *match_lens, int *mbclen)
 {
   size_t i, j;
   int k;
@@ -3181,7 +3187,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
 
   /* Calculate the length of the (single/multi byte) character
      to which p points.  */
-  *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
+  *mbclen = (d->mblen_buf[*pp - buf_begin] == 0) ? 1 : d->mblen_buf[*pp - 
buf_begin];
 
   /* Calculate the state which can be reached from the state 's' by
      consuming '*mbclen' single bytes from the buffer.  */
@@ -3191,8 +3197,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
       s2 = s1;
       rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
     }
-  /* Copy the positions contained by 's1' to the set 'pps'.  */
-  copy (&(d->states[s1].elems), pps);
+  /* Copy the positions contained by 's1' to the set 'd->mb_follows'.  */
+  copy (&(d->states[s1].elems), d->mb_follows);
 
   /* Check (input) match_lens, and initialize if it is NULL.  */
   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
@@ -3207,12 +3213,10 @@ transit_state_consume_1char (struct dfa *d, state_num s,
       if (work_mbls[i] == *mbclen)
         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
              j++)
-          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
+          insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
+            d->mb_follows);
     }
 
-  if (match_lens == NULL && work_mbls != NULL)
-    free (work_mbls);
-
   /* FIXME: this return value is always ignored.  */
   return rs;
 }
@@ -3229,7 +3233,6 @@ transit_state (struct dfa *d, state_num s, unsigned char 
const **pp)
   size_t i, j;
   int *match_lens = NULL;
   size_t nelem = d->states[s].mbps.nelem;       /* Just a alias.  */
-  position_set follows;
   unsigned char const *p1 = *pp;
   wchar_t wc;
 
@@ -3260,26 +3263,25 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp)
       if (rs == TRANSIT_STATE_DONE)
         ++*pp;
 
-      free (match_lens);
       return s1;
     }
 
   /* This state has some operators which can match a multibyte character.  */
-  alloc_position_set (&follows, d->nleaves);
+  d->mb_follows->nelem = 0;
 
   /* 'maxlen' may be longer than the length of a character, because it may
      not be a character but a (multi character) collating element.
      We enumerate all of the positions which 's' can reach by consuming
      'maxlen' bytes.  */
-  transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
+  transit_state_consume_1char (d, s, pp, match_lens, &mbclen);
 
-  wc = inputwcs[*pp - mbclen - buf_begin];
-  s1 = state_index (d, &follows, wchar_context (wc));
+  wc = d->inputwcs[*pp - mbclen - buf_begin];
+  s1 = state_index (d, d->mb_follows, wchar_context (wc));
   realloc_trans_if_necessary (d, s1);
 
   while (*pp - p1 < maxlen)
     {
-      transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
+      transit_state_consume_1char (d, s1, pp, NULL, &mbclen);
 
       for (i = 0; i < nelem; i++)
         {
@@ -3287,15 +3289,13 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp)
             for (j = 0;
                  j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
               insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
-                      &follows);
+                      d->mb_follows);
         }
 
-      wc = inputwcs[*pp - mbclen - buf_begin];
-      s1 = state_index (d, &follows, wchar_context (wc));
+      wc = d->inputwcs[*pp - mbclen - buf_begin];
+      s1 = state_index (d, d->mb_follows, wchar_context (wc));
       realloc_trans_if_necessary (d, s1);
     }
-  free (match_lens);
-  free (follows.elems);
   return s1;
 }
 
@@ -3313,21 +3313,22 @@ prepare_wc_buf (struct dfa *d, const char *begin, const 
char *end)
 
   for (i = 0; i < ilim; i++)
     {
-      size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, 
&mbs);
-      mblen_buf[i] = nbytes - (nbytes == 1);
+      size_t nbytes = mbs_to_wchar (d, d->inputwcs + i, begin + i, ilim - i,
+                                    &mbs);
+      d->mblen_buf[i] = nbytes - (nbytes == 1);
       if (begin[i] == eol)
         break;
       while (--nbytes != 0)
         {
           i++;
-          mblen_buf[i] = nbytes;
-          inputwcs[i] = 0;
+          d->mblen_buf[i] = nbytes;
+          d->inputwcs[i] = 0;
         }
     }
 
   buf_end = (unsigned char *) (begin + i);
-  mblen_buf[i] = 0;
-  inputwcs[i] = 0;              /* sentinel */
+  d->mblen_buf[i] = 0;
+  d->inputwcs[i] = 0;              /* sentinel */
 }
 
 /* Search through a buffer looking for a match to the given struct dfa.
@@ -3364,10 +3365,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 
   if (d->mb_cur_max > 1)
     {
-      MALLOC (mblen_buf, end - begin + 2);
-      MALLOC (inputwcs, end - begin + 2);
+      static bool mb_alloc = false;
+      REALLOC_IF_NECESSARY (d->mblen_buf, d->nmblen_buf, end - begin + 2);
+      REALLOC_IF_NECESSARY (d->inputwcs, d->ninputwcs, end - begin + 2);
       memset (&mbs, 0, sizeof (mbstate_t));
       prepare_wc_buf (d, (const char *) p, end);
+      if (!mb_alloc)
+        {
+          MALLOC (d->mb_match_lens, d->nleaves);
+          MALLOC (d->mb_follows, 1);
+          alloc_position_set (d->mb_follows, d->nleaves);
+          mb_alloc = true;
+        }
     }
 
   for (;;)
@@ -3394,8 +3403,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
               if (backref)
                 {
                   *backref = 1;
-                  free (mblen_buf);
-                  free (inputwcs);
                   *end = saved_end;
                   return (char *) p;
                 }
@@ -3428,11 +3435,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
             {
               if (backref)
                 *backref = (d->states[s].backref != 0);
-              if (d->mb_cur_max > 1)
-                {
-                  free (mblen_buf);
-                  free (inputwcs);
-                }
               *end = saved_end;
               return (char *) p;
             }
@@ -3463,11 +3465,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       /* Check if we've run off the end of the buffer.  */
       if ((char *) p > end)
         {
-          if (d->mb_cur_max > 1)
-            {
-              free (mblen_buf);
-              free (inputwcs);
-            }
           *end = saved_end;
           return NULL;
         }
@@ -3519,6 +3516,13 @@ free_mbdata (struct dfa *d)
   free (d->mbcsets);
   d->mbcsets = NULL;
   d->nmbcsets = 0;
+
+  free (d->mblen_buf);
+  free (d->inputwcs);
+  if (d->mb_follows != NULL)
+    free (d->mb_follows->elems);
+  free (d->mb_follows);
+  free (d->mb_match_lens);
 }
 
 /* Initialize the components of a dfa that the other routines don't
-- 
1.9.0

From 9bbe55e92c65a1a16c9b37dfabedcb5452586400 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sat, 5 Apr 2014 22:01:39 -0700
Subject: [PATCH 2/2] grep: minor improvements to previous patch

* src/dfa.c (MAX): New macro.
(match_anychar, match_mb_charset, transit_state_consume_1char):
Use it to simplify assignments.
(SKIP_REMAINS_MB_IF_INITIAL_STATE): Prefer != 0 for unsigned.
(free_mbdata): Omit an unnecessary 'free'.
---
 src/dfa.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 96fbcb3..ef5c8a9 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -429,8 +429,7 @@ struct dfa
                                    e.g., input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
                                    mblen_buf   :  0,       3,       2,       1
                                  */
-  size_t nmblen_buf;            /* Length of the mblen buffer currently
-                                   allocated.  */
+  size_t nmblen_buf;            /* Allocated size of mblen_buf.  */
   wchar_t *inputwcs;            /* Wide character representation of the input
                                    string in dfaexec.
                                    The length of this array is the same as
@@ -438,8 +437,7 @@ struct dfa
                                    inputstring[i] is a single-byte char,
                                    or the first byte of a multibyte char;
                                    inputwcs[i] is the codepoint.  */
-  size_t ninputwcs;             /* Length of the input wide characters
-                                   currently allocated.  */
+  size_t ninputwcs;             /* Allocated number of inputwcs elements.  */
   position_set *mb_follows;     /* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
   int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
@@ -905,6 +903,9 @@ static unsigned char const *buf_end;    /* reference to end 
in dfaexec.  */
 #ifndef MIN
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
+#ifndef MAX
+# define MAX(a,b) ((a) < (b) ? (b) : (a))
+#endif
 
 /* The set of wchar_t values C such that there's a useful locale
    somewhere where C != towupper (C) && C != towlower (towupper (C)).
@@ -2908,8 +2909,8 @@ build_state_zero (struct dfa *d)
   if (s == 0)                                          \
     {                                                  \
       while (d->inputwcs[p - buf_begin] == 0           \
-            && d->mblen_buf[p - buf_begin] > 0         \
-            && (unsigned char const *) p < buf_end)    \
+             && d->mblen_buf[p - buf_begin] != 0       \
+             && (unsigned char const *) p < buf_end)   \
         ++p;                                           \
       if ((char *) p >= end)                           \
         {                                              \
@@ -3007,7 +3008,7 @@ match_anychar (struct dfa *d, state_num s, position pos, 
size_t idx)
   int mbclen;
 
   wc = d->inputwcs[idx];
-  mbclen = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
+  mbclen = MAX (1, d->mblen_buf[idx]);
 
   /* Check syntax bits.  */
   if (wc == (wchar_t) eolbyte)
@@ -3069,7 +3070,7 @@ match_mb_charset (struct dfa *d, state_num s, position 
pos, size_t idx)
   /* Assign the current referring operator to work_mbc.  */
   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
   match = !work_mbc->invert;
-  match_len = (d->mblen_buf[idx] == 0) ? 1 : d->mblen_buf[idx];
+  match_len = MAX (1, d->mblen_buf[idx]);
 
   /* Match in range 0-255?  */
   if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3187,7 +3188,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
 
   /* Calculate the length of the (single/multi byte) character
      to which p points.  */
-  *mbclen = (d->mblen_buf[*pp - buf_begin] == 0) ? 1 : d->mblen_buf[*pp - 
buf_begin];
+  *mbclen = MAX (1, d->mblen_buf[*pp - buf_begin]);
 
   /* Calculate the state which can be reached from the state 's' by
      consuming '*mbclen' single bytes from the buffer.  */
@@ -3214,7 +3215,7 @@ transit_state_consume_1char (struct dfa *d, state_num s,
         for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
              j++)
           insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
-            d->mb_follows);
+                  d->mb_follows);
     }
 
   /* FIXME: this return value is always ignored.  */
@@ -3519,9 +3520,11 @@ free_mbdata (struct dfa *d)
 
   free (d->mblen_buf);
   free (d->inputwcs);
-  if (d->mb_follows != NULL)
-    free (d->mb_follows->elems);
-  free (d->mb_follows);
+  if (d->mb_follows)
+    {
+      free (d->mb_follows->elems);
+      free (d->mb_follows);
+    }
   free (d->mb_match_lens);
 }
 
-- 
1.9.0

Reply via email to