Greetings. Per my earlier email, attached is a rough cut at splitting the gawk
fixes to dfa into two patches. (I simply created two copies of the diff and
edited them manually.)  APPLY THESE BY HAND; I am sure they will not go
through patch.

The first patch contains the multibyte bug fixes; the second restores the
matching of newlines.  I hope that I didn't break anything by splitting these up.

If both get applied, be sure to compare the final product to what's
in the gawk CVS and resolve any differences.

As mentioned, the dfa->broken fixes do not need to stay inside #ifdef GAWK.

Thanks,

Arnold
-- 
Aharon (Arnold) Robbins                                 arnold AT skeeve DOT com
P.O. Box 354            Home Phone: +972  8 979-0381    Fax: +1 206 202 4333
Nof Ayalon              Cell Phone: +972 50  729-7545
D.N. Shimshon 99785     ISRAEL
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.h     2007-06-28 21:57:19.000000000 +0300
+++ dfa.h       2007-09-03 06:30:12.000000000 +0300
@@ -364,9 +364,4 @@
                                   on a state that potentially could do so. */
   int *success;                        /* Table of acceptance conditions used in
                                   dfaexec and computed in build_state. */
   struct dfamust *musts;       /* List of strings, at least one of which
                                   is known to appear in any r.e. matching
                                   the dfa. */
+#ifdef GAWK
+  int broken;                  /* True if using a feature where there
+                                  are bugs and gawk should use regex. */
+#endif
 };
 
 /* Some macros for user access to dfa internals. */
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.c     2007-06-28 21:57:19.000000000 +0300
+++ dfa.c       2007-09-03 06:30:12.000000000 +0300
@@ -95,24 +96,13 @@
    host does not conform to Posix.  */
 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
 
-/* Don't use gettext if ENABLE_NLS is not defined */
-/* If we (don't) have I18N.  */
-/* glibc defines _ */
-#ifdef ENABLE_NLS
-# ifndef _
-#  ifdef HAVE_LIBINTL_H
-#   include <libintl.h>
-#   ifndef _
-#    define _(Str) gettext (Str)
-#   endif
-#  endif
-# endif
-#endif
-#ifndef _
-# define _(Str) (Str)
-#endif
+/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
+#include "gettext.h"
+#define _(str) gettext (str)
 
+#ifndef NO_MBSUPPORT
 #include "mbsupport.h"  /* defines MBS_SUPPORT if appropriate */
+#endif
 #ifdef MBS_SUPPORT
 /* We can handle multibyte strings. */
 # include <wchar.h>
@@ -595,6 +585,9 @@
                {
                  wctype_t wt;
                  /* Query the character class as wctype_t.  */
+                 if (case_fold && (strcmp(str, "upper") == 0 || strcmp(str, "lower") == 0))
+                    strcpy(str, "alpha");
+
                  wt = wctype (str);
 
                  if (ch_classes_al == 0)
@@ -681,6 +674,28 @@
          REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                               range_ends_al, work_mbc->nranges + 1);
          work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+         if (case_fold && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
+                         && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2)))
+           {
+               wint_t altcase;
+               altcase = wc;
+               if (iswlower((wint_t)wc))
+                  altcase = towupper((wint_t)wc);
+               else
+                  altcase = towlower((wint_t)wc);
+               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                               range_sts_al, work_mbc->nranges + 1);
+               work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
+
+               altcase = wc2;
+               if (iswlower((wint_t)wc2))
+                  altcase = towupper((wint_t)wc2);
+               else
+                  altcase = towlower((wint_t)wc2);
+               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                               range_ends_al, work_mbc->nranges + 1);
+               work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
+          }
        }
       else if (wc != WEOF)
        /* build normal characters.  */
@@ -688,6 +703,13 @@
          REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                               work_mbc->nchars + 1);
          work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
+         if (case_fold && (iswlower(wc) || iswupper(wc)))
+           {
+             REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+                                  work_mbc->nchars + 1);
+             work_mbc->chars[work_mbc->nchars++] =
+               (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
+           }
        }
     }
   while ((wc = wc1) != L']');
@@ -962,6 +984,9 @@
          if (c != '}')
            dfaerror(_("malformed repeat count"));
          laststart = 0;
+#ifdef GAWK
+         dfa->broken = (minrep == maxrep && minrep == 0);
+#endif
          return lasttok = REPMN;
 
        case '|':
@@ -1017,6 +1042,21 @@
          laststart = 0;
          return lasttok = CSET + charclass_index(ccl);
 
+#ifndef GAWK
+       case 's':
+       case 'S':
+         if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+           goto normal_char;
+         zeroset(ccl);
+         for (c2 = 0; c2 < NOTCHAR; ++c2)
+           if (ISSPACE(c2))
+             setbit(c2, ccl);
+         if (c == 'S')
+           notset(ccl);
+         laststart = 0;
+         return lasttok = CSET + charclass_index(ccl);
+#endif
+
        case 'w':
        case 'W':
          if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
@@ -1338,7 +1378,14 @@
   int i;
 
   for (i = 0; i < ntokens; ++i)
-    addtok(dfa->tokens[tindex + i]);
+    {
+      addtok(dfa->tokens[tindex + i]);
+#ifdef MBS_SUPPORT
+      /* Update index into multibyte csets.  */
+      if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
+       dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
+#endif
+    }
 }
 
 static void
@@ -1567,8 +1614,8 @@
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.elems = NULL;
 #endif
   for (j = 0; j < s->nelem; ++j)
     if (d->tokens[s->elems[j].index] < 0)
@@ -2335,6 +2382,7 @@
        d->trans = d->realtrans + 1;
        REALLOC(d->fails, int *, d->tralloc);
        REALLOC(d->success, int, d->tralloc);
+       REALLOC(d->newlines, int, d->tralloc);
        while (oldalloc < d->tralloc)
          {
            d->trans[oldalloc] = NULL;
@@ -2992,13 +3063,19 @@
   d->tralloc = 0;
 
   d->musts = 0;
+  d->realtrans = 0;
+  d->fails = 0;
+  d->success = 0;
+#ifdef GAWK
+  d->broken = 0;
+#endif
 }
 
 /* Parse and analyze a single string of the given length. */
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
-  if (case_fold)       /* dummy folding in service of dfamust() */
+  if (case_fold && len)        /* dummy folding in service of dfamust() */
     {
       char *lcopy;
       int i;
@@ -3074,8 +3152,13 @@
     }
 #endif /* MBS_SUPPORT */
 
-  for (i = 0; i < d->sindex; ++i)
+  for (i = 0; i < d->sindex; ++i) {
     free((ptr_t) d->states[i].elems.elems);
+#ifdef MBS_SUPPORT
+    if (d->states[i].mbps.nelem > 0)
+      free((ptr_t) d->states[i].mbps.elems);
+#endif /* MBS_SUPPORT */
+  }
   free((ptr_t) d->states);
   for (i = 0; i < d->tindex; ++i)
     if (d->follows[i].elems)
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.h     2007-06-28 21:57:19.000000000 +0300
+++ dfa.h       2007-09-03 06:30:12.000000000 +0300
@@ -364,9 +364,16 @@
                                   on a state that potentially could do so. */
   int *success;                        /* Table of acceptance conditions used in
                                   dfaexec and computed in build_state. */
+  int *newlines;               /* Transitions on newlines.  The entry for a
+                                  newline in any transition table is always
+                                  -1 so we can count lines without wasting
+                                  too many cycles.  The transition for a
+                                  newline is stored separately and handled
+                                  as a special case.  Newline is also used
+                                  as a sentinel at the end of the buffer. */
   struct dfamust *musts;       /* List of strings, at least one of which
                                   is known to appear in any r.e. matching
                                   the dfa. */
 };
 
 /* Some macros for user access to dfa internals. */
@@ -398,13 +409,18 @@
 extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
 
 /* Execute the given struct dfa on the buffer of characters.  The
-   last byte of the buffer must equal the end-of-line byte.
-   The final argument points to a flag that will
+   first char * points to the beginning, and the second points to the
+   first character after the end of the buffer, which must be a writable
+   place so a sentinel end-of-buffer marker can be stored there.  The
+   third-to-last argument is a flag telling whether to allow newlines to
+   be part of a string matching the regexp.  The next-to-last argument,
+   if non-NULL, points to a place to increment every time we see a
+   newline.  The final argument, if non-NULL, points to a flag that will
    be set if further examination by a backtracking matcher is needed in
    order to verify backreferencing; otherwise the flag will be cleared.
-   Returns (size_t) -1 if no match is found, or the offset of the first
+   Returns NULL if no match is found, or a pointer to the first
    character after the first & shortest matching string in the buffer. */
-extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *));
+extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *, int *));
 
 /* Free the storage held by the components of a struct dfa. */
 extern void dfafree PARAMS ((struct dfa *));
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.c     2007-06-28 21:57:19.000000000 +0300
+++ dfa.c       2007-09-03 06:30:12.000000000 +0300
@@ -2335,6 +2382,7 @@
        d->trans = d->realtrans + 1;
        REALLOC(d->fails, int *, d->tralloc);
        REALLOC(d->success, int, d->tralloc);
+       REALLOC(d->newlines, int, d->tralloc);
        while (oldalloc < d->tralloc)
          {
            d->trans[oldalloc] = NULL;
@@ -2342,7 +2390,9 @@
          }
       }
 
-  /* Newline is a sentinel.  */
+  /* Keep the newline transition in a special place so we can use it as
+     a sentinel. */
+  d->newlines[s] = trans[eolbyte];
   trans[eolbyte] = -1;
 
   if (ACCEPTING(s, *d))
@@ -2360,6 +2410,7 @@
   d->trans = d->realtrans + 1;
   CALLOC(d->fails, int *, d->tralloc);
   MALLOC(d->success, int, d->tralloc);
+  MALLOC(d->newlines, int, d->tralloc);
   build_state(0, d);
 }
 
@@ -2378,13 +2429,13 @@
     {                                                  \
       while (inputwcs[p - buf_begin] == 0              \
             && mblen_buf[p - buf_begin] > 0            \
-           && p < buf_end)                             \
+            && (unsigned char const *)p < buf_end)     \
         ++p;                                           \
-      if (p >= end)                                    \
+      if ((char *)p >= end)                            \
        {                                               \
           free(mblen_buf);                             \
           free(inputwcs);                              \
-         return (size_t) -1;                           \
+         return NULL;                                  \
        }                                               \
     }
 
@@ -2403,6 +2454,7 @@
       d->trans = d->realtrans + 1;
       REALLOC(d->fails, int *, d->tralloc);
       REALLOC(d->success, int, d->tralloc);
+      REALLOC(d->newlines, int, d->tralloc);
       while (oldalloc < d->tralloc)
        {
          d->trans[oldalloc] = NULL;
@@ -2791,19 +2843,23 @@
 
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the buffer,
-   and the shortest possible version thereof.  Return the offset of the first
-   character after the match, or (size_t) -1 if none is found.  BEGIN points to
-   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
-   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
+   and the shortest possible version thereof.  Return a pointer to the first
+   character after the match, or NULL if none is found.  Begin points to
+   the beginning of the buffer, and end points to the first character after
+   its end.  We store a newline in *end to act as a sentinel, so end had
+   better point somewhere valid.  Newline is a flag indicating whether to
+   allow newlines to be in the matching string.  If count is non-
+   NULL it points to a place we're supposed to increment every time we
+   see a newline.  Finally, if backref is non-NULL it points to a place
    where we're supposed to store a 1 if backreferencing happened and the
    match needs to be verified by a backtracking matcher.  Otherwise
    we store a 0 in *backref. */
-size_t
-dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
+char *
+dfaexec (struct dfa *d, char const *begin, char *end,
+        int newline, int *count, int *backref)
 {
-  register int s;      /* Current state. */
+  register int s, s1, tmp;     /* Current state. */
   register unsigned char const *p; /* Current input character. */
-  register unsigned char const *end; /* One past the last input character.  */
   register int **trans, *t;    /* Copy of d->trans so it can be optimized
                                   into a register. */
   register unsigned char eol = eolbyte;        /* Likewise for eolbyte.  */
@@ -2823,10 +2879,10 @@
   if (! d->tralloc)
     build_state_zero(d);
 
-  s = 0;
+  s = s1 = 0;
   p = (unsigned char const *) begin;
-  end = p + size;
   trans = d->trans;
+  *end = eol;
 
 #ifdef MBS_SUPPORT
   if (MB_CUR_MAX > 1)
@@ -2836,18 +2892,18 @@
       buf_end = end;
 
       /* initialize mblen_buf, and inputwcs.  */
-      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
-      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
+      MALLOC(mblen_buf, unsigned char, end - begin + 2);
+      MALLOC(inputwcs, wchar_t, end - begin + 2);
       memset(&mbs, 0, sizeof(mbstate_t));
       remain_bytes = 0;
-      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
+      for (i = 0; i < end - begin + 1; i++)
        {
          if (remain_bytes == 0)
            {
              remain_bytes
-               = mbrtowc(inputwcs + i, begin + i,
-                         end - (unsigned char const *)begin - i + 1, &mbs);
-             if (remain_bytes <= 1)
+               = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+             if (remain_bytes < 1
+                 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
                {
                  remain_bytes = 0;
                  inputwcs[i] = (wchar_t)begin[i];
@@ -2877,6 +2933,9 @@
       if (MB_CUR_MAX > 1)
        while ((t = trans[s]))
          {
+           if ((char *) p > end)
+             break;
+           s1 = s;
            if (d->states[s].mbps.nelem != 0)
              {
                /* Can match with a multibyte character (and multi character
@@ -2887,7 +2946,7 @@
 
                nextp = p;
                s = transit_state(d, s, &nextp);
-               p = nextp;
+               p = (unsigned char *)nextp;
 
                /* Trans table might be updated.  */
                trans = d->trans;
@@ -2900,25 +2959,16 @@
          }
       else
 #endif /* MBS_SUPPORT */
-        while ((t = trans[s]))
-         s = t[*p++];
-
-      if (s < 0)
-       {
-         if (p == end)
-           {
-#ifdef MBS_SUPPORT
-             if (MB_CUR_MAX > 1)
-               {
-                 free(mblen_buf);
-                 free(inputwcs);
-               }
-#endif /* MBS_SUPPORT */
-             return (size_t) -1;
-           }
-         s = 0;
+      while ((t = trans[s]) != 0) { /* hand-optimized loop */
+       s1 = t[*p++];
+       if ((t = trans[s1]) == 0) {
+         tmp = s ; s = s1 ; s1 = tmp ; /* swap */
+         break;
        }
-      else if ((t = d->fails[s]))
+       s = t[*p++];
+      }
+
+      if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
        {
          if (d->success[s] & sbit[*p])
            {
@@ -2931,37 +2981,58 @@
                  free(inputwcs);
                }
 #endif /* MBS_SUPPORT */
-             return (char const *) p - begin;
+             return (char *) p;
            }
 
+         s1 = s;
 #ifdef MBS_SUPPORT
          if (MB_CUR_MAX > 1)
            {
-               SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
-               if (d->states[s].mbps.nelem != 0)
-                 {
-                   /* Can match with a multibyte character (and multi
-                      character collating element).  */
                    unsigned char const *nextp;
                    nextp = p;
                    s = transit_state(d, s, &nextp);
-                   p = nextp;
+                   p = (unsigned char *)nextp;
 
                    /* Trans table might be updated.  */
                    trans = d->trans;
-                 }
-               else
-               s = t[*p++];
            }
          else
 #endif /* MBS_SUPPORT */
-         s = t[*p++];
+         s = d->fails[s][*p++];
+         continue;
        }
-      else
+
+      /* If the previous character was a newline, count it. */
+      if (count && (char *) p <= end && p[-1] == eol)
+       ++*count;
+
+      /* Check if we've run off the end of the buffer. */
+      if ((char *) p > end)
+       {
+#ifdef MBS_SUPPORT
+         if (MB_CUR_MAX > 1)
+           {
+             free(mblen_buf);
+             free(inputwcs);
+           }
+#endif /* MBS_SUPPORT */
+         return NULL;
+       }
+
+      if (s >= 0)
        {
          build_state(s, d);
          trans = d->trans;
+         continue;
        }
+
+      if (p[-1] == eol && newline)
+       {
+         s = d->newlines[s1];
+         continue;
+       }
+
+      s = 0;
     }
 }
 
@@ -2992,13 +3063,14 @@
   d->tralloc = 0;
 
   d->musts = 0;
+  d->newlines = 0;
 }
 
 /* Parse and analyze a single string of the given length. */
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
  if (case_fold)        /* dummy folding in service of dfamust() */
     {
       char *lcopy;
       int i;
@@ -3088,6 +3171,7 @@
       free((ptr_t) d->fails[i]);
   if (d->realtrans) free((ptr_t) d->realtrans);
   if (d->fails) free((ptr_t) d->fails);
+  if (d->newlines) free((ptr_t) d->newlines);
   if (d->success) free((ptr_t) d->success);
   for (dm = d->musts; dm; dm = ndm)
     {

Reply via email to