dfa.h / dfa.c diff versus gawk attached

Aharon Robbins Thu, 06 Sep 2007 07:55:23 -0700

Greetings.

Attached is a diff of the grep 2.5.3 dfa.h and dfa.c against the current
version of same in the gawk CVS. (Or, it'll be in CVS within an hour or
so. :-)


The changes fall into two categories: bug fixes, mostly having to do
with multibyte character sets, and reviving the DFA matcher's ability
to match across newlines, which grep doesn't need but which gawk does.
This latter changes the interface to dfaexec.

I believe that the grep developers have had most of these changes in the
pipeline for a while, but I thought it wouldn't hurt to submit a fresh
set of diffs.

One new thing is that I have added the ability to let the caller of
the dfa routines know that the matcher is broken in certain cases. The
only case I know of at the moment is

        (foo){0}
        (foo){0,0}

which the DFA matcher treats as (foo){1} whereas regex correctly does
not match "foo".  This is a problem in the DFA parsing as it builds the
parse tree that represents the DFA ... I could not see how to work
around it there, or anywhere else in the code. (Fixes welcome!)

It remains my hope that "one day" the grep distribution will return
to being the canonical source for dfa.h and dfa.c, and that I can
synchronize from it (as I do with GLIBC, for example) rather than the
other way around.

Thanks,

Arnold
-- 
Aharon (Arnold) Robbins                                 arnold AT skeeve DOT com
P.O. Box 354            Home Phone: +972  8 979-0381    Fax: +1 206 350 8765
Nof Ayalon              Cell Phone: +972 50  729-7545
D.N. Shimshon 99785     ISRAEL

--- dfa.h.grep  2007-08-21 17:36:26.000000000 +0300
+++ dfa.h       2007-08-21 17:49:43.000000000 +0300
@@ -1,5 +1,5 @@
 /* dfa.h - declarations for GNU deterministic regexp compiler
-   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
+   Copyright (C) 1988, 1998, 2002, 2004 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -364,9 +364,20 @@
                                   on a state that potentially could do so. */
   int *success;                        /* Table of acceptance conditions used 
in
                                   dfaexec and computed in build_state. */
+  int *newlines;               /* Transitions on newlines.  The entry for a
+                                  newline in any transition table is always
+                                  -1 so we can count lines without wasting
+                                  too many cycles.  The transition for a
+                                  newline is stored separately and handled
+                                  as a special case.  Newline is also used
+                                  as a sentinel at the end of the buffer. */
   struct dfamust *musts;       /* List of strings, at least one of which
                                   is known to appear in any r.e. matching
                                   the dfa. */
+#ifdef GAWK
+  int broken;                  /* True if using a feature where there
+                                  are bugs and gawk should use regex. */
+#endif
 };
 
 /* Some macros for user access to dfa internals. */
@@ -398,13 +409,18 @@
 extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
 
 /* Execute the given struct dfa on the buffer of characters.  The
-   last byte of the buffer must equal the end-of-line byte.
-   The final argument points to a flag that will
+   first char * points to the beginning, and the second points to the
+   first character after the end of the buffer, which must be a writable
+   place so a sentinel end-of-buffer marker can be stored there.  The
+   second-to-last argument is a flag telling whether to allow newlines to
+   be part of a string matching the regexp.  The next-to-last argument,
+   if non-NULL, points to a place to increment every time we see a
+   newline.  The final argument, if non-NULL, points to a flag that will
    be set if further examination by a backtracking matcher is needed in
    order to verify backreferencing; otherwise the flag will be cleared.
-   Returns (size_t) -1 if no match is found, or the offset of the first
+   Returns NULL if no match is found, or a pointer to the first
    character after the first & shortest matching string in the buffer. */
-extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *));
+extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *, 
int *));
 
 /* Free the storage held by the components of a struct dfa. */
 extern void dfafree PARAMS ((struct dfa *));
--- dfa.c.grep  2007-08-21 17:36:30.000000000 +0300
+++ dfa.c       2007-08-21 17:49:47.000000000 +0300
@@ -1,5 +1,6 @@
 /* dfa.c - deterministic extended regexp routines for GNU
-   Copyright 1988, 1998, 2000, 2002, 2004 Free Software Foundation, Inc.
+   Copyright 1988, 1998, 2000, 2002, 2004, 2005, 2007
+   Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -95,24 +96,13 @@
    host does not conform to Posix.  */
 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
 
-/* Don't use gettext if ENABLE_NLS is not defined */
-/* If we (don't) have I18N.  */
-/* glibc defines _ */
-#ifdef ENABLE_NLS
-# ifndef _
-#  ifdef HAVE_LIBINTL_H
-#   include <libintl.h>
-#   ifndef _
-#    define _(Str) gettext (Str)
-#   endif
-#  endif
-# endif
-#endif
-#ifndef _
-# define _(Str) (Str)
-#endif
+/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
+#include "gettext.h"
+#define _(str) gettext (str)
 
+#ifndef NO_MBSUPPORT
 #include "mbsupport.h"  /* defines MBS_SUPPORT if appropriate */
+#endif
 #ifdef MBS_SUPPORT
 /* We can handle multibyte strings. */
 # include <wchar.h>
@@ -595,6 +585,9 @@
                {
                  wctype_t wt;
                  /* Query the character class as wctype_t.  */
+                 if (case_fold && (strcmp(str, "upper") == 0 || strcmp(str, 
"lower") == 0))
+                    strcpy(str, "alpha");
+
                  wt = wctype (str);
 
                  if (ch_classes_al == 0)
@@ -681,6 +674,28 @@
          REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                               range_ends_al, work_mbc->nranges + 1);
          work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+         if (case_fold && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
+                         && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2)))
+           {
+               wint_t altcase;
+               altcase = wc;
+               if (iswlower((wint_t)wc))
+                  altcase = towupper((wint_t)wc);
+               else
+                  altcase = towlower((wint_t)wc);
+               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                               range_sts_al, work_mbc->nranges + 1);
+               work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
+
+               altcase = wc2;
+               if (iswlower((wint_t)wc2))
+                  altcase = towupper((wint_t)wc2);
+               else
+                  altcase = towlower((wint_t)wc2);
+               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                               range_ends_al, work_mbc->nranges + 1);
+               work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
+          }
        }
       else if (wc != WEOF)
        /* build normal characters.  */
@@ -688,6 +703,13 @@
          REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                               work_mbc->nchars + 1);
          work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
+         if (case_fold && (iswlower(wc) || iswupper(wc)))
+           {
+             REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+                                  work_mbc->nchars + 1);
+             work_mbc->chars[work_mbc->nchars++] =
+               (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
+           }
        }
     }
   while ((wc = wc1) != L']');
@@ -962,6 +984,9 @@
          if (c != '}')
            dfaerror(_("malformed repeat count"));
          laststart = 0;
+#ifdef GAWK
+         dfa->broken = (minrep == maxrep && minrep == 0);
+#endif
          return lasttok = REPMN;
 
        case '|':
@@ -1017,6 +1042,21 @@
          laststart = 0;
          return lasttok = CSET + charclass_index(ccl);
 
+#ifndef GAWK
+       case 's':
+       case 'S':
+         if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+           goto normal_char;
+         zeroset(ccl);
+         for (c2 = 0; c2 < NOTCHAR; ++c2)
+           if (ISSPACE(c2))
+             setbit(c2, ccl);
+         if (c == 'S')
+           notset(ccl);
+         laststart = 0;
+         return lasttok = CSET + charclass_index(ccl);
+#endif
+
        case 'w':
        case 'W':
          if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
@@ -1338,7 +1378,14 @@
   int i;
 
   for (i = 0; i < ntokens; ++i)
-    addtok(dfa->tokens[tindex + i]);
+    {
+      addtok(dfa->tokens[tindex + i]);
+#ifdef MBS_SUPPORT
+      /* Update index into multibyte csets.  */
+      if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
+       dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
+#endif
+    }
 }
 
 static void
@@ -1567,8 +1614,8 @@
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.elems = NULL;
 #endif
   for (j = 0; j < s->nelem; ++j)
     if (d->tokens[s->elems[j].index] < 0)
@@ -2335,6 +2382,7 @@
        d->trans = d->realtrans + 1;
        REALLOC(d->fails, int *, d->tralloc);
        REALLOC(d->success, int, d->tralloc);
+       REALLOC(d->newlines, int, d->tralloc);
        while (oldalloc < d->tralloc)
          {
            d->trans[oldalloc] = NULL;
@@ -2342,7 +2390,9 @@
          }
       }
 
-  /* Newline is a sentinel.  */
+  /* Keep the newline transition in a special place so we can use it as
+     a sentinel. */
+  d->newlines[s] = trans[eolbyte];
   trans[eolbyte] = -1;
 
   if (ACCEPTING(s, *d))
@@ -2360,6 +2410,7 @@
   d->trans = d->realtrans + 1;
   CALLOC(d->fails, int *, d->tralloc);
   MALLOC(d->success, int, d->tralloc);
+  MALLOC(d->newlines, int, d->tralloc);
   build_state(0, d);
 }
 
@@ -2378,13 +2429,13 @@
     {                                                  \
       while (inputwcs[p - buf_begin] == 0              \
             && mblen_buf[p - buf_begin] > 0            \
-           && p < buf_end)                             \
+            && (unsigned char const *)p < buf_end)     \
         ++p;                                           \
-      if (p >= end)                                    \
+      if ((char *)p >= end)                            \
        {                                               \
           free(mblen_buf);                             \
           free(inputwcs);                              \
-         return (size_t) -1;                           \
+         return NULL;                                  \
        }                                               \
     }
 
@@ -2403,6 +2454,7 @@
       d->trans = d->realtrans + 1;
       REALLOC(d->fails, int *, d->tralloc);
       REALLOC(d->success, int, d->tralloc);
+      REALLOC(d->newlines, int, d->tralloc);
       while (oldalloc < d->tralloc)
        {
          d->trans[oldalloc] = NULL;
@@ -2791,19 +2843,23 @@
 
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the buffer,
-   and the shortest possible version thereof.  Return the offset of the first
-   character after the match, or (size_t) -1 if none is found.  BEGIN points to
-   the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
-   is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
+   and the shortest possible version thereof.  Return a pointer to the first
+   character after the match, or NULL if none is found.  Begin points to
+   the beginning of the buffer, and end points to the first character after
+   its end.  We store a newline in *end to act as a sentinel, so end had
+   better point somewhere valid.  Newline is a flag indicating whether to
+   allow newlines to be in the matching string.  If count is non-
+   NULL it points to a place we're supposed to increment every time we
+   see a newline.  Finally, if backref is non-NULL it points to a place
    where we're supposed to store a 1 if backreferencing happened and the
    match needs to be verified by a backtracking matcher.  Otherwise
    we store a 0 in *backref. */
-size_t
-dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
+char *
+dfaexec (struct dfa *d, char const *begin, char *end,
+        int newline, int *count, int *backref)
 {
-  register int s;      /* Current state. */
+  register int s, s1, tmp;     /* Current state. */
   register unsigned char const *p; /* Current input character. */
-  register unsigned char const *end; /* One past the last input character.  */
   register int **trans, *t;    /* Copy of d->trans so it can be optimized
                                   into a register. */
   register unsigned char eol = eolbyte;        /* Likewise for eolbyte.  */
@@ -2823,10 +2879,10 @@
   if (! d->tralloc)
     build_state_zero(d);
 
-  s = 0;
+  s = s1 = 0;
   p = (unsigned char const *) begin;
-  end = p + size;
   trans = d->trans;
+  *end = eol;
 
 #ifdef MBS_SUPPORT
   if (MB_CUR_MAX > 1)
@@ -2836,18 +2892,18 @@
       buf_end = end;
 
       /* initialize mblen_buf, and inputwcs.  */
-      MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
-      MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
+      MALLOC(mblen_buf, unsigned char, end - begin + 2);
+      MALLOC(inputwcs, wchar_t, end - begin + 2);
       memset(&mbs, 0, sizeof(mbstate_t));
       remain_bytes = 0;
-      for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
+      for (i = 0; i < end - begin + 1; i++)
        {
          if (remain_bytes == 0)
            {
              remain_bytes
-               = mbrtowc(inputwcs + i, begin + i,
-                         end - (unsigned char const *)begin - i + 1, &mbs);
-             if (remain_bytes <= 1)
+               = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+             if (remain_bytes < 1
+                 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
                {
                  remain_bytes = 0;
                  inputwcs[i] = (wchar_t)begin[i];
@@ -2877,6 +2933,9 @@
       if (MB_CUR_MAX > 1)
        while ((t = trans[s]))
          {
+           if ((char *) p > end)
+             break;
+           s1 = s;
            if (d->states[s].mbps.nelem != 0)
              {
                /* Can match with a multibyte character (and multi character
@@ -2887,7 +2946,7 @@
 
                nextp = p;
                s = transit_state(d, s, &nextp);
-               p = nextp;
+               p = (unsigned char *)nextp;
 
                /* Trans table might be updated.  */
                trans = d->trans;
@@ -2900,25 +2959,16 @@
          }
       else
 #endif /* MBS_SUPPORT */
-        while ((t = trans[s]))
-         s = t[*p++];
-
-      if (s < 0)
-       {
-         if (p == end)
-           {
-#ifdef MBS_SUPPORT
-             if (MB_CUR_MAX > 1)
-               {
-                 free(mblen_buf);
-                 free(inputwcs);
-               }
-#endif /* MBS_SUPPORT */
-             return (size_t) -1;
-           }
-         s = 0;
+      while ((t = trans[s]) != 0) { /* hand-optimized loop */
+       s1 = t[*p++];
+       if ((t = trans[s1]) == 0) {
+         tmp = s ; s = s1 ; s1 = tmp ; /* swap */
+         break;
        }
-      else if ((t = d->fails[s]))
+       s = t[*p++];
+      }
+
+      if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
        {
          if (d->success[s] & sbit[*p])
            {
@@ -2931,37 +2981,58 @@
                  free(inputwcs);
                }
 #endif /* MBS_SUPPORT */
-             return (char const *) p - begin;
+             return (char *) p;
            }
 
+         s1 = s;
 #ifdef MBS_SUPPORT
          if (MB_CUR_MAX > 1)
            {
-               SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
-               if (d->states[s].mbps.nelem != 0)
-                 {
-                   /* Can match with a multibyte character (and multi
-                      character collating element).  */
                    unsigned char const *nextp;
                    nextp = p;
                    s = transit_state(d, s, &nextp);
-                   p = nextp;
+                   p = (unsigned char *)nextp;
 
                    /* Trans table might be updated.  */
                    trans = d->trans;
-                 }
-               else
-               s = t[*p++];
            }
          else
 #endif /* MBS_SUPPORT */
-         s = t[*p++];
+         s = d->fails[s][*p++];
+         continue;
        }
-      else
+
+      /* If the previous character was a newline, count it. */
+      if (count && (char *) p <= end && p[-1] == eol)
+       ++*count;
+
+      /* Check if we've run off the end of the buffer. */
+      if ((char *) p > end)
+       {
+#ifdef MBS_SUPPORT
+         if (MB_CUR_MAX > 1)
+           {
+             free(mblen_buf);
+             free(inputwcs);
+           }
+#endif /* MBS_SUPPORT */
+         return NULL;
+       }
+
+      if (s >= 0)
        {
          build_state(s, d);
          trans = d->trans;
+         continue;
        }
+
+      if (p[-1] == eol && newline)
+       {
+         s = d->newlines[s1];
+         continue;
+       }
+
+      s = 0;
     }
 }
 
@@ -2992,13 +3063,20 @@
   d->tralloc = 0;
 
   d->musts = 0;
+  d->realtrans = 0;
+  d->fails = 0;
+  d->newlines = 0;
+  d->success = 0;
+#ifdef GAWK
+  d->broken = 0;
+#endif
 }
 
 /* Parse and analyze a single string of the given length. */
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
-  if (case_fold)       /* dummy folding in service of dfamust() */
+  if (case_fold && len)        /* dummy folding in service of dfamust() */
     {
       char *lcopy;
       int i;
@@ -3074,8 +3152,13 @@
     }
 #endif /* MBS_SUPPORT */
 
-  for (i = 0; i < d->sindex; ++i)
+  for (i = 0; i < d->sindex; ++i) {
     free((ptr_t) d->states[i].elems.elems);
+#ifdef MBS_SUPPORT
+    if (d->states[i].mbps.nelem > 0)
+      free((ptr_t) d->states[i].mbps.elems);
+#endif /* MBS_SUPPORT */
+  }
   free((ptr_t) d->states);
   for (i = 0; i < d->tindex; ++i)
     if (d->follows[i].elems)
@@ -3088,6 +3171,7 @@
       free((ptr_t) d->fails[i]);
   if (d->realtrans) free((ptr_t) d->realtrans);
   if (d->fails) free((ptr_t) d->fails);
+  if (d->newlines) free((ptr_t) d->newlines);
   if (d->success) free((ptr_t) d->success);
   for (dm = d->musts; dm; dm = ndm)
     {

dfa.h / dfa.c diff versus gawk attached

Reply via email to