Greetings. Per my earlier email, attached is a rough cut at splitting the
gawk fixes to dfa into two patches. (I simply created two copies of the diff
and edited them manually.) APPLY THESE BY HAND; I am sure they will not go
through patch.
The first is multibyte bug fixes and the other is to restore matching newlines.
I hope that I didn't break anything by splitting these up.
Be sure that, if both get applied, you compare the final product to what's
in the gawk CVS and resolve any differences.
As mentioned, the dfa->broken fixes do not need to stay inside #ifdef GAWK.
Thanks,
Arnold
--
Aharon (Arnold) Robbins arnold AT skeeve DOT com
P.O. Box 354 Home Phone: +972 8 979-0381 Fax: +1 206 202 4333
Nof Ayalon Cell Phone: +972 50 729-7545
D.N. Shimshon 99785 ISRAEL
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.h 2007-06-28 21:57:19.000000000 +0300
+++ dfa.h 2007-09-03 06:30:12.000000000 +0300
@@ -364,9 +364,4 @@
on a state that potentially could do so. */
int *success; /* Table of acceptance conditions used
in
dfaexec and computed in build_state. */
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
+#ifdef GAWK
+ int broken; /* True if using a feature where there
+ are bugs and gawk should use regex. */
+#endif
};
/* Some macros for user access to dfa internals. */
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.c 2007-06-28 21:57:19.000000000 +0300
+++ dfa.c 2007-09-03 06:30:12.000000000 +0300
@@ -95,24 +96,13 @@
host does not conform to Posix. */
#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
-/* Don't use gettext if ENABLE_NLS is not defined */
-/* If we (don't) have I18N. */
-/* glibc defines _ */
-#ifdef ENABLE_NLS
-# ifndef _
-# ifdef HAVE_LIBINTL_H
-# include <libintl.h>
-# ifndef _
-# define _(Str) gettext (Str)
-# endif
-# endif
-# endif
-#endif
-#ifndef _
-# define _(Str) (Str)
-#endif
+/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
+#include "gettext.h"
+#define _(str) gettext (str)
+#ifndef NO_MBSUPPORT
#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
+#endif
#ifdef MBS_SUPPORT
/* We can handle multibyte strings. */
# include <wchar.h>
@@ -595,6 +585,9 @@
{
wctype_t wt;
/* Query the character class as wctype_t. */
+ if (case_fold && (strcmp(str, "upper") == 0 || strcmp(str,
"lower") == 0))
+ strcpy(str, "alpha");
+
wt = wctype (str);
if (ch_classes_al == 0)
@@ -681,6 +674,28 @@
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
range_ends_al, work_mbc->nranges + 1);
work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+ if (case_fold && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
+ && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2)))
+ {
+ wint_t altcase;
+ altcase = wc;
+ if (iswlower((wint_t)wc))
+ altcase = towupper((wint_t)wc);
+ else
+ altcase = towlower((wint_t)wc);
+ REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+ range_sts_al, work_mbc->nranges + 1);
+ work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
+
+ altcase = wc2;
+ if (iswlower((wint_t)wc2))
+ altcase = towupper((wint_t)wc2);
+ else
+ altcase = towlower((wint_t)wc2);
+ REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ range_ends_al, work_mbc->nranges + 1);
+ work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
+ }
}
else if (wc != WEOF)
/* build normal characters. */
@@ -688,6 +703,13 @@
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
+ if (case_fold && (iswlower(wc) || iswupper(wc)))
+ {
+ REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] =
+ (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
+ }
}
}
while ((wc = wc1) != L']');
@@ -962,6 +984,9 @@
if (c != '}')
dfaerror(_("malformed repeat count"));
laststart = 0;
+#ifdef GAWK
+ dfa->broken = (minrep == maxrep && minrep == 0);
+#endif
return lasttok = REPMN;
case '|':
@@ -1017,6 +1042,21 @@
laststart = 0;
return lasttok = CSET + charclass_index(ccl);
+#ifndef GAWK
+ case 's':
+ case 'S':
+ if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+ goto normal_char;
+ zeroset(ccl);
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if (ISSPACE(c2))
+ setbit(c2, ccl);
+ if (c == 'S')
+ notset(ccl);
+ laststart = 0;
+ return lasttok = CSET + charclass_index(ccl);
+#endif
+
case 'w':
case 'W':
if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
@@ -1338,7 +1378,14 @@
int i;
for (i = 0; i < ntokens; ++i)
- addtok(dfa->tokens[tindex + i]);
+ {
+ addtok(dfa->tokens[tindex + i]);
+#ifdef MBS_SUPPORT
+ /* Update index into multibyte csets. */
+ if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
+ dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
+#endif
+ }
}
static void
@@ -1567,8 +1614,8 @@
d->states[i].constraint = 0;
d->states[i].first_end = 0;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.nelem = 0;
+ d->states[i].mbps.elems = NULL;
#endif
for (j = 0; j < s->nelem; ++j)
if (d->tokens[s->elems[j].index] < 0)
@@ -2335,6 +2382,7 @@
d->trans = d->realtrans + 1;
REALLOC(d->fails, int *, d->tralloc);
REALLOC(d->success, int, d->tralloc);
+ REALLOC(d->newlines, int, d->tralloc);
while (oldalloc < d->tralloc)
{
d->trans[oldalloc] = NULL;
@@ -2992,13 +3063,19 @@
d->tralloc = 0;
d->musts = 0;
+ d->realtrans = 0;
+ d->fails = 0;
+ d->success = 0;
+#ifdef GAWK
+ d->broken = 0;
+#endif
}
/* Parse and analyze a single string of the given length. */
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
- if (case_fold) /* dummy folding in service of dfamust() */
+ if (case_fold && len) /* dummy folding in service of dfamust() */
{
char *lcopy;
int i;
@@ -3074,8 +3152,13 @@
}
#endif /* MBS_SUPPORT */
- for (i = 0; i < d->sindex; ++i)
+ for (i = 0; i < d->sindex; ++i) {
free((ptr_t) d->states[i].elems.elems);
+#ifdef MBS_SUPPORT
+ if (d->states[i].mbps.nelem > 0)
+ free((ptr_t) d->states[i].mbps.elems);
+#endif /* MBS_SUPPORT */
+ }
free((ptr_t) d->states);
for (i = 0; i < d->tindex; ++i)
if (d->follows[i].elems)
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.h 2007-06-28 21:57:19.000000000 +0300
+++ dfa.h 2007-09-03 06:30:12.000000000 +0300
@@ -364,9 +364,16 @@
on a state that potentially could do so. */
int *success; /* Table of acceptance conditions used
in
dfaexec and computed in build_state. */
+ int *newlines; /* Transitions on newlines. The entry for a
+ newline in any transition table is always
+ -1 so we can count lines without wasting
+ too many cycles. The transition for a
+ newline is stored separately and handled
+ as a special case. Newline is also used
+ as a sentinel at the end of the buffer. */
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
};
/* Some macros for user access to dfa internals. */
@@ -398,13 +409,18 @@
extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
/* Execute the given struct dfa on the buffer of characters. The
- last byte of the buffer must equal the end-of-line byte.
- The final argument points to a flag that will
+ first char * points to the beginning, and the second points to the
+ first character after the end of the buffer, which must be a writable
+ place so a sentinel end-of-buffer marker can be stored there. The
+ second-to-last argument is a flag telling whether to allow newlines to
+ be part of a string matching the regexp. The next-to-last argument,
+ if non-NULL, points to a place to increment every time we see a
+ newline. The final argument, if non-NULL, points to a flag that will
be set if further examination by a backtracking matcher is needed in
order to verify backreferencing; otherwise the flag will be cleared.
- Returns (size_t) -1 if no match is found, or the offset of the first
+ Returns NULL if no match is found, or a pointer to the first
character after the first & shortest matching string in the buffer. */
-extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *));
+extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *,
int *));
/* Free the storage held by the components of a struct dfa. */
extern void dfafree PARAMS ((struct dfa *));
--- /usr/local/src/Gnu/grep-2.5.3/src/dfa.c 2007-06-28 21:57:19.000000000 +0300
+++ dfa.c 2007-09-03 06:30:12.000000000 +0300
@@ -2335,6 +2382,7 @@
d->trans = d->realtrans + 1;
REALLOC(d->fails, int *, d->tralloc);
REALLOC(d->success, int, d->tralloc);
+ REALLOC(d->newlines, int, d->tralloc);
while (oldalloc < d->tralloc)
{
d->trans[oldalloc] = NULL;
@@ -2342,7 +2390,9 @@
}
}
- /* Newline is a sentinel. */
+ /* Keep the newline transition in a special place so we can use it as
+ a sentinel. */
+ d->newlines[s] = trans[eolbyte];
trans[eolbyte] = -1;
if (ACCEPTING(s, *d))
@@ -2360,6 +2410,7 @@
d->trans = d->realtrans + 1;
CALLOC(d->fails, int *, d->tralloc);
MALLOC(d->success, int, d->tralloc);
+ MALLOC(d->newlines, int, d->tralloc);
build_state(0, d);
}
@@ -2378,13 +2429,13 @@
{ \
while (inputwcs[p - buf_begin] == 0 \
&& mblen_buf[p - buf_begin] > 0 \
- && p < buf_end) \
+ && (unsigned char const *)p < buf_end) \
++p; \
- if (p >= end) \
+ if ((char *)p >= end) \
{ \
free(mblen_buf); \
free(inputwcs); \
- return (size_t) -1; \
+ return NULL; \
} \
}
@@ -2403,6 +2454,7 @@
d->trans = d->realtrans + 1;
REALLOC(d->fails, int *, d->tralloc);
REALLOC(d->success, int, d->tralloc);
+ REALLOC(d->newlines, int, d->tralloc);
while (oldalloc < d->tralloc)
{
d->trans[oldalloc] = NULL;
@@ -2791,19 +2843,23 @@
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the buffer,
- and the shortest possible version thereof. Return the offset of the first
- character after the match, or (size_t) -1 if none is found. BEGIN points to
- the beginning of the buffer, and SIZE is the size of the buffer. If SIZE
- is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place
+ and the shortest possible version thereof. Return a pointer to the first
+ character after the match, or NULL if none is found. Begin points to
+ the beginning of the buffer, and end points to the first character after
+ its end. We store a newline in *end to act as a sentinel, so end had
+ better point somewhere valid. Newline is a flag indicating whether to
+ allow newlines to be in the matching string. If count is non-
+ NULL it points to a place we're supposed to increment every time we
+ see a newline. Finally, if backref is non-NULL it points to a place
where we're supposed to store a 1 if backreferencing happened and the
match needs to be verified by a backtracking matcher. Otherwise
we store a 0 in *backref. */
-size_t
-dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
+char *
+dfaexec (struct dfa *d, char const *begin, char *end,
+ int newline, int *count, int *backref)
{
- register int s; /* Current state. */
+ register int s, s1, tmp; /* Current state. */
register unsigned char const *p; /* Current input character. */
- register unsigned char const *end; /* One past the last input character. */
register int **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
@@ -2823,10 +2879,10 @@
if (! d->tralloc)
build_state_zero(d);
- s = 0;
+ s = s1 = 0;
p = (unsigned char const *) begin;
- end = p + size;
trans = d->trans;
+ *end = eol;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
@@ -2836,18 +2892,18 @@
buf_end = end;
/* initialize mblen_buf, and inputwcs. */
- MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
- MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
+ MALLOC(mblen_buf, unsigned char, end - begin + 2);
+ MALLOC(inputwcs, wchar_t, end - begin + 2);
memset(&mbs, 0, sizeof(mbstate_t));
remain_bytes = 0;
- for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
+ for (i = 0; i < end - begin + 1; i++)
{
if (remain_bytes == 0)
{
remain_bytes
- = mbrtowc(inputwcs + i, begin + i,
- end - (unsigned char const *)begin - i + 1, &mbs);
- if (remain_bytes <= 1)
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+ if (remain_bytes < 1
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
{
remain_bytes = 0;
inputwcs[i] = (wchar_t)begin[i];
@@ -2877,6 +2933,9 @@
if (MB_CUR_MAX > 1)
while ((t = trans[s]))
{
+ if ((char *) p > end)
+ break;
+ s1 = s;
if (d->states[s].mbps.nelem != 0)
{
/* Can match with a multibyte character (and multi character
@@ -2887,7 +2946,7 @@
nextp = p;
s = transit_state(d, s, &nextp);
- p = nextp;
+ p = (unsigned char *)nextp;
/* Trans table might be updated. */
trans = d->trans;
@@ -2900,25 +2959,16 @@
}
else
#endif /* MBS_SUPPORT */
- while ((t = trans[s]))
- s = t[*p++];
-
- if (s < 0)
- {
- if (p == end)
- {
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- free(mblen_buf);
- free(inputwcs);
- }
-#endif /* MBS_SUPPORT */
- return (size_t) -1;
- }
- s = 0;
+ while ((t = trans[s]) != 0) { /* hand-optimized loop */
+ s1 = t[*p++];
+ if ((t = trans[s1]) == 0) {
+ tmp = s ; s = s1 ; s1 = tmp ; /* swap */
+ break;
}
- else if ((t = d->fails[s]))
+ s = t[*p++];
+ }
+
+ if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
{
if (d->success[s] & sbit[*p])
{
@@ -2931,37 +2981,58 @@
free(inputwcs);
}
#endif /* MBS_SUPPORT */
- return (char const *) p - begin;
+ return (char *) p;
}
+ s1 = s;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
- SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
- if (d->states[s].mbps.nelem != 0)
- {
- /* Can match with a multibyte character (and multi
- character collating element). */
unsigned char const *nextp;
nextp = p;
s = transit_state(d, s, &nextp);
- p = nextp;
+ p = (unsigned char *)nextp;
/* Trans table might be updated. */
trans = d->trans;
- }
- else
- s = t[*p++];
}
else
#endif /* MBS_SUPPORT */
- s = t[*p++];
+ s = d->fails[s][*p++];
+ continue;
}
- else
+
+ /* If the previous character was a newline, count it. */
+ if (count && (char *) p <= end && p[-1] == eol)
+ ++*count;
+
+ /* Check if we've run off the end of the buffer. */
+ if ((char *) p > end)
+ {
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ free(mblen_buf);
+ free(inputwcs);
+ }
+#endif /* MBS_SUPPORT */
+ return NULL;
+ }
+
+ if (s >= 0)
{
build_state(s, d);
trans = d->trans;
+ continue;
}
+
+ if (p[-1] == eol && newline)
+ {
+ s = d->newlines[s1];
+ continue;
+ }
+
+ s = 0;
}
}
@@ -2992,13 +3063,14 @@
d->tralloc = 0;
d->musts = 0;
+ d->newlines = 0;
}
/* Parse and analyze a single string of the given length. */
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
if (case_fold) /* dummy folding in service of dfamust() */
{
char *lcopy;
int i;
@@ -3088,6 +3171,7 @@
free((ptr_t) d->fails[i]);
if (d->realtrans) free((ptr_t) d->realtrans);
if (d->fails) free((ptr_t) d->fails);
+ if (d->newlines) free((ptr_t) d->newlines);
if (d->success) free((ptr_t) d->success);
for (dm = d->musts; dm; dm = ndm)
{