> Here are the one-line summaries: > > dfa: manually merge gawk's dfaexec > dfa: make search.c use the new dfaexec API > tests: add test cases for dfaexec bug > > I've just noticed that part of the 2nd change (dfa.h prototype tweak) > snuck into the first one, so the comment in 2nd isn't right. > I'll move param-name addition to the 2nd, where it belongs.
The body of the commit log for 2/3 was outdated. Here's the same series, but with the above change and a corrected log comment. >From 4b5d6197e98450eb01efb301aabea907a96cb36f Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Tue, 9 Mar 2010 15:31:22 +0100 Subject: [PATCH 1/3] dfa: manually merge gawk's dfaexec * src/dfa.c (dfaexec): Adjust API: return pointer, not offset, and take an "end" pointer parameter, rather than integral "size". Adjust comment accordingly. (build_state): Maintain d->newlines. (copytoks): Update multibyte_prop indices. (SKIP_REMAINS_MB_IF_INITIAL_STATE): Update a cast. Return NULL, rather than (size_t) -1. (realloc_trans_if_necessary): Realloc d->newlines. * src/dfa.h (struct dfa): New member, "newlines". (struct dfa) [GAWK]: New member, "broken". (dfaexec): Update prototype and copy the new comment from dfa.c. --- src/dfa.c | 150 ++++++++++++++++++++++++++++++++++++++++--------------------- src/dfa.h | 23 ++++++++-- 2 files changed, 117 insertions(+), 56 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 780b782..76c6a0e 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -1295,7 +1295,14 @@ copytoks (int tindex, int ntokens) int i; for (i = 0; i < ntokens; ++i) - addtok(dfa->tokens[tindex + i]); + { + addtok(dfa->tokens[tindex + i]); +#ifdef MBS_SUPPORT + /* Update index into multibyte csets. */ + if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET) + dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i]; +#endif + } } static void @@ -2291,6 +2298,7 @@ build_state (int s, struct dfa *d) d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2298,7 +2306,9 @@ build_state (int s, struct dfa *d) } } - /* Newline is a sentinel. */ + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + d->newlines[s] = trans[eolbyte]; trans[eolbyte] = -1; if (ACCEPTING(s, *d)) @@ -2316,6 +2326,7 @@ build_state_zero (struct dfa *d) d->trans = d->realtrans + 1; CALLOC(d->fails, int *, d->tralloc); MALLOC(d->success, int, d->tralloc); + MALLOC(d->newlines, int, d->tralloc); build_state(0, d); } @@ -2336,11 +2347,11 @@ build_state_zero (struct dfa *d) && mblen_buf[p - buf_begin] > 0 \ && p < buf_end) \ ++p; \ - if (p >= end) \ + if ((char const *) p >= end) \ { \ free(mblen_buf); \ free(inputwcs); \ - return (size_t) -1; \ + return NULL; \ } \ } @@ -2359,6 +2370,7 @@ realloc_trans_if_necessary(struct dfa *d, int new_state) d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -2746,21 +2758,26 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, - and the shortest possible version thereof. Return the offset of the first - character after the match, or (size_t) -1 if none is found. BEGIN points to - the beginning of the buffer, and SIZE is the size of the buffer. If SIZE - is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place + and the shortest possible version thereof. Return a pointer to the first + character after the match, or NULL if none is found. Begin points to + the beginning of the buffer, and end points to the first character after + its end. We store a newline in *end to act as a sentinel, so end had + better point somewhere valid. Newline is a flag indicating whether to + allow newlines to be in the matching string. If count is non- + NULL it points to a place we're supposed to increment every time we + see a newline. Finally, if backref is non-NULL it points to a place where we're supposed to store a 1 if backreferencing happened and the match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ -size_t -dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) +char * +dfaexec (struct dfa *d, char const *begin, char *end, + int newline, int *count, int *backref) { - int s; /* Current state. */ + int s, s1, tmp; /* Current state. */ unsigned char const *p; /* Current input character. */ - unsigned char const *end; /* One past the last input character. */ - int **trans, *t; /* Local copy of d->trans. */ - unsigned char eol = eolbyte; /* Hoping it goes into a register. */ + int **trans, *t; /* Copy of d->trans so it can be optimized + into a register. */ + unsigned char eol = eolbyte; /* Likewise for eolbyte. */ static int sbit[NOTCHAR]; /* Table for anding with d->success. */ static int sbit_init; @@ -2777,31 +2794,31 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) if (! d->tralloc) build_state_zero(d); - s = 0; + s = s1 = 0; p = (unsigned char const *) begin; - end = p + size; trans = d->trans; + *end = eol; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { int remain_bytes, i; buf_begin = (unsigned char const *) begin; - buf_end = end; + buf_end = (unsigned char const *) end; /* initialize mblen_buf, and inputwcs. */ - MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); - MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); + MALLOC(mblen_buf, unsigned char, end - begin + 2); + MALLOC(inputwcs, wchar_t, end - begin + 2); memset(&mbs, 0, sizeof(mbstate_t)); remain_bytes = 0; - for (i = 0; i < end - (unsigned char const *)begin; i++) + for (i = 0; i < end - begin + 1; i++) { if (remain_bytes == 0) { remain_bytes - = mbrtowc(inputwcs + i, begin + i, - end - (unsigned char const *)begin - i, &mbs); - if (remain_bytes <= 1) + = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); + if (remain_bytes < 1 + || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) { remain_bytes = 0; inputwcs[i] = (wchar_t)begin[i]; @@ -2831,6 +2848,9 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) if (MB_CUR_MAX > 1) while ((t = trans[s])) { + if ((char *) p > end) + break; + s1 = s; SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); if (d->states[s].mbps.nelem == 0) @@ -2846,25 +2866,16 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) } else #endif /* MBS_SUPPORT */ - while ((t = trans[s])) - s = t[*p++]; - - if (s < 0) - { - if (p == end) - { -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - free(mblen_buf); - free(inputwcs); - } -#endif /* MBS_SUPPORT */ - return (size_t) -1; - } - s = 0; + while ((t = trans[s]) != 0) { /* hand-optimized loop */ + s1 = t[*p++]; + if ((t = trans[s1]) == 0) { + tmp = s ; s = s1 ; s1 = tmp ; /* swap */ + break; } - else if ((t = d->fails[s])) + s = t[*p++]; + } + + if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) { if (d->success[s] & sbit[*p]) { @@ -2877,19 +2888,13 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) free(inputwcs); } #endif /* MBS_SUPPORT */ - return (char const *) p - begin; + return (char *) p; } + s1 = s; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { - SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); - if (d->states[s].mbps.nelem == 0) - { - s = t[*p++]; - continue; - } - /* Can match with a multibyte character (and multicharacter collating element). Transition table might be updated. */ s = transit_state(d, s, &p); @@ -2897,13 +2902,41 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) } else #endif /* MBS_SUPPORT */ - s = t[*p++]; + s = d->fails[s][*p++]; + continue; } - else + + /* If the previous character was a newline, count it. */ + if (count && (char *) p <= end && p[-1] == eol) + ++*count; + + /* Check if we've run off the end of the buffer. */ + if ((char *) p > end) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return NULL; + } + + if (s >= 0) { build_state(s, d); trans = d->trans; + continue; } + + if (p[-1] == eol && newline) + { + s = d->newlines[s1]; + continue; + } + + s = 0; } } @@ -2934,6 +2967,13 @@ dfainit (struct dfa *d) d->tralloc = 0; d->musts = 0; + d->realtrans = 0; + d->fails = 0; + d->newlines = 0; + d->success = 0; +#ifdef GAWK + d->broken = 0; +#endif } /* Parse and analyze a single string of the given length. */ @@ -3016,8 +3056,13 @@ dfafree (struct dfa *d) } #endif /* MBS_SUPPORT */ - for (i = 0; i < d->sindex; ++i) + for (i = 0; i < d->sindex; ++i) { free((ptr_t) d->states[i].elems.elems); +#ifdef MBS_SUPPORT + if (d->states[i].mbps.nelem > 0) + free((ptr_t) d->states[i].mbps.elems); +#endif /* MBS_SUPPORT */ + } free((ptr_t) d->states); for (i = 0; i < d->tindex; ++i) if (d->follows[i].elems) @@ -3030,6 +3075,7 @@ dfafree (struct dfa *d) free((ptr_t) d->fails[i]); if (d->realtrans) free((ptr_t) d->realtrans); if (d->fails) free((ptr_t) d->fails); + if (d->newlines) free((ptr_t) d->newlines); if (d->success) free((ptr_t) d->success); for (dm = d->musts; dm; dm = ndm) { diff --git a/src/dfa.h b/src/dfa.h index ce8b951..df558b6 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -355,9 +355,19 @@ struct dfa on a state that potentially could do so. */ int *success; /* Table of acceptance conditions used in dfaexec and computed in build_state. */ + int *newlines; /* Transitions on newlines. The entry for a + newline in any transition table is always + -1 so we can count lines without wasting + too many cycles. The transition for a + newline is stored separately and handled + as a special case. */ struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ +#ifdef GAWK + int broken; /* True if using a feature where there + are bugs and gawk should use regex. */ +#endif }; /* Some macros for user access to dfa internals. */ @@ -389,13 +399,18 @@ extern void dfasyntax (reg_syntax_t, int, unsigned char); extern void dfacomp (char const *, size_t, struct dfa *, int); /* Execute the given struct dfa on the buffer of characters. The - last byte of the buffer must equal the end-of-line byte. - The final argument points to a flag that will + first char * points to the beginning, and the second points to the + first character after the end of the buffer, which must be a writable + place so a sentinel end-of-buffer marker can be stored there. The + second-to-last argument is a flag telling whether to allow newlines to + be part of a string matching the regexp. The next-to-last argument, + if non-NULL, points to a place to increment every time we see a + newline. The final argument, if non-NULL, points to a flag that will be set if further examination by a backtracking matcher is needed in order to verify backreferencing; otherwise the flag will be cleared. - Returns (size_t) -1 if no match is found, or the offset of the first + Returns NULL if no match is found, or a pointer to the first character after the first & shortest matching string in the buffer. */ -extern size_t dfaexec (struct dfa *, char const *, size_t, int *); +extern char *dfaexec (struct dfa *, char const *, char *, int , int *, int *); /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); -- 1.7.0.2.393.gfb6b >From 4f406ca9d86326220b699b6b93b81e47a8483f23 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Sun, 7 Mar 2010 10:38:29 +0100 Subject: [PATCH 2/3] dfa: make search.c use the new dfaexec API * src/search.c: Adjust to new dfaexec API. Now, dfaexec returns a pointer, not an integer, and the third parameter is END, not buffer size. Rewrite the function's comment. * src/dfa.h (dfaexec): Update comment; include parameter names. --- src/dfa.c | 43 ++++++++++++++++++++++++------------------- src/dfa.h | 27 ++++++++++++++------------- src/search.c | 12 +++++++----- 3 files changed, 45 insertions(+), 37 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 76c6a0e..d4663e3 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -2345,12 +2345,13 @@ build_state_zero (struct dfa *d) { \ while (inputwcs[p - buf_begin] == 0 \ && mblen_buf[p - buf_begin] > 0 \ - && p < buf_end) \ + && (unsigned char const *) p < buf_end) \ ++p; \ - if ((char const *) p >= end) \ + if ((char *) p >= end) \ { \ free(mblen_buf); \ free(inputwcs); \ + *end = saved_end; \ return NULL; \ } \ } @@ -2413,8 +2414,10 @@ transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, else if (works < 0) { if (p == buf_end) - /* At the moment, it must not happen. */ - return TRANSIT_STATE_END_BUFFER; + { + /* At the moment, it must not happen. */ + abort (); + } works = 0; } else if (d->fails[works]) @@ -2757,18 +2760,17 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) #endif /* MBS_SUPPORT */ /* Search through a buffer looking for a match to the given struct dfa. - Find the first occurrence of a string matching the regexp in the buffer, - and the shortest possible version thereof. Return a pointer to the first - character after the match, or NULL if none is found. Begin points to - the beginning of the buffer, and end points to the first character after - its end. We store a newline in *end to act as a sentinel, so end had - better point somewhere valid. Newline is a flag indicating whether to - allow newlines to be in the matching string. If count is non- - NULL it points to a place we're supposed to increment every time we - see a newline. Finally, if backref is non-NULL it points to a place - where we're supposed to store a 1 if backreferencing happened and the - match needs to be verified by a backtracking matcher. Otherwise - we store a 0 in *backref. */ + Find the first occurrence of a string matching the regexp in the + buffer, and the shortest possible version thereof. Return a pointer to + the first character after the match, or NULL if none is found. BEGIN + points to the beginning of the buffer, and END points to the first byte + after its end. Note however that we store a sentinel byte (usually + newline) in *END, so the actual buffer must be one byte longer. + When NEWLINE is nonzero, newlines may appear in the matching string. + If COUNT is non-NULL, increment *COUNT once for each newline processed. + Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we + encountered a back-reference (1) or not (0). The caller may use this + to decide whether to fall back on a backtracking matcher. */ char * dfaexec (struct dfa *d, char const *begin, char *end, int newline, int *count, int *backref) @@ -2797,14 +2799,15 @@ dfaexec (struct dfa *d, char const *begin, char *end, s = s1 = 0; p = (unsigned char const *) begin; trans = d->trans; + unsigned char saved_end = *(unsigned char *) end; *end = eol; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { int remain_bytes, i; - buf_begin = (unsigned char const *) begin; - buf_end = (unsigned char const *) end; + buf_begin = (unsigned char *) begin; + buf_end = (unsigned char *) end; /* initialize mblen_buf, and inputwcs. */ MALLOC(mblen_buf, unsigned char, end - begin + 2); @@ -2875,7 +2878,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, s = t[*p++]; } - if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) + if (s >= 0 && (char *) p <= end && d->fails[s]) { if (d->success[s] & sbit[*p]) { @@ -2888,6 +2891,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, free(inputwcs); } #endif /* MBS_SUPPORT */ + *end = saved_end; return (char *) p; } @@ -2920,6 +2924,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, free(inputwcs); } #endif /* MBS_SUPPORT */ + *end = saved_end; return NULL; } diff --git a/src/dfa.h b/src/dfa.h index df558b6..f13a257 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -398,19 +398,20 @@ extern void dfasyntax (reg_syntax_t, int, unsigned char); exact matcher. */ extern void dfacomp (char const *, size_t, struct dfa *, int); -/* Execute the given struct dfa on the buffer of characters. The - first char * points to the beginning, and the second points to the - first character after the end of the buffer, which must be a writable - place so a sentinel end-of-buffer marker can be stored there. The - second-to-last argument is a flag telling whether to allow newlines to - be part of a string matching the regexp. The next-to-last argument, - if non-NULL, points to a place to increment every time we see a - newline. The final argument, if non-NULL, points to a flag that will - be set if further examination by a backtracking matcher is needed in - order to verify backreferencing; otherwise the flag will be cleared. - Returns NULL if no match is found, or a pointer to the first - character after the first & shortest matching string in the buffer. */ -extern char *dfaexec (struct dfa *, char const *, char *, int , int *, int *); +/* Search through a buffer looking for a match to the given struct dfa. + Find the first occurrence of a string matching the regexp in the + buffer, and the shortest possible version thereof. Return a pointer to + the first character after the match, or NULL if none is found. BEGIN + points to the beginning of the buffer, and END points to the first byte + after its end. Note however that we store a sentinel byte (usually + newline) in *END, so the actual buffer must be one byte longer. + When NEWLINE is nonzero, newlines may appear in the matching string. + If COUNT is non-NULL, increment *COUNT once for each newline processed. + Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we + encountered a back-reference (1) or not (0). The caller may use this + to decide whether to fall back on a backtracking matcher. */ +extern char *dfaexec (struct dfa *d, char const *begin, char *end, + int newline, int *count, int *backref); /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); diff --git a/src/search.c b/src/search.c index bd44696..e565746 100644 --- a/src/search.c +++ b/src/search.c @@ -333,7 +333,8 @@ EXECUTE_FCT(EGexecute) { if (match_icase) { - char *case_buf = xmalloc(size); + /* Add one for the sentinel byte dfaexec may add. */ + char *case_buf = xmalloc(size + 1); memcpy(case_buf, buf, size); if (start_ptr) start_ptr = case_buf + (start_ptr - buf); @@ -375,17 +376,18 @@ EXECUTE_FCT(EGexecute) --beg; if (kwsm.index < kwset_exact_matches) goto success; - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL) continue; } else { /* No good fixed strings; start with DFA. */ - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); - if (offset == (size_t) -1) + char const *next_beg = dfaexec (&dfa, beg, (char *) buflim, + 0, NULL, &backref); + if (next_beg == NULL) break; /* Narrow down to the line we've found. */ - beg += offset; + beg = next_beg; if ((end = memchr(beg, eol, buflim - beg)) != NULL) end++; else -- 1.7.0.2.393.gfb6b >From 704fa4539a4e17a6930fc51e5b47fc0b4efbc859 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Sun, 7 Mar 2010 20:07:30 +0100 Subject: [PATCH 3/3] tests: add test cases for dfaexec bug * tests/dfaexec-multibyte: New test. * tests/Makefile.am (TESTS): Add it. Reported by Paolo Bonzini in http://bugzilla.redhat.com/544407 and http://bugzilla.redhat.com/544406 . --- tests/Makefile.am | 1 + tests/dfaexec-multibyte | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 0 deletions(-) create mode 100644 tests/dfaexec-multibyte diff --git a/tests/Makefile.am b/tests/Makefile.am index 1f54ec7..06460bc 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -19,6 +19,7 @@ TESTS = \ bre.sh \ case-fold-char-class \ case-fold-char-type \ + dfaexec-multibyte \ empty.sh \ ere.sh \ file.sh \ diff --git a/tests/dfaexec-multibyte b/tests/dfaexec-multibyte new file mode 100644 index 0000000..47db42b --- /dev/null +++ b/tests/dfaexec-multibyte @@ -0,0 +1,30 @@ +#!/bin/sh +# This would fail for grep-2.5.3 +: ${srcdir=.} +. "$srcdir/init.sh"; path_prepend_ ../src + +printf 'aa\n' > exp-aa || framework_failure +printf 'ab\n' > exp-ab || framework_failure +printf '1 2 3\n' > exp-123 || framework_failure + +fail=0 + +for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do + LC_ALL=$LOC grep -E '([a]|[b]){2}' < exp-aa > out || fail=1 + compare out exp-aa || fail=1 + + LC_ALL=$LOC grep -E '([b]|[a]){2}' < exp-aa > out || fail=1 + compare out exp-aa || fail=1 + + LC_ALL=$LOC grep -E '([b]|[a]){2}' < exp-ab > out || fail=1 + compare out exp-ab || fail=1 + + LC_ALL=$LOC grep -E '([a]|[b]){2}' < exp-ab > out || fail=1 + compare out exp-ab || fail=1 + + LC_ALL=$LOC grep -E '^([[:digit:]]+[[:space:]]+){2}' < exp-123 > out || fail=1 + compare out exp-123 || fail=1 + +done + +Exit $fail -- 1.7.0.2.393.gfb6b
