In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/a94485510cc7a65b67130a8a8c4d1065ba596a8d?hp=3b6c52ce7db772c296d8f10d92dec46af03391dc>
- Log ----------------------------------------------------------------- commit a94485510cc7a65b67130a8a8c4d1065ba596a8d Author: Karl Williamson <[email protected]> Date: Mon Jan 29 10:46:02 2018 -0700 Fix bug in new [[:ascii:]] nodes Commit aff4cafe362e55c7722ba12952e287a7d1770cb9 added new regnodes for [[:ascii:]] and its complement for a significant performance improvement. In looking at the code later, I realized that there was a bug in find_byclass() in that it didn't continue to try after an initial trial match succeeds, but getting the whole pattern to match fails. It's supposed to try again with the next ascii. This commit fixes that, and adds tests. I thought that these new changes might lower the performance improvement of the original, but they don't. Here's a typical one where we have a string of a million non-ascii 2-byte characters, followed by a single ASCII one. posixa ascii Ratio % ------- -------- -------- Ir Inf Inf 665.9 Dr Inf 250907.0 1993.1 Dw Inf 597.0 167603.7 COND Inf 500532.0 399.7 IND 22.0 22.0 100.0 (posixa is the old way of doing things; Inf just means the number was too large for the program to want to display it; the ratio is still valid). commit 1971914578c14bb2ffc687f6556c41148781acfc Author: Karl Williamson <[email protected]> Date: Mon Jan 29 09:51:45 2018 -0700 regexec.c: Extract some macro code into a submacro A future commit will reuse this code, so will avoid duplication. commit 21d1ed54f05b0cc903eaa25cbc4575df1a5a89ed Author: Karl Williamson <[email protected]> Date: Sun Jan 28 21:02:49 2018 -0700 regexec.c: Use different method for finding adjacent chars Commit 3b6c52ce7db772c296d8f10d92dec46af03391dc changed the variable name and commented what the code was doing. This changes that code to use a different mechanism that I think is simpler, and is extensible so that it can be used not just for instances in which the input is examined character-by-character. 
Until this commit, a boolean was used to indicate that we've found adjacent characters. This commit saves the address of the next character, so when we find the next match, if it begins at the saved address, we know it is adjacent. commit d990bd304fed1232b7a129ea8cb17fecdd3051c3 Author: Karl Williamson <[email protected]> Date: Sun Jan 28 20:33:10 2018 -0700 regexec.c: Extract some macro code into a sub-macro By doing this, it becomes common code with another place in the code, so the duplication can be removed. commit da10aa09bd687402764bf887b625b8081dd8cf0a Author: Karl Williamson <[email protected]> Date: Sun Jan 28 19:15:25 2018 -0700 regexec.c: Collapse some macros By adding a utf8ness parameter these 4 macros can be collapsed into 2, with no increase in run time, as the parameter is always a compile time constant and modern compilers will avoid the conditional. commit e4eb64812fa316ef6a2f62a20180e4f106fbd8b4 Author: Karl Williamson <[email protected]> Date: Mon Jan 29 16:05:41 2018 -0700 Fix bug in t/re/regex_sets_compat.t This tests the tests that regexp.t has and which have bracketed character classes. It converts those to the regex sets notation, and verifies they still work. It was adding an extra blank at the end of the pattern in some cases, causing it to fail. 
----------------------------------------------------------------------- Summary of changes: regexec.c | 185 +++++++++++++++++++++++++++++----------------------------- t/re/re_tests | 4 ++ t/re/regexp.t | 11 ++-- 3 files changed, 102 insertions(+), 98 deletions(-) diff --git a/regexec.c b/regexec.c index 6e8c83fd08..5a1c5ef61f 100644 --- a/regexec.c +++ b/regexec.c @@ -1715,62 +1715,65 @@ STMT_START { \ } \ } STMT_END -#define REXEC_FBC_UTF8_SCAN(CODE) \ -STMT_START { \ - while (s < strend) { \ - CODE \ - s += UTF8SKIP(s); \ - } \ -} STMT_END - -#define REXEC_FBC_SCAN(CODE) \ -STMT_START { \ - while (s < strend) { \ - CODE \ - s++; \ - } \ -} STMT_END +#define REXEC_FBC_SCAN(UTF8, CODE) \ + STMT_START { \ + while (s < strend) { \ + CODE \ + s += ((UTF8) ? UTF8SKIP(s) : 1); \ + } \ + } STMT_END -/* In the next few macros, 'try_it' is a bool indicating whether to actually - * try the match or not. It is used for when the flags indicate that only the - * first occurrence of 'x' in a string of them should be considered for - * matching. try_it is initialized to 1, and set to 1 on every failure of the - * condition, thus it will be 1 whenever a 'x' happens to be first. 
But when - * the condition is met, and we don't exit the loop because we have ultimate - * success, try_it is set to 'doevery', the latter being FALSE if we only want - * the first in a string; otherwise TRUE, so try_it will be 0 when the previous - * thing was 'x' and we only want the first 'x' */ - -#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ -REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \ - if (COND) { \ - if (try_it && (reginfo->intuit || regtry(reginfo, &s)))\ - goto got_it; \ - else \ - try_it = doevery; \ - } \ - else \ - try_it = 1; \ -) +#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \ + STMT_START { \ + while (s < strend) { \ + REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ + } \ + } STMT_END -#define REXEC_FBC_CLASS_SCAN(COND) \ -REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ +#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ if (COND) { \ - if (try_it && (reginfo->intuit || regtry(reginfo, &s)))\ - goto got_it; \ - else \ - try_it = doevery; \ + FBC_CHECK_AND_TRY \ + s += ((UTF8) ? UTF8SKIP(s) : 1); \ + previous_occurrence_end = s; \ } \ - else \ - try_it = 1; \ -) + else { \ + s += ((UTF8) ? UTF8SKIP(s) : 1); \ + } #define REXEC_FBC_CSCAN(CONDUTF8,COND) \ if (utf8_target) { \ - REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \ + REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \ } \ else { \ - REXEC_FBC_CLASS_SCAN(COND); \ + REXEC_FBC_CLASS_SCAN(0, COND); \ + } + +/* We keep track of where the next character should start after an occurrence + * of the one we're looking for. Knowing that, we can see right away if the + * next occurrence is adjacent to the previous. 
When 'doevery' is FALSE, we + * don't accept the 2nd and succeeding adjacent occurrences */ +#define FBC_CHECK_AND_TRY \ + if ( ( doevery \ + || s != previous_occurrence_end) \ + && (reginfo->intuit || regtry(reginfo, &s))) \ + { \ + goto got_it; \ + } + + +/* This differs from the above macros in that it calls a function which returns + * the next occurrence of the thing being looked for in 's'; and 'strend' if + * there is no such occurrence. */ +#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \ + while (s < strend) { \ + s = f; \ + if (s >= strend) { \ + break; \ + } \ + \ + FBC_CHECK_AND_TRY \ + s += (UTF8) ? UTF8SKIP(s) : 1; \ + previous_occurrence_end = s; \ } /* The three macros below are slightly different versions of the same logic. @@ -1801,7 +1804,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ * here. And vice-versa if we are looking for a non-boundary. * * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and - * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of + * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that @@ -1812,7 +1815,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ - REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ if (tmp == ! 
TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ @@ -1836,7 +1839,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ } \ tmp = TEST_UV(tmp); \ LOAD_UTF8_CHARCLASS_ALNUM(); \ - REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ tmp = !tmp; \ IF_SUCCESS; \ @@ -1856,7 +1859,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ - REXEC_FBC_SCAN( /* advances s while s < strend */ \ + REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \ if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ IF_SUCCESS; \ tmp = !tmp; \ @@ -2001,7 +2004,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, regmatch_info *reginfo) { dVAR; + + /* TRUE if x+ need not match at just the 1st pos of run of x's */ const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; + char *pat_string; /* The pattern's exactish string */ char *pat_end; /* ptr to end char of pat_string */ re_fold_t folder; /* Function for computing non-utf8 folds */ @@ -2011,8 +2017,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, U8 c1; U8 c2; char *e; - bool try_it = 1; /* Use in some macros to control whether to accept this - occurrence of what's being matched, or not */ + + /* In some cases we accept only the first occurence of 'x' in a sequence of + * them. This variable points to just beyond the end of the previous + * occurrence of 'x', hence we can tell if we are in a sequence. (Having + * it point to beyond the 'x' allows us to work for UTF-8 without having to + * hop back.) 
*/ + char * previous_occurrence_end = 0; + I32 tmp; /* Scratch variable */ const bool utf8_target = reginfo->is_utf8_target; UV utf8_fold_flags = 0; @@ -2039,14 +2051,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFD: case ANYOF: if (utf8_target) { - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else if (ANYOF_FLAGS(c)) { - REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0)); + REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0)); } else { - REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); + REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s))); } break; @@ -2516,17 +2528,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case ASCII: - s = find_next_ascii(s, strend, utf8_target); - if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) { - goto got_it; - } - + REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target)); break; case NASCII: - s = find_next_non_ascii(s, strend, utf8_target); - if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) { - goto got_it; + if (utf8_target) { + REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend, + utf8_target)); + } + else { + REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend, + utf8_target)); } break; @@ -2558,8 +2570,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (utf8_target) { /* The complement of something that matches only ASCII matches all * non-ASCII, plus everything in ASCII that isn't in the class. */ - REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend) - || ! _generic_isCC_A(*s, FLAGS(c))); + REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend) + || ! 
_generic_isCC_A(*s, FLAGS(c))); break; } @@ -2572,12 +2584,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * as otherwise we would have to examine all the continuation * characters */ if (utf8_target) { - REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c))); + REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c))); break; } posixa: - REXEC_FBC_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); break; @@ -2587,7 +2599,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case POSIXU: if (! utf8_target) { - REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ + to_complement ^ cBOOL(_generic_isCC(*s, FLAGS(c)))); } else { @@ -2600,55 +2613,45 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We avoid loading in the swash as long as possible, but * should we have to, we jump to a separate loop. This * extra 'if' statement is what keeps this code from being - * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + * just a call to REXEC_FBC_CLASS_SCAN() */ if (UTF8_IS_ABOVE_LATIN1(*s)) { goto found_above_latin1; } - if ((UTF8_IS_INVARIANT(*s) + + REXEC_FBC_CLASS_SCAN_GUTS(1, (UTF8_IS_INVARIANT(*s) && to_complement ^ cBOOL(_generic_isCC((U8) *s, classnum))) || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend) && to_complement ^ cBOOL( _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)), - classnum)))) - { - if (try_it && (reginfo->intuit || regtry(reginfo, &s))) - goto got_it; - else { - try_it = doevery; - } - } - else { - try_it = 1; - } - s += UTF8SKIP(s); + classnum)))); } } else switch (classnum) { /* These classes are implemented as macros */ case _CC_ENUM_SPACE: - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); break; case _CC_ENUM_BLANK: - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, to_complement ^ 
cBOOL(isBLANK_utf8_safe(s, strend))); break; case _CC_ENUM_XDIGIT: - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); break; case _CC_ENUM_VERTSPACE: - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); break; case _CC_ENUM_CNTRL: - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); break; @@ -2673,7 +2676,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* This is a copy of the loop above for swash classes, though using the * FBC macro instead of being expanded out. Since we've loaded the * swash, we don't have to check for that each time through the loop */ - REXEC_FBC_UTF8_CLASS_SCAN( + REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ to_complement ^ cBOOL(_generic_utf8_safe( classnum, s, @@ -3400,7 +3403,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, to_utf8_substr(prog); } ch = SvPVX_const(prog->anchored_utf8)[0]; - REXEC_FBC_SCAN( + REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; @@ -3418,7 +3421,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } } ch = SvPVX_const(prog->anchored_substr)[0]; - REXEC_FBC_SCAN( + REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; diff --git a/t/re/re_tests b/t/re/re_tests index 62ea30796a..61b8c875e2 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1986,6 +1986,10 @@ AB\s+\x{100} AB \x{100}X y - - /(?-x:[a b])/xx \N{SPACE} yS $& # Note a space char here ^a?bcd\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff ABCDEFGHIJKLMNOPQRSTUVWXYZ n - - # [perl #132187] for valgrind's benefit ^Xaaa?Xaa aaa\x{400000} n - - # [perl #132552] for valgrind's benefit +([[:ascii:]]+)\x81 a\x80b\x81 y $& b\x81 
+[[:^ascii:]]+b \x80a\x81b y $& \x81b +[[:^ascii:]]+b \x80a\x81\x{100}b y $& \x81\x{100}b + # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab diff --git a/t/re/regexp.t b/t/re/regexp.t index 287d1c71a2..cced1e0560 100644 --- a/t/re/regexp.t +++ b/t/re/regexp.t @@ -244,7 +244,8 @@ foreach (@tests) { } } $j--; - $modified .= substr($pat, $i + 1, $j - $i) . " "; + $modified .= substr($pat, $i + 1, $j - $i); + $modified .= " " if $in_brackets; $i = $j; } elsif (ord($curchar) >= ord('0') @@ -287,12 +288,8 @@ foreach (@tests) { # A regular character. if ($curchar ne '[') { - if (! $in_brackets) { - $modified .= $curchar; - } - else { - $modified .= " $curchar "; - } + $modified .= " " if $in_brackets; + $modified .= $curchar; next; } -- Perl5 Master Repository
