In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/b57e41186b2ceb48bef4f0588dcd19e105cc8a38?hp=bfba585ac769bf0862ff2ce15499506aaa370ece>
- Log ----------------------------------------------------------------- commit b57e41186b2ceb48bef4f0588dcd19e105cc8a38 Author: Karl Williamson <pub...@khwilliamson.com> Date: Tue Jan 18 15:03:41 2011 -0700 regcomp: Disallow multi-char folds in lookbehind The addition of the ANYOFV regnode to treat multi-char folds in a bracketed character class has exposed a bug, in which those classes have long been able to be varying length (due to the multi-char fold), but the compiler wasn't aware of it. Now it is, and hence won't allow those which have multi-char folds to be part of a lookbehind pattern, which requires a constant length. This patch disallows multi-char folds in a lookbehind bracketed character class. ----------------------------------------------------------------------- Summary of changes: regcomp.c | 24 +++++++++++++++++------- t/re/pat_advanced.t | 15 ++++++++++++++- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/regcomp.c b/regcomp.c index 0c34695..729ebfb 100644 --- a/regcomp.c +++ b/regcomp.c @@ -138,6 +138,7 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + I32 in_lookbehind; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -184,6 +185,7 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') @@ -4459,6 +4461,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_sawback = 0; RExC_seen = 0; + RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_seen_evals = 0; RExC_extralen = 0; @@ -5938,6 +5941,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto capturing_parens; } RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ RExC_seen_zerolen++; @@ -6585,6 +6589,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } if (after_freeze) RExC_npar = after_freeze; return(ret); @@ -8590,9 +8598,6 @@ parseit: /* The \p could match something in the Latin1 range, hence * something that isn't utf8 */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP; - if (FOLD) { /* And one of these could have a multi-char fold */ - OP(ret) = ANYOFV; - } namedclass = ANYOF_MAX; /* no official name, but it's named */ } break; @@ -8921,8 +8926,10 @@ parseit: /* Currently, we don't look at every value in the range. * Therefore we have to assume the worst case: that if - * folding, it will match more than one character */ - if (FOLD) { + * folding, it will match more than one character. But in + * lookbehind patterns, can only be single character + * length, so disallow those folds */ + if (FOLD && ! RExC_in_lookbehind) { OP(ret) = ANYOFV; } } @@ -8956,8 +8963,9 @@ parseit: #endif Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f); - else { + else if (! RExC_in_lookbehind) { /* Any multicharacter foldings + * (disallowed in lookbehind patterns) * require the following transform: * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) * where E folds into "pq" and F folds @@ -9032,8 +9040,10 @@ parseit: /* This is the one character in the bitmap that needs special handling * under non-locale folding, as it folds to two characters 'ss'. This * happens if it is set and not inverting, or isn't set and are - * inverting */ + * inverting (disallowed in lookbehind patterns because they can't be + * variable length) */ if (! LOC + && ! RExC_in_lookbehind && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S)) ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT))) { diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 0254813..de48ad4 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -21,7 +21,7 @@ BEGIN { } -plan tests => 1303; # Update this when adding/deleting tests. +plan tests => 1304; # Update this when adding/deleting tests. run_tests() unless caller; @@ -2072,6 +2072,16 @@ sub run_tests { } } + { # Bleadperl v5.13.8-292-gf56b639 breaks NEZUMI/Unicode-LineBreak-1.011 + # \xdf in lookbehind failed to compile as is multi-char fold + eval_ok 'qr{ + (?u: (?<=^url:) | + (?<=[/]) (?=[^/]) | + (?<=[^-.]) (?=[-~.,_?\#%=&]) | + (?<=[=&]) (?=.) + )}iox', "Lookbehind with \\xdf matchable compiles"; + } + # # Keep the following tests last -- they may crash perl # @@ -2106,6 +2116,9 @@ sub run_tests { eval $a =~ /[a-z]/; ok(1); # If it didn't crash, it worked. } + + # !!! NOTE that tests that aren't at all likely to crash perl should go + # a ways above, above these last ones. } # End of sub run_tests 1; -- Perl5 Master Repository