In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8736d1c336efff364daa33d27d0381c4de4771d0?hp=e1d962edb53a9790649c1b76712fce5b4b85444c>
- Log ----------------------------------------------------------------- commit 8736d1c336efff364daa33d27d0381c4de4771d0 Author: Karl Williamson <[email protected]> Date: Sat Sep 12 11:39:37 2015 -0600 regcomp.c: Simplify some code Commit 2d3d6e6e7c2d50b1cc47032cf089151823fb20a6 introduced the 'optimizable' variable which if FALSE prevents the [...] node from being optimized, if otherwise possible, into something simpler. It turns out that several of the conditions which prevent such optimization can just clear this flag when they are found, rather than having to test for the conditions again later when the optimization is actually done. M embed.fnc M proto.h M regcomp.c commit a229ea8f086a14af7afbad5fa40d1f71e6e48a34 Author: Karl Williamson <[email protected]> Date: Sat Sep 12 11:34:57 2015 -0600 regcomp.c: Comment changes only M regcomp.c commit ab87267cc6ac3b1ea950ee4e45bd80dea25b8f79 Author: Karl Williamson <[email protected]> Date: Mon Aug 24 21:09:02 2015 -0600 PATCH: [perl #125892] qr/(?[ ]) regression with '!' This regression was introduced in 5.22. It stems from a logic error I made in a complicated 'if' statement. M regcomp.c M t/re/regex_sets.t commit 47f5936b63e4d8d8534d6dd332de6fe8a8510626 Author: Karl Williamson <[email protected]> Date: Sat Sep 12 10:10:59 2015 -0600 regcomp.c: Add synonym for macro complement OPERAND and OPERATOR are here complements of each other. It's better to refer to the thing you are manipulating instead of {! the thing you aren't}. M regcomp.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 +- proto.h | 2 +- regcomp.c | 76 +++++++++++++++++++++++++++++-------------------------- t/re/regex_sets.t | 2 ++ 4 files changed, 44 insertions(+), 38 deletions(-) diff --git a/embed.fnc b/embed.fnc index f1abcd0..eccc76c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2131,7 +2131,7 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ |bool allow_multi_fold \ |const bool silence_non_portable \ |const bool strict \ - |const bool optimizable \ + |bool optimizable \ |NULLOK SV** ret_invlist Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ |NN SV** invlist diff --git a/proto.h b/proto.h index 4d3465f..6d32816 100644 --- a/proto.h +++ b/proto.h @@ -4774,7 +4774,7 @@ STATIC regnode* S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth STATIC regnode* S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth); #define PERL_ARGS_ASSERT_REGBRANCH \ assert(pRExC_state); assert(flagp) -STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, const bool optimiza ... [23 chars truncated] +STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, bool optimizable, S ... [17 chars truncated] #define PERL_ARGS_ASSERT_REGCLASS \ assert(pRExC_state); assert(flagp) STATIC unsigned int S_regex_set_precedence(const U8 my_operator) diff --git a/regcomp.c b/regcomp.c index 47b451c..fe9b326 100644 --- a/regcomp.c +++ b/regcomp.c @@ -13537,7 +13537,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * a stack. Each entry on the stack is a single character representing one * of the operators; or else a pointer to an operand inversion list. */ -#define IS_OPERAND(a) (! SvIOK(a)) +#define IS_OPERATOR(a) SvIOK(a) +#define IS_OPERAND(a) (! IS_OPERATOR(a)) /* The stack is kept in Åukasiewicz order. (That's pronounced similar * to luke-a-shave-itch (or -itz), but people who didn't want to bother @@ -13709,13 +13710,14 @@ redo_curchar: /* If the top entry on the stack is an operator, it had * better be a '!', otherwise the entry below the top * operand should be an operator */ - if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) - || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!') - || top_index - fence < 1 - || ! (stacked_ptr = av_fetch(stack, - top_index - 1, - FALSE)) - || IS_OPERAND(*stacked_ptr)) + if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) + || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') + || ( IS_OPERAND(*top_ptr) + && ( top_index - fence < 1 + || ! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || ! IS_OPERATOR(*stacked_ptr)))) { RExC_parse++; vFAIL("Unexpected '(' with no preceding operator"); @@ -13964,7 +13966,7 @@ redo_curchar: * be an operator */ top_ptr = av_fetch(stack, top_index, FALSE); assert(top_ptr); - if (! IS_OPERAND(*top_ptr)) { + if (IS_OPERATOR(*top_ptr)) { /* The only permissible operator at the top of the stack is * '!', which is applied immediately to this operand. */ @@ -14109,6 +14111,7 @@ redo_curchar: Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ return node; } +#undef IS_OPERATOR #undef IS_OPERAND STATIC void @@ -14251,7 +14254,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, about too large characters */ const bool strict, - const bool optimizable, /* ? Allow a non-ANYOF return + bool optimizable, /* ? Allow a non-ANYOF return node */ SV** ret_invlist /* Return an inversion list, not a node */ ) @@ -14700,6 +14703,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, (value == 'p' ? '+' : '!'), UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; + optimizable = FALSE; /* Will have to leave this an + ANYOF node */ /* We don't know yet, so have to assume that the * property could match something in the Latin1 range, @@ -14934,6 +14939,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; ANYOF_POSIXL_ZERO(ret); + + /* We can't change this into some other type of node + * (unless this is the only element, in which case there + * are nodes that mean exactly this) as has runtime + * dependencies */ + optimizable = FALSE; } /* Coverity thinks it is possible for this to be negative; both @@ -15658,7 +15669,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Our calculated list will be for Unicode rules. For locale * matching, we have to keep a separate list that is consulted at * runtime only when the locale indicates Unicode rules. For - * non-locale, we just use to the general list */ + * non-locale, we just use the general list */ if (LOC) { use_list = &only_utf8_locale_list; } @@ -15896,7 +15907,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (warn_super) { ANYOF_FLAGS(ret) - |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + + /* Because an ANYOF node is the only one that warns, this node + * can't be optimized into something else */ + optimizable = FALSE; } } @@ -15918,8 +15933,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (only_utf8_locale_list) { ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } - else if (cp_list) { /* Look to see if there a 0-255 code point is in - the list */ + else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { @@ -15978,24 +15992,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. I (khw) am not sure how much to look for here. It would - * be easy, but perhaps too slow, to check any candidates against all the - * node types they could possibly match using _invlistEQ(). */ - - if ( optimizable - && cp_list - && ! invert - && ! depends_list - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - - /* We don't optimize if we are supposed to make sure all non-Unicode - * code points raise a warning, as only ANYOF nodes have this check. - * */ - && ! ((ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) - && OP(ret) != ANYOFD - && ALWAYS_WARN_SUPER)) - { + * earlier code. I (khw) did some benchmarks and found essentially no + * speed difference between using a POSIXA node versus an ANYOF node, so + * there is no reason to optimize, for example [A-Za-z0-9_] into + * [[:word:]]/a (although if we did it in the sizing pass it would save + * space). _invlistEQ() could be used if one ever wanted to do something + * like this at this point in the code */ + + if (optimizable && cp_list && ! invert && ! depends_list) { UV start, end; U8 op = END; /* The optimzation node-type */ const char * cur_parse= RExC_parse; @@ -16004,9 +16008,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (! invlist_iternext(cp_list, &start, &end)) { /* Here, the list is empty. This happens, for example, when a - * Unicode property is the only thing in the character class, and - * it doesn't match anything. (perluniprops.pod notes such - * properties) */ + * Unicode property that doesn't match anything is the only element + * in the character class (perluniprops.pod notes such properties). + * */ op = OPFAIL; *flagp |= HASWIDTH|SIMPLE; } @@ -16062,7 +16066,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } } - } + } /* End of first range contains just a single code point */ else if (start == 0) { if (end == UV_MAX) { op = SANY; diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index a10bcea..ee161b2 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -97,6 +97,8 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without / eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/'; is($@, "", 'qr/(?[ [a] ])/ can be interpolated'); +like("B", qr/(?[ [B] | ! ( [^B] ) ])/, "[perl #125892]"); + if (! is_miniperl() && locales_enabled('LC_CTYPE')) { my $utf8_locale = find_utf8_ctype_locale; SKIP: { -- Perl5 Master Repository
