In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/c45df5a16bb5a26a06275cc63f2c3e6b1d708184?hp=3b9b32c914622129d88bd352d8269d046fa5efe5>
- Log ----------------------------------------------------------------- commit c45df5a16bb5a26a06275cc63f2c3e6b1d708184 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 23:19:16 2011 -0700 regcomp.c: Rmv unused parameter This silences a compiler warning M embed.fnc M proto.h M regcomp.c commit b8953805dfeee53cd2300f61834ba32ccaaefaa8 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 23:14:45 2011 -0700 re/pat.t: Remove TODO message on passing tests A previous commit fixed these. M t/re/pat.t commit aa19b56b2f07e9eabf57540f00d312d8093e9d28 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 23:13:59 2011 -0700 regcomp.c: Rmv unused parameter This silences a compiler warning M embed.fnc M proto.h M regcomp.c commit 2f88b8574d3a767b1b37edf9927413bbc8ffd0a4 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 23:08:16 2011 -0700 regcomp.c: Rmv unused parameter This silences a compiler warning M embed.fnc M proto.h M regcomp.c commit 1411dba431b74256819ba8c07e7a61e2aa0b1742 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 22:56:02 2011 -0700 PATCH: [perl #85528], add initialization Commit 137165a601b852a9679983cdfe8d35be29f0939c omitted required initialization for the synthetic start class. Adding it exposed other bugs in cl_and() and cl_or(), which have been fixed by a previous commit. M regcomp.c M t/re/re_tests commit c6b765375213e9d6dce25829a367fe4ef37da1a4 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 22:35:36 2011 -0700 regcomp.c: revamp cl_and() and cl_or() These two routines have not kept pace with the changes in the ANYOF flags. And, I believe there were issues even before them. I did a systematic re-thinking of what their behaviors should be. M regcomp.c commit ace6b0e469777649cb9a908e00e8780b3af366d0 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 21:57:24 2011 -0700 regcomp.h: #define of ANYOF flags immune from inversion M regcomp.h commit c613755a4b4fc8e64a77639d47d7e208fee68edc Author: Karl Williamson <[email protected]> Date: Tue Mar 8 17:06:47 2011 -0700 regex: /l in combo with others in syn start class Now that regexes can be combinations of different charset modifiers, a synthetic start class can match locale and non-locale both. locale should generally match only things in the bitmap for code points < 256. But a synthetic start class with a non-locale component can match such code points. This patch makes an exception for synthetic nodes that will be resolved if it passes and is matched again for real. M regcomp.c M regcomp.h M regexec.c commit f0c16e54b3b5efbb4380952c7ba5e8d7626d7cae Author: Karl Williamson <[email protected]> Date: Tue Mar 8 16:30:00 2011 -0700 regcomp.c: UTF /l should not use tries It's unclear if tries will work under /l. I haven't seen any failures, but there have been under /d. As a precaution, until more testing is done, disable tries under anything but /u and UTF. M regcomp.c commit 1051e1c4d07fec1c36934f253d2baa8842339cbf Author: Karl Williamson <[email protected]> Date: Tue Mar 8 16:20:52 2011 -0700 regcomp.c: Merge identical functions These two functions now have identical code, so merge them, but use a macro in case they ever need to diverge again. M embed.fnc M embed.h M proto.h M regcomp.c commit cf34198ebe3dd876d67c10caa9acf491ad2a0c51 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 15:28:05 2011 -0700 regcomp.c: Change start class init for /l Before /l was added, locale only applied to regular expressions as a whole. Now it can be subsections, so the flag for allowing it should be treated as any other flag. M regcomp.c commit 58b5ba03346c70dc37751766fe464485278999a8 Author: Karl Williamson <[email protected]> Date: Tue Mar 8 15:25:27 2011 -0700 regcomp.c: clarify comments M regcomp.c commit c8d3cd88811d23a268c37b61d1c0641a6d42d995 Author: Karl Williamson <[email protected]> Date: Sun Mar 6 09:00:52 2011 -0700 regcomp.c: Move #defines to be be in bit order M regcomp.h ----------------------------------------------------------------------- Summary of changes: embed.fnc | 11 +-- embed.h | 1 - proto.h | 27 ++----- regcomp.c | 226 ++++++++++++++++++++++++++++++++++----------------------- regcomp.h | 38 ++++++---- regexec.c | 11 ++- t/re/pat.t | 1 - t/re/re_tests | 3 + 8 files changed, 182 insertions(+), 136 deletions(-) diff --git a/embed.fnc b/embed.fnc index 7dcb82e..d5273ea 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1803,17 +1803,12 @@ Es |bool |reg_skipcomment|NN struct RExC_state_t *pRExC_state Es |void |scan_commit |NN const struct RExC_state_t *pRExC_state \ |NN struct scan_data_t *data|NN I32 *minlenp \ |int is_inf -Esn |void |cl_anything |NN const struct RExC_state_t *pRExC_state \ - |NN struct regnode_charclass_class *cl +Esn |void |cl_anything |NN struct regnode_charclass_class *cl EsRn |int |cl_is_anything |NN const struct regnode_charclass_class *cl -Esn |void |cl_init |NN const struct RExC_state_t *pRExC_state \ - |NN struct regnode_charclass_class *cl -Esn |void |cl_init_zero |NN const struct RExC_state_t *pRExC_state \ - |NN struct regnode_charclass_class *cl +Esn |void |cl_init |NN struct regnode_charclass_class *cl Esn |void |cl_and |NN struct regnode_charclass_class *cl \ |NN const struct regnode_charclass_class *and_with -Esn |void |cl_or |NN const struct RExC_state_t *pRExC_state \ - |NN struct regnode_charclass_class *cl \ +Esn |void |cl_or |NN struct regnode_charclass_class *cl \ |NN const struct regnode_charclass_class *or_with Es |I32 |study_chunk |NN struct RExC_state_t *pRExC_state \ |NN regnode **scanp|NN I32 *minlenp \ diff --git a/embed.h b/embed.h index 743eb46..1ae431d 100644 --- a/embed.h +++ b/embed.h @@ -874,7 +874,6 @@ #define cl_and S_cl_and #define cl_anything S_cl_anything #define cl_init S_cl_init -#define cl_init_zero S_cl_init_zero #define cl_is_anything S_cl_is_anything #define cl_or S_cl_or #define invlist_array(a) S_invlist_array(aTHX_ a) diff --git a/proto.h b/proto.h index 80f3bc0..5f8daeb 100644 --- a/proto.h +++ b/proto.h @@ -5988,23 +5988,15 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_ch #define PERL_ARGS_ASSERT_CL_AND \ assert(cl); assert(and_with) -STATIC void S_cl_anything(const struct RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) - __attribute__nonnull__(1) - __attribute__nonnull__(2); +STATIC void S_cl_anything(struct regnode_charclass_class *cl) + __attribute__nonnull__(1); #define PERL_ARGS_ASSERT_CL_ANYTHING \ - assert(pRExC_state); assert(cl) + assert(cl) -STATIC void S_cl_init(const struct RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) - __attribute__nonnull__(1) - __attribute__nonnull__(2); +STATIC void S_cl_init(struct regnode_charclass_class *cl) + __attribute__nonnull__(1); #define PERL_ARGS_ASSERT_CL_INIT \ - assert(pRExC_state); assert(cl) - -STATIC void S_cl_init_zero(const struct RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) - __attribute__nonnull__(1) - __attribute__nonnull__(2); -#define PERL_ARGS_ASSERT_CL_INIT_ZERO \ - assert(pRExC_state); assert(cl) + assert(cl) STATIC int S_cl_is_anything(const struct regnode_charclass_class *cl) __attribute__warn_unused_result__ @@ -6012,12 +6004,11 @@ STATIC int S_cl_is_anything(const struct regnode_charclass_class *cl) #define PERL_ARGS_ASSERT_CL_IS_ANYTHING \ assert(cl) -STATIC void S_cl_or(const struct RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +STATIC void S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) __attribute__nonnull__(1) - __attribute__nonnull__(2) - __attribute__nonnull__(3); + __attribute__nonnull__(2); #define PERL_ARGS_ASSERT_CL_OR \ - assert(pRExC_state); assert(cl); assert(or_with) + assert(cl); assert(or_with) PERL_STATIC_INLINE UV* S_invlist_array(pTHX_ HV* const invlist) __attribute__warn_unused_result__ diff --git a/regcomp.c b/regcomp.c index 9357a78..b7a6939 100644 --- a/regcomp.c +++ b/regcomp.c @@ -720,15 +720,13 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min /* Can match anything (initialization) */ STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_cl_anything(struct regnode_charclass_class *cl) { PERL_ARGS_ASSERT_CL_ANYTHING; ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; - if (LOC) - cl->flags |= ANYOF_LOCALE; + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE; } /* Can match anything (initialization) */ @@ -751,29 +749,21 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) /* Can match anything (initialization) */ STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_cl_init(struct regnode_charclass_class *cl) { PERL_ARGS_ASSERT_CL_INIT; Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; - cl_anything(pRExC_state, cl); + cl_anything(cl); + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); } -STATIC void -S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) -{ - PERL_ARGS_ASSERT_CL_INIT_ZERO; - - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - if (LOC) - cl->flags |= ANYOF_LOCALE; -} +/* These two functions currently do the exact same thing */ +#define cl_init_zero S_cl_init /* 'And' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* cl should not be inverted */ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) @@ -782,6 +772,7 @@ S_cl_and(struct regnode_charclass_class *cl, assert(and_with->type == ANYOF); + /* I (khw) am not sure all these restrictions are necessary XXX */ if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -796,46 +787,86 @@ S_cl_and(struct regnode_charclass_class *cl, for (i = 0; i < ANYOF_BITMAP_SIZE; i++) cl->bitmap[i] &= and_with->bitmap[i]; } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - if (!(and_with->flags & ANYOF_EOS)) - cl->flags &= ~ANYOF_EOS; - if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)) - cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD; - if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; + if (and_with->flags & ANYOF_INVERT) { - if (cl->flags & ANYOF_UNICODE_ALL - && ANYOF_NONBITMAP(and_with) - && !(and_with->flags & ANYOF_INVERT)) - { - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + /* Here, the and'ed node is inverted. Get the AND of the flags that + * aren't affected by the inversion. Those that are affected are + * handled individually below */ + U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; + cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); + cl->flags |= affected_flags; + + /* We currently don't know how to deal with things that aren't in the + * bitmap, but we know that the intersection is no greater than what + * is already in cl, so let there be false positives that get sorted + * out after the synthetic start class succeeds, and the node is + * matched for real. */ + + /* The inversion of these two flags indicate that the resulting + * intersection doesn't have them */ + if (and_with->flags & ANYOF_UNICODE_ALL) { cl->flags &= ~ANYOF_UNICODE_ALL; } - else { - - /* The intersection of all unicode with something that isn't all - * unicode is that something */ - ARG_SET(cl, ARG(and_with)); + if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { + cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; } } - if (!(and_with->flags & ANYOF_UNICODE_ALL) && - !(and_with->flags & ANYOF_INVERT)) - { - cl->flags &= ~ANYOF_UNICODE_ALL; + else { /* and'd node is not inverted */ if (! ANYOF_NONBITMAP(and_with)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + + /* Here 'and_with' doesn't match anything outside the bitmap + * (except possibly ANYOF_UNICODE_ALL), which means the + * intersection can't either, except for ANYOF_UNICODE_ALL, in + * which case we don't know what the intersection is, but it's no + * greater than what cl already has, so can just leave it alone, + * with possible false positives */ + if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + } + } + else if (! ANYOF_NONBITMAP(cl)) { + + /* Here, 'and_with' does match something outside the bitmap, and cl + * doesn't have a list of things to match outside the bitmap. If + * cl can match all code points above 255, the intersection will + * be those above-255 code points that 'and_with' matches. There + * may be false positives from code points in 'and_with' that are + * outside the bitmap but below 256, but those get sorted out + * after the synthetic start class succeeds). If cl can't match + * all Unicode code points, it means here that it can't match * + * anything outside the bitmap, so we leave the bitmap empty */ + if (cl->flags & ANYOF_UNICODE_ALL) { + ARG_SET(cl, ARG(and_with)); + } } + else { + /* Here, both 'and_with' and cl match something outside the + * bitmap. Currently we do not do the intersection, so just match + * whatever cl had at the beginning. */ + } + + + /* Take the intersection of the two sets of flags */ + cl->flags &= and_with->flags; } } /* 'OR' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* cl should not be inverted */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { PERL_ARGS_ASSERT_CL_OR; if (or_with->flags & ANYOF_INVERT) { + + /* Here, the or'd node is to be inverted. This means we take the + * complement of everything not in the bitmap, but currently we don't + * know what that is, so give up and match anything */ + if (ANYOF_NONBITMAP(or_with)) { + cl_anything(cl); + } /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) * <= (B1 | !B2) | (CL1 | !CL2) @@ -845,7 +876,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { int i; @@ -854,9 +885,23 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con cl->bitmap[i] |= ~or_with->bitmap[i]; } /* XXXX: logic is complicated otherwise */ else { - cl_anything(pRExC_state, cl); + cl_anything(cl); } - } else { + + /* And, we can just take the union of the flags that aren't affected + * by the inversion */ + cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + + /* For the remaining flags: + ANYOF_UNICODE_ALL and inverted means to not match anything above + 255, which means that the union with cl should just be + what cl has in it, so can ignore this flag + ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord + is 127-255 to match them, but then invert that, so the + union with cl should just be what cl has in it, so can + ignore this flag + */ + } else { /* 'or_with' is not inverted */ /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) @@ -873,27 +918,27 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } } else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); + cl_anything(cl); } - } - if (or_with->flags & ANYOF_EOS) - cl->flags |= ANYOF_EOS; - if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; + /* Take the union */ + cl->flags |= or_with->flags; - /* If both nodes match something outside the bitmap, but what they match - * outside is not the same pointer, and hence not easily compared, give up - * and allow the start class to match everything outside the bitmap */ - if (ANYOF_NONBITMAP(cl) && ANYOF_NONBITMAP(or_with) && - ARG(cl) != ARG(or_with)) { - cl->flags |= ANYOF_UNICODE_ALL; - } + if (ANYOF_NONBITMAP(or_with)) { - if (or_with->flags & ANYOF_UNICODE_ALL) { - cl->flags |= ANYOF_UNICODE_ALL; + /* Use the added node's outside-the-bit-map match if there isn't a + * conflict. If there is a conflict (both nodes match something + * outside the bitmap, but what they match outside is not the same + * pointer, and hence not easily compared until XXX we extend + * inversion lists this far), give up and allow the start class to + * match everything outside the bitmap */ + if (! ANYOF_NONBITMAP(cl)) { + ARG_SET(cl, ARG(or_with)); + } + else if (ARG(cl) != ARG(or_with)) { + cl->flags |= ANYOF_UNICODE_ALL; + } + } } } @@ -2703,7 +2748,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + cl_init_zero(&accum); while (OP(scan) == code) { I32 deltanext, minnext, f = 0, fake; @@ -2724,7 +2769,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + cl_init(&this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -2757,7 +2802,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + cl_or(&accum, &this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; @@ -2770,7 +2815,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + cl_or(data->start_class, &accum); if (min1) { cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; @@ -2950,10 +2995,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, If/when this is fixed the following define can be swapped in below to fully enable trie logic. + XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps + not /aa + #define TRIE_TYPE_IS_SAFE 1 */ -#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT) if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, @@ -3042,7 +3090,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + cl_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3303,7 +3351,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + cl_init(&this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3331,7 +3379,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + cl_or(data->start_class, &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of @@ -3347,7 +3395,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + cl_or(data->start_class, &this_class); cl_and(data->start_class, and_withp); } else if (flags & SCF_DO_STCLASS_AND) @@ -3597,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + cl_anything(data->start_class); flags &= ~SCF_DO_STCLASS; break; } @@ -3660,7 +3708,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + cl_anything(data->start_class); break; case REG_ANY: if (OP(scan) == SANY) @@ -3668,7 +3716,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ value = (ANYOF_BITMAP_TEST(data->start_class,'\n') || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); + cl_anything(data->start_class); } if (flags & SCF_DO_STCLASS_AND || !value) ANYOF_BITMAP_CLEAR(data->start_class,'\n'); @@ -3678,7 +3726,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_and(data->start_class, (struct regnode_charclass_class*)scan); else - cl_or(pRExC_state, data->start_class, + cl_or(data->start_class, (struct regnode_charclass_class*)scan); break; case ALNUM: @@ -3907,7 +3955,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + cl_init(&intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -3941,7 +3989,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + cl_init(data->start_class); } else { /* AND before and after: combine and continue */ const int was = (data->start_class->flags & ANYOF_EOS); @@ -3992,7 +4040,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + cl_init(&intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4094,7 +4142,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + cl_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { @@ -4125,7 +4173,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + cl_init_zero(&accum); if (!trie->jump) { min1= trie->minlen; @@ -4148,7 +4196,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + cl_init(&this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -4192,7 +4240,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + cl_or(&accum, &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4204,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + cl_or(data->start_class, &accum); if (min1) { cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; @@ -4900,7 +4948,7 @@ reStudy: data.longest = &(data.longest_fixed); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + cl_init(&ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ @@ -5022,14 +5070,13 @@ reStudy: && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5084,7 +5131,7 @@ reStudy: DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + cl_init(&ch_class); data.start_class = &ch_class; data.last_closep = &last_close; @@ -5097,12 +5144,11 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); diff --git a/regcomp.h b/regcomp.h index 8e96b75..cc6708b 100644 --- a/regcomp.h +++ b/regcomp.h @@ -331,26 +331,24 @@ struct regnode_charclass_class { #define ANYOF_INVERT 0x04 -/* EOS, meaning that it can match an empty string too, is used for the - * synthetic start class (ssc) only. It looks like it could share the INVERT - * bit, as the ssc is never inverted. But doing that caused this reges to - * not match: - * 'foo/file.fob' =~ m,^(?=[^\.])[^/]* /(?=[^\.])[^/]*\.fo[^/]$,; - * (except the space between the * and the / above shouldn't be there; it was - * inserted to make this comment continue on.) - * Rather than try to figure out what was going on in the optimizer, I (khw) - * found a way to save a different bit. But my original line of reasoning was - * "The bit just needs to be turned off before regexec.c gets a hold of it so - * that regexec.c doesn't think it's inverted, but this happens automatically, - * as if the ssc can match an EOS, the ssc is discarded, and never passed to - * regexec.c" */ -#define ANYOF_EOS 0x10 - /* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ... * The non-locale ones are resolved at compile-time */ #define ANYOF_CLASS 0x08 #define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */ +/* EOS, meaning that it can match an empty string too, is used for the + * synthetic start class only. */ +#define ANYOF_EOS 0x10 + +/* ? Is this node the synthetic start class (ssc). This bit is shared with + * ANYOF_EOS, as the latter is used only for the ssc, and then not used by + * regexec.c. And, the code is structured so that if it is set, the ssc is + * not used, so it is guaranteed to be 0 for the ssc by the time regexec.c + * gets executed, and 0 for a non-ssc ANYOF node, as it only ever gets set for + * a potential ssc candidate. Thus setting it to 1 after it has been + * determined that the ssc will be used is not ambiguous */ +#define ANYOF_IS_SYNTHETIC ANYOF_EOS + /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_NONBITMAP_NON_UTF8 0x20 @@ -363,6 +361,16 @@ struct regnode_charclass_class { #define ANYOF_FLAGS_ALL 0xff +/* These are the flags that ANYOF_INVERT being set or not doesn't affect + * whether they are operative or not. e.g., the node still has LOCALE + * regardless of being inverted; whereas ANYOF_UNICODE_ALL means something + * different if inverted */ +#define INVERSION_UNAFFECTED_FLAGS (ANYOF_LOCALE \ + |ANYOF_LOC_NONBITMAP_FOLD \ + |ANYOF_CLASS \ + |ANYOF_EOS \ + |ANYOF_NONBITMAP_NON_UTF8) + /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ /* 2n should pair with 2n+1 */ diff --git a/regexec.c b/regexec.c index 739eba6..76784ee 100644 --- a/regexec.c +++ b/regexec.c @@ -6587,16 +6587,21 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. Locale nodes specifiy completely the * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except XXX in + * cause them to be treated as Unicode and not locale), except in * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes */ + * could be a combination of locale and non-locale nodes. So allow locale + * to match for the synthetic start class, which will give a false + * positive that will be resolved when the match is done again as not part + * of the synthetic start class */ if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { match = TRUE; /* Everything above 255 matches */ } else if ((flags & ANYOF_NONBITMAP_NON_UTF8 || (utf8_target && ANYOF_NONBITMAP(n) - && (c >=256 || ! (flags & ANYOF_LOCALE))))) + && (c >=256 + || (! (flags & ANYOF_LOCALE)) + || (flags & ANYOF_IS_SYNTHETIC))))) { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); diff --git a/t/re/pat.t b/t/re/pat.t index a14cb4f..66ce5ea 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -1030,7 +1030,6 @@ sub run_tests { my $message = '\p property after empty * match'; { - local $::TODO = "Bug 77414"; like("1", qr/\s*\pN/, $message); like("-", qr/\s*\p{Dash}/, $message); like(" ", qr/\w*\p{Blank}/, $message); diff --git a/t/re/re_tests b/t/re/re_tests index 924434c..b44fb73 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1493,4 +1493,7 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer (?:(?:)foo|bar|zot|rt78356) foo y $& foo /\xe0\pL/i \xc0a y $& \xc0a + +# RT #85528 +(?{})[\x{100}] \x{100} y $& \x{100} # vim: softtabstop=0 noexpandtab -- Perl5 Master Repository
