In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/03fa83ba2035289e6ac69e9f1228252bcc3c0b9d?hp=ca5d3bffbe69ee0415742df798d2d990fce531fc>
- Log ----------------------------------------------------------------- commit 03fa83ba2035289e6ac69e9f1228252bcc3c0b9d Author: Karl Williamson <[email protected]> Date: Wed Sep 3 12:42:07 2014 -0600 regcomp.h: Comment nits M regcomp.h commit e0a1ff7a2452ef34ae8bb33cda6415709f1833fc Author: Karl Williamson <[email protected]> Date: Thu Aug 28 14:22:14 2014 -0600 Allow for changing size of bracketed regex char class This commit allows Perl to be compiled with a bitmap size that is larger than 256. This bitmap is used to directly look up whether a character matches or not, without having to do a binary search or hash lookup. It might improve the performance for some installations that have a lot of use of scripts that are above the Latin1 range. M embedvar.h M intrpvar.h M perl.c M regcomp.c M regcomp.h M regexec.c M sv.c commit 8e8a446824eed109a7c437ac4a417de07db94cc4 Author: Karl Williamson <[email protected]> Date: Thu Aug 28 20:07:30 2014 -0600 Fix -Dr output to work for larger ANYOF node size This generalizes the code for -Dr output to work to dump the contents of ANYOF nodes (bracketed character classes) which have bitmaps for more than code points 0-255. M embed.fnc M embed.h M proto.h M regcomp.c commit 2ab58e930a8796c192de074ed05261cc1616c779 Author: Karl Williamson <[email protected]> Date: Tue Aug 26 08:36:31 2014 -0600 regcomp.c: Swap if/else clauses This makes it slightly easier to understand as there is no explicit complement, but is mostly for a future commit. M regcomp.c commit 93e92956bb470aeaf41fd87a47176cf4906ffd1c Author: Karl Williamson <[email protected]> Date: Thu Aug 28 14:05:40 2014 -0600 Rename some internal regex #defines These are renamed to be more clear as to their actual meanings. I know other people have been confused by their former names. Some of the name changes will become more important as future commits will allow the bitmap in a bracketed character class to be a different size. 
M regcomp.c M regcomp.h M regexec.c commit f64bdbe57e6dafabd081fc3815ae72a00ebd03e6 Author: Karl Williamson <[email protected]> Date: Thu Aug 28 18:19:56 2014 -0600 regcomp.h: Remove some no-longer used #defines This is an internal header, so can change names within it. M regcomp.h commit bc51fd7848385e58210a13810ef5ac6f01f70afb Author: Karl Williamson <[email protected]> Date: Thu Aug 28 14:36:15 2014 -0600 regcomp.h: Use unsigned 1 in left shift This prevents a signed result if this macro ever gets used in a U8. The ANYOF_BITMAP_TEST macro must now be cast or it would generate warnings when compiled with -DPERL_BOOL_AS_CHAR M regcomp.h commit 6f16c8da34619f286a6f24a2d6286f398d3b4503 Author: Karl Williamson <[email protected]> Date: Thu Aug 28 18:50:22 2014 -0600 regcomp.h: Fix comment that said the opposite of the truth Too many negations led to this. M regcomp.h commit 70422107c447d915bfc6189d56be459dadadf660 Author: Karl Williamson <[email protected]> Date: Thu Aug 28 18:13:47 2014 -0600 regcomp.c: Remove unnecessary test The 'while' makes the 'if' unnecessary here. 
M regcomp.c commit 6942fd9a567743c5784c5445ee49c3a4fc1d3b48 Author: Karl Williamson <[email protected]> Date: Wed Aug 27 22:12:02 2014 -0600 regexec.c: Simplify a short code section Two "if"s can be combined, leading to one fewer (unoptimized) test M regexec.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 +- embed.h | 2 +- embedvar.h | 1 + intrpvar.h | 1 + perl.c | 2 + proto.h | 10 ++-- regcomp.c | 195 ++++++++++++++++++++++++++++++++++--------------------------- regcomp.h | 82 +++++++++++++------------- regexec.c | 31 +++++----- sv.c | 1 + 10 files changed, 181 insertions(+), 146 deletions(-) diff --git a/embed.fnc b/embed.fnc index 44f5ebf..0513663 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2194,7 +2194,7 @@ Es |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \ |NULLOK const regnode *last \ |NULLOK const regnode *plast \ |NN SV* sv|I32 indent|U32 depth -Es |void |put_byte |NN SV* sv|int c +Es |void |put_code_point |NN SV* sv|UV c Es |bool |put_charclass_bitmap_innards|NN SV* sv \ |NN char* bitmap \ |NULLOK SV** bitmap_invlist diff --git a/embed.h b/embed.h index 938a5c9..2abc4e2 100644 --- a/embed.h +++ b/embed.h @@ -917,8 +917,8 @@ #define dump_trie_interim_list(a,b,c,d,e) S_dump_trie_interim_list(aTHX_ a,b,c,d,e) #define dump_trie_interim_table(a,b,c,d,e) S_dump_trie_interim_table(aTHX_ a,b,c,d,e) #define dumpuntil(a,b,c,d,e,f,g,h) S_dumpuntil(aTHX_ a,b,c,d,e,f,g,h) -#define put_byte(a,b) S_put_byte(aTHX_ a,b) #define put_charclass_bitmap_innards(a,b,c) S_put_charclass_bitmap_innards(aTHX_ a,b,c) +#define put_code_point(a,b) S_put_code_point(aTHX_ a,b) #define put_range(a,b,c,d) S_put_range(aTHX_ a,b,c,d) #define regdump_extflags(a,b) S_regdump_extflags(aTHX_ a,b) #define regdump_intflags(a,b) S_regdump_intflags(aTHX_ a,b) diff --git a/embedvar.h b/embedvar.h index 766880c..d481681 100644 --- a/embedvar.h +++ b/embedvar.h @@ -53,6 +53,7 @@ #define PL_Dir (vTHX->IDir) #define PL_Env
(vTHX->IEnv) #define PL_HasMultiCharFold (vTHX->IHasMultiCharFold) +#define PL_InBitmap (vTHX->IInBitmap) #define PL_LIO (vTHX->ILIO) #define PL_Latin1 (vTHX->ILatin1) #define PL_Mem (vTHX->IMem) diff --git a/intrpvar.h b/intrpvar.h index 06194d9..57918b2 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -580,6 +580,7 @@ PERLVAR(I, numeric_radix_sv, SV *) /* The radix separator if not '.' */ PERLVAR(I, Latin1, SV *) PERLVAR(I, UpperLatin1, SV *) /* Code points 128 - 255 */ PERLVAR(I, AboveLatin1, SV *) +PERLVAR(I, InBitmap, SV *) PERLVAR(I, NonL1NonFinalFold, SV *) PERLVAR(I, HasMultiCharFold, SV *) diff --git a/perl.c b/perl.c index b61e2ff..8f45273 100644 --- a/perl.c +++ b/perl.c @@ -1034,6 +1034,7 @@ perl_destruct(pTHXx) SvREFCNT_dec(PL_utf8_foldable); SvREFCNT_dec(PL_utf8_foldclosures); SvREFCNT_dec(PL_AboveLatin1); + SvREFCNT_dec(PL_InBitmap); SvREFCNT_dec(PL_UpperLatin1); SvREFCNT_dec(PL_Latin1); SvREFCNT_dec(PL_NonL1NonFinalFold); @@ -1047,6 +1048,7 @@ perl_destruct(pTHXx) PL_utf8_idcont = NULL; PL_utf8_foldclosures = NULL; PL_AboveLatin1 = NULL; + PL_InBitmap = NULL; PL_HasMultiCharFold = NULL; PL_Latin1 = NULL; PL_NonL1NonFinalFold = NULL; diff --git a/proto.h b/proto.h index a6453dc..35ec89b 100644 --- a/proto.h +++ b/proto.h @@ -5382,17 +5382,17 @@ STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, c #define PERL_ARGS_ASSERT_DUMPUNTIL \ assert(r); assert(start); assert(node); assert(sv) -STATIC void S_put_byte(pTHX_ SV* sv, int c) - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_PUT_BYTE \ - assert(sv) - STATIC bool S_put_charclass_bitmap_innards(pTHX_ SV* sv, char* bitmap, SV** bitmap_invlist) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS \ assert(sv); assert(bitmap) +STATIC void S_put_code_point(pTHX_ SV* sv, UV c) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_PUT_CODE_POINT \ + assert(sv) + STATIC void S_put_range(pTHX_ SV* sv, UV 
start, const UV end, const bool allow_literals) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_PUT_RANGE \ diff --git a/regcomp.c b/regcomp.c index ef6cae9..3f12e97 100644 --- a/regcomp.c +++ b/regcomp.c @@ -873,7 +873,7 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc) ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ _append_range_to_invlist(ssc->invlist, 0, UV_MAX); - ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ + ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ } STATIC int @@ -891,7 +891,7 @@ S_ssc_is_anything(const regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); - if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { return FALSE; } @@ -930,7 +930,7 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) Zero(ssc, 1, regnode_ssc); set_ANYOF_SYNTHETIC(ssc); - ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, @@ -1000,7 +1000,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; /* Look at the data structure created by S_set_ANYOF_arg() */ - if (n != ANYOF_NONBITMAP_EMPTY) { + if (n != ANYOF_ONLY_HAS_BITMAP) { SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); @@ -1056,13 +1056,13 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* If this can match all upper Latin1 code points, have to add them * as well */ - if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { _invlist_union(invlist, PL_UpperLatin1, &invlist); } /* Similarly for these */ - if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { - invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + if (ANYOF_FLAGS(node) & 
ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); } if (ANYOF_FLAGS(node) & ANYOF_INVERT) { @@ -1095,8 +1095,8 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) /* 'AND' a given class with another one. Can create false positives. 'ssc' - * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be + * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */ STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, @@ -1187,7 +1187,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* If either P1 or P2 is empty, the intersection will be also; can skip * the loop */ - if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { ANYOF_POSIXL_ZERO(ssc); } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { @@ -1246,16 +1246,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ssc->invlist = anded_cp_list; ANYOF_POSIXL_ZERO(ssc); - if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); } } } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) - || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { /* One or the other of P1, P2 is non-empty. 
*/ - if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); } ssc_union(ssc, anded_cp_list, FALSE); @@ -1317,7 +1317,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, { /* We ignore P2, leaving P1 going forward */ } /* else Not inverted */ - else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { unsigned int i; @@ -1421,8 +1421,8 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); /* The code in this file assumes that all but these flags aren't relevant - * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the - * time we reach here */ + * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared + * by the time we reach here */ assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -1434,7 +1434,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) ssc->invlist = NULL; if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; } assert(! 
(ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); @@ -4235,7 +4235,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { ssc_cp_and(data->start_class, uc); - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { @@ -4243,7 +4243,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } @@ -4418,7 +4418,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } if (flags & SCF_DO_STCLASS_AND) { - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; ANYOF_POSIXL_ZERO(data->start_class); ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } @@ -4427,7 +4427,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; SvREFCNT_dec(EXACTF_invlist); @@ -4546,7 +4546,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_STCLASS_AND; StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { @@ -4842,7 +4843,8 @@ PerlIO_printf(Perl_debug_log, 
"LHS=%"UVuf" RHS=%"UVuf"\n", ssc_intersection(data->start_class, PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; } else if (flags & SCF_DO_STCLASS_OR) { ssc_union(data->start_class, @@ -4852,7 +4854,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", /* See commit msg for * 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } @@ -4879,7 +4882,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", U8 namedclass; /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. 
*/ @@ -5120,7 +5123,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", * assertions are zero-length, so can match an EMPTY * string */ ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; } } } @@ -5192,7 +5196,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", if (f & SCF_DO_STCLASS_AND) { ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -6272,6 +6276,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + + /* This is calculated here, because the Perl program that generates the + * static global ones doesn't currently have access to + * NUM_ANYOF_CODE_POINTS */ + PL_InBitmap = _new_invlist(2); + PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, + NUM_ANYOF_CODE_POINTS - 1); } #endif @@ -6989,7 +7000,7 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && !ssc_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -7069,7 +7080,7 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && ! 
ssc_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -12451,11 +12462,11 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) UV high; int i; - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { + ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; } - else if (end >= 256) { - ANYOF_FLAGS(node) |= ANYOF_UTF8; + else if (end >= NUM_ANYOF_CODE_POINTS) { + ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES; } /* Quit if are above what we should change */ @@ -12478,13 +12489,13 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) invlist_iterfinish(*invlist_ptr); /* Done with loop; remove any code points that are in the bitmap from - * *invlist_ptr; similarly for code points above latin1 if we have a - * flag to match all of them anyways */ + * *invlist_ptr; similarly for code points above the bitmap if we have + * a flag to match all of them anyways */ if (change_invlist) { - _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); } - if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { - _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); } /* If have completely emptied it, remove it completely */ @@ -13646,7 +13657,8 @@ parseit: * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there * is no <depends_list> */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + ANYOF_FLAGS(ret) + |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES; } else { @@ -13865,18 +13877,18 @@ parseit: else { RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; } - ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; ANYOF_POSIXL_ZERO(ret); } /* Coverity thinks it is possible for 
this to be negative; both * jhi and khw think it's not, but be safer */ - assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); /* See if it already matches the complement of this POSIX * class */ - if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) ? -1 : 1))) @@ -14598,7 +14610,7 @@ parseit: if (DEPENDS_SEMANTICS) { /* Under /d, everything in the upper half of the Latin1 range * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII; } else if (AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, everything above ASCII matches these @@ -14904,7 +14916,7 @@ parseit: else { cp_list = depends_list; } - ANYOF_FLAGS(ret) |= ANYOF_UTF8; + ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES; } /* If there is a swash and more than one element, we can't use the swash in @@ -14946,7 +14958,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, { /* Sets the arg field of an ANYOF-type node 'node', using information about * the node passed-in. If there is nothing outside the node's bitmap, the - * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to * the count returned by add_data(), having allocated and stored an array, * av, that that count references, as follows: * av[0] stores the character class description in its textual form. @@ -14972,15 +14984,17 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { assert(! 
(ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); - ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES + |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES))); + ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); } else { AV * const av = newAV(); SV *rv; assert(ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES + |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD)); av_store(av, 0, (runtime_defns) ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); @@ -15046,7 +15060,8 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; assert(ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES + |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -15944,9 +15959,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } - if ((flags & (ANYOF_ABOVE_LATIN1_ALL - |ANYOF_UTF8 - |ANYOF_NONBITMAP_NON_UTF8 + if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP + |ANYOF_HAS_UTF8_NONBITMAP_MATCHES + |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES |ANYOF_LOC_FOLD))) { if (do_sep) { @@ -15956,14 +15971,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "^"); } - if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { sv_catpvs(sv, "{non-utf8-latin1-all}"); } /* output information about the unicode matching */ - if (flags & ANYOF_ABOVE_LATIN1_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) + sv_catpvs(sv, "{above_bitmap_all}"); + else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { SV *lv; /* Set if there is something outside the bit map. 
*/ bool byte_output = FALSE; /* If something in the bitmap has been output */ @@ -15985,7 +16000,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if (*s == '\n') { const char * const t = ++s; - if (flags & ANYOF_NONBITMAP_NON_UTF8) { + if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) { sv_catpvs(sv, "{outside bitmap}"); } else { @@ -16685,12 +16700,21 @@ Perl_save_re_context(pTHX) ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^') STATIC void -S_put_byte(pTHX_ SV *sv, int c) +S_put_code_point(pTHX_ SV *sv, UV c) { - PERL_ARGS_ASSERT_PUT_BYTE; + PERL_ARGS_ASSERT_PUT_CODE_POINT; - if (!isPRINT(c)) { - switch (c) { + if (c > 255) { + Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c); + } + else if (isPRINT(c)) { + const char string = (char) c; + if (isBACKSLASHED_PUNCT(c)) + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } + else { + switch ((U8) c) { case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break; case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break; @@ -16698,15 +16722,9 @@ S_put_byte(pTHX_ SV *sv, int c) case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; - default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", c); break; + default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c); break; } } - else { - const char string = c; - if (isBACKSLASHED_PUNCT(c)) - sv_catpvs(sv, "\\"); - sv_catpvn(sv, &string, 1); - } } #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C @@ -16720,7 +16738,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { /* Appends to 'sv' a displayable version of the range of code points from * 'start' to 'end'. It assumes that only ASCII printables are displayable - * as-is (though some of these will be escaped by put_byte()). */ + * as-is (though some of these will be escaped by put_code_point()). 
*/ const unsigned int min_range_count = 3; @@ -16729,11 +16747,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) PERL_ARGS_ASSERT_PUT_RANGE; while (start <= end) { + UV this_end; + const char * format; + if (end - start < min_range_count) { /* Individual chars in short ranges */ for (; start <= end; start++) { - put_byte(sv, start); + put_code_point(sv, start); } break; } @@ -16805,9 +16826,9 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) put_range(sv, start, temp_end, FALSE); } else { /* Output as a range */ - put_byte(sv, start); + put_code_point(sv, start); sv_catpvs(sv, "-"); - put_byte(sv, temp_end); + put_code_point(sv, temp_end); } start = temp_end + 1; continue; @@ -16818,7 +16839,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) { - put_byte(sv, start); + put_code_point(sv, start); start++; } continue; @@ -16829,11 +16850,9 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) * mnemonic names. Split off any of those at the beginning and end of * the range to print mnemonically. It isn't possible for many of * these to be in a row, so this won't overwhelm with output */ - if (isMNEMONIC_CNTRL(start)) { - while (isMNEMONIC_CNTRL(start) && start <= end) { - put_byte(sv, start); - start++; - } + while (isMNEMONIC_CNTRL(start) && start <= end) { + put_code_point(sv, start); + start++; } if (start < end && isMNEMONIC_CNTRL(end)) { @@ -16850,18 +16869,21 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* Then output the mnemonic trailing controls */ start = temp_end + 1; while (start <= end) { - put_byte(sv, start); + put_code_point(sv, start); start++; } break; } /* As a final resort, output the range or subrange as hex. */ - Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", - start, - (end < NUM_ANYOF_CODE_POINTS) - ? 
end - : NUM_ANYOF_CODE_POINTS - 1); + + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; + format = (this_end < 256) + ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}" + : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; + Perl_sv_catpvf(aTHX_ sv, format, start, this_end); break; } } @@ -16952,8 +16974,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) /* Add everything remaining to the list, so when we invert it just * below, it will be excluded */ - *invlist_ptr = _add_range_to_invlist(*invlist_ptr, - NUM_ANYOF_CODE_POINTS, UV_MAX); + _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr); _invlist_invert(*invlist_ptr); } @@ -17118,7 +17139,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL) ? ANYOF_POSIXL_SKIP : ANYOF_SKIP); node = NEXTOPER(node); diff --git a/regcomp.h b/regcomp.h index 68646f1..2b73d86 100644 --- a/regcomp.h +++ b/regcomp.h @@ -184,7 +184,20 @@ struct regnode_2 { U16 arg2; }; -#define NUM_ANYOF_CODE_POINTS 256 +/* This give the number of code points that can be in the bitmap of an ANYOF + * node. The shift number must currently be one of: 8..12. It can't be less + * than 8 (256) because some code relies on it being at least that. Above 12 + * (4096), and you start running into warnings that some data structure widths + * have been exceeded, though the test suite as of this writing still passes + * for up through 16, which is as high as anyone would ever want to go, + * encompassing all of the Unicode BMP, and thus including all the economically + * important world scripts. At 12 most of them are: including Arabic, + * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han, + * Japanese, nor Korean. 
(The regarglen structure in regnodes.h is a U8, and + * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values + * below above 12.) Be sure to benchmark before changing, as larger sizes do + * significantly slow down the test suite */ +#define NUM_ANYOF_CODE_POINTS (1 << 8) #define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */ @@ -210,11 +223,11 @@ struct regnode_charclass { /* has runtime (locale) \d, \w, ..., [:posix:] classes */ struct regnode_charclass_class { - U8 flags; /* ANYOF_POSIXL bit must go here */ + U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ U8 type; U16 next_off; U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ + char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ U32 classflags; /* and run-time */ }; @@ -228,11 +241,11 @@ struct regnode_charclass_class { * have a pointer field because there is no alignment issue, and because it is * set to NULL after construction, before any cloning of the pattern */ struct regnode_ssc { - U8 flags; /* ANYOF_POSIXL bit must go here */ + U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ U8 type; U16 next_off; U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ + char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ U32 classflags; /* and run-time */ /* Auxiliary, only used during construction; NULL afterwards: list of code @@ -347,13 +360,13 @@ struct regnode_ssc { #define PASS1 SIZE_ONLY #define PASS2 (! SIZE_ONLY) -/* If the bitmap doesn't fully represent what this ANYOF node can match, the +/* If the bitmap fully represents what this ANYOF node can match, the * ARG is set to this special value (since 0, 1, ... are legal, but will never * reach this high). */ -#define ANYOF_NONBITMAP_EMPTY ((U32) -1) +#define ANYOF_ONLY_HAS_BITMAP ((U32) -1) /* Flags for node->flags of ANYOF. These are in short supply, with none - * currently available. The ABOVE_LATIN1_ALL bit could be freed up + * currently available. 
The ABOVE_BITMAP_ALL bit could be freed up * by resorting to creating a swash containing everything above 255. This * introduces a performance penalty. An option that wouldn't slow things down * would be to split one of the two LOC flags out into a separate @@ -365,57 +378,55 @@ struct regnode_ssc { * only for /d, so there are no combinatorial issues. The LOC flag to use is * probably the POSIXL one. * Several flags are not used in synthetic start class (SSC) nodes, so could be - * shared should new flags be needed for SSCs, like ANYOF_EMPTY_STRING now. */ + * shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING + * now. */ /* regexec.c is expecting this to be in the low bit */ -#define ANYOF_INVERT 0x01 +#define ANYOF_INVERT 0x01 /* For the SSC node only, which cannot be inverted, so is shared with that bit. - * This means "Does this SSC match an empty string?" This is used only during - * regex compilation. */ -#define ANYOF_EMPTY_STRING ANYOF_INVERT + * This is used only during regex compilation. */ +#define SSC_MATCHES_EMPTY_STRING ANYOF_INVERT -/* Are there things that will match only if the target string is encoded in - * UTF-8? (This is not set if ANYOF_AOVE_LATIN1_ALL is set) */ -#define ANYOF_UTF8 0x02 +/* Are there things outside the bitmap that will match only if the target + * string is encoded in UTF-8? (This is not set if ANYOF_ABOVE_BITMAP_ALL is + * set) */ +#define ANYOF_HAS_UTF8_NONBITMAP_MATCHES 0x02 /* The fold is calculated and stored in the bitmap where possible at compile * time. However under locale, the actual folding varies depending on * what the locale is at the time of execution, so it has to be deferred until * then */ -#define ANYOF_LOC_FOLD 0x04 +#define ANYOF_LOC_FOLD 0x04 /* Set if this is a regnode_charclass_posixl vs a regnode_charclass. This * is used for runtime \d, \w, [:posix:], ..., which are used only in locale * and the optimizer's synthetic start class. 
Non-locale \d, etc are resolved * at compile-time */ -#define ANYOF_POSIXL 0x08 -#define ANYOF_CLASS ANYOF_POSIXL -#define ANYOF_LARGE ANYOF_POSIXL +#define ANYOF_MATCHES_POSIXL 0x08 /* Should we raise a warning if matching against an above-Unicode code point? * */ -#define ANYOF_WARN_SUPER 0x10 +#define ANYOF_WARN_SUPER 0x10 /* Can match something outside the bitmap that isn't in utf8 */ -#define ANYOF_NONBITMAP_NON_UTF8 0x20 +#define ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES 0x20 -/* Matches every code point 0x100 and above*/ -#define ANYOF_ABOVE_LATIN1_ALL 0x40 -#define ANYOF_UNICODE_ALL ANYOF_ABOVE_LATIN1_ALL +/* Matches every code point NUM_ANYOF_CODE_POINTS and above*/ +#define ANYOF_MATCHES_ALL_ABOVE_BITMAP 0x40 /* Match all Latin1 characters that aren't ASCII when the target string is not * in utf8. */ -#define ANYOF_NON_UTF8_NON_ASCII_ALL 0x80 +#define ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII 0x80 #define ANYOF_FLAGS_ALL (0xff) -#define ANYOF_LOCALE_FLAGS (ANYOF_LOC_FOLD | ANYOF_POSIXL) +#define ANYOF_LOCALE_FLAGS (ANYOF_LOC_FOLD | ANYOF_MATCHES_POSIXL) /* These are the flags that apply to both regular ANYOF nodes and synthetic * start class nodes during construction of the SSC. 
During finalization of * the SSC, other of the flags could be added to it */ -#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER|ANYOF_UTF8) +#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER|ANYOF_HAS_UTF8_NONBITMAP_MATCHES) /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ @@ -500,7 +511,7 @@ struct regnode_ssc { #define ANYOF_FLAGS(p) ((p)->flags) -#define ANYOF_BIT(c) (1 << ((c) & 7)) +#define ANYOF_BIT(c) (1U << ((c) & 7)) #define ANYOF_POSIXL_SET(p, c) (((regnode_charclass_posixl*) (p))->classflags |= (1U << (c))) #define ANYOF_CLASS_SET(p, c) ANYOF_POSIXL_SET((p), (c)) @@ -519,7 +530,7 @@ struct regnode_ssc { #define ANYOF_CLASS_SETALL(ret) ANYOF_POSIXL_SETALL(ret) #define ANYOF_POSIXL_TEST_ANY_SET(p) \ - ((ANYOF_FLAGS(p) & ANYOF_POSIXL) \ + ((ANYOF_FLAGS(p) & ANYOF_MATCHES_POSIXL) \ && (((regnode_charclass_posixl*)(p))->classflags)) #define ANYOF_CLASS_TEST_ANY_SET(p) ANYOF_POSIXL_TEST_ANY_SET(p) @@ -532,7 +543,7 @@ struct regnode_ssc { == ((1U << ((ANYOF_POSIXL_MAX) - 1))) - 1) #define ANYOF_POSIXL_TEST_ALL_SET(p) \ - ((ANYOF_FLAGS(p) & ANYOF_POSIXL) \ + ((ANYOF_FLAGS(p) & ANYOF_MATCHES_POSIXL) \ && ((regnode_charclass_posixl*) (p))->classflags \ == ((1U << ((ANYOF_POSIXL_MAX) - 1))) - 1) @@ -546,19 +557,12 @@ struct regnode_ssc { #define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[(((U8)(c)) >> 3) & 31]) #define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) #define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) -#define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) +#define ANYOF_BITMAP_TEST(p, c) cBOOL(ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) #define ANYOF_BITMAP_SETALL(p) \ memset (ANYOF_BITMAP(p), 255, ANYOF_BITMAP_SIZE) #define ANYOF_BITMAP_CLEARALL(p) \ Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE) -#if ANYOF_BITMAP_SIZE == 32 -/* Check that all 256 bits are all set. 
*/ -# define ANYOF_BITMAP_TESTALLSET(p) /* Assumes sizeof(p) == 32 */ \ - memEQ (ANYOF_BITMAP(p), "\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377", ANYOF_BITMAP_SIZE) -#else -# error Need to fix this if raise bitmap size. (As of this writing this macro is unused in the core) -#endif #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) #define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode)) diff --git a/regexec.c b/regexec.c index b6d163e..52ff312 100644 --- a/regexec.c +++ b/regexec.c @@ -7678,19 +7678,22 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < NUM_ANYOF_CODE_POINTS) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL - && ! utf8_target - && ! isASCII(c)) + else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) + && ! utf8_target + && ! isASCII(c)) { match = TRUE; } else if (flags & ANYOF_LOCALE_FLAGS) { - if (flags & ANYOF_LOC_FOLD) { - if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { - match = TRUE; - } + if ((flags & ANYOF_LOC_FOLD) + && c < 256 + && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) + { + match = TRUE; } - if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + else if (ANYOF_POSIXL_TEST_ANY_SET(n) + && c < 256 + ) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7743,14 +7746,16 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. 
*/ if (!match) { - if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { - match = TRUE; /* Everything above 255 matches */ + if (c >= NUM_ANYOF_CODE_POINTS + && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)) + { + match = TRUE; /* Everything above the bitmap matches */ } - else if ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target && (flags & ANYOF_UTF8)) + else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) + || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES)) || ((flags & ANYOF_LOC_FOLD) && IN_UTF8_CTYPE_LOCALE - && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + && ARG(n) != ANYOF_ONLY_HAS_BITMAP)) { SV* only_utf8_locale = NULL; SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, diff --git a/sv.c b/sv.c index 65aa456..78086b4 100644 --- a/sv.c +++ b/sv.c @@ -14444,6 +14444,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_Latin1 = sv_dup_inc(proto_perl->ILatin1, param); PL_UpperLatin1 = sv_dup_inc(proto_perl->IUpperLatin1, param); PL_AboveLatin1 = sv_dup_inc(proto_perl->IAboveLatin1, param); + PL_InBitmap = sv_dup_inc(proto_perl->IInBitmap, param); PL_NonL1NonFinalFold = sv_dup_inc(proto_perl->INonL1NonFinalFold, param); PL_HasMultiCharFold = sv_dup_inc(proto_perl->IHasMultiCharFold, param); -- Perl5 Master Repository
