In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8c75ba159ca49f468760e532410c0af95809dc6d?hp=93f6e112d1711b0b220ea546f5fcb0f9ef445b66>
- Log ----------------------------------------------------------------- commit 8c75ba159ca49f468760e532410c0af95809dc6d Author: Hugo van der Sanden <[email protected]> Date: Thu Dec 11 15:27:07 2014 +0000 make the EXACTF_invlist only when SCF_DO_STCLASS The data is used only for STCLASS, and it's somewhat expensive to create. ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 + embed.h | 1 + proto.h | 7 ++ regcomp.c | 311 ++++++++++++++++++++++++++++++++------------------------------ 4 files changed, 172 insertions(+), 149 deletions(-) diff --git a/embed.fnc b/embed.fnc index 963a00f..16c33a2 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1528,6 +1528,8 @@ EiMn |void |invlist_iterinit|NN SV* invlist EsMRn |bool |invlist_iternext|NN SV* invlist|NN UV* start|NN UV* end EiMn |void |invlist_iterfinish|NN SV* invlist EiMRn |UV |invlist_highest|NN SV* const invlist +EMRs |SV* |_make_exactf_invlist |NN RExC_state_t *pRExC_state \ + |NN regnode *node #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C) EXmM |void |_invlist_intersection |NN SV* const a|NN SV* const b|NN SV** i diff --git a/embed.h b/embed.h index 2bf125e..8e71a42 100644 --- a/embed.h +++ b/embed.h @@ -952,6 +952,7 @@ # if defined(PERL_IN_REGCOMP_C) #define _append_range_to_invlist(a,b,c) S__append_range_to_invlist(aTHX_ a,b,c) #define _invlist_array_init S__invlist_array_init +#define _make_exactf_invlist(a,b) S__make_exactf_invlist(aTHX_ a,b) #define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) #define add_data S_add_data diff --git a/proto.h b/proto.h index 3345d1c..ce86fca 100644 --- a/proto.h +++ b/proto.h @@ -6864,6 +6864,13 @@ PERL_STATIC_INLINE UV* S__invlist_array_init(SV* const invlist, const bool will_ #define PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT \ assert(invlist) +STATIC SV* S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST \ + assert(pRExC_state); assert(node) + STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_3); diff --git a/regcomp.c b/regcomp.c index a1784c7..e5d6a76 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4513,9 +4513,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is EXACTFish */ SSize_t l = STR_LEN(scan); - UV uc = *((U8*)STRING(scan)); - SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 - separate code points */ const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ @@ -4524,7 +4521,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } if (unfolded_multi_char) { @@ -4544,156 +4540,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } - if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { - ssc_clear_locale(data->start_class); - } + if (flags & SCF_DO_STCLASS) { + SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); - if (! UTF) { - - /* We punt and assume can match anything if the node begins - * with a multi-character fold. Things are complicated. For - * example, /ffi/i could match any of: - * "\N{LATIN SMALL LIGATURE FFI}" - * "\N{LATIN SMALL LIGATURE FF}I" - * "F\N{LATIN SMALL LIGATURE FI}" - * plus several other things; and making sure we have all the - * possibilities is hard. */ - if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { - EXACTF_invlist = - _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); - } - else { - - /* Any Latin1 range character can potentially match any - * other depending on the locale */ - if (OP(scan) == EXACTFL) { - _invlist_union(EXACTF_invlist, PL_Latin1, - &EXACTF_invlist); - } - else { - /* But otherwise, it matches at least itself. We can - * quickly tell if it has a distinct fold, and if so, - * it matches that as well */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); - if (IS_IN_SOME_FOLD_L1(uc)) { - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, - PL_fold_latin1[uc]); - } - } - - /* Some characters match above-Latin1 ones under /i. This - * is true of EXACTFL ones when the locale is UTF-8 */ - if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) - && (! isASCII(uc) || (OP(scan) != EXACTFA - && OP(scan) != EXACTFA_NO_TRIE))) - { - add_above_Latin1_folds(pRExC_state, - (U8) uc, - &EXACTF_invlist); - } - } - } - else { /* Pattern is UTF-8 */ - U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; - STRLEN foldlen = UTF8SKIP(s); - const U8* e = s + STR_LEN(scan); - SV** listp; - - /* The only code points that aren't folded in a UTF EXACTFish - * node are are the problematic ones in EXACTFL nodes */ - if (OP(scan) == EXACTFL - && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) - { - /* We need to check for the possibility that this EXACTFL - * node begins with a multi-char fold. Therefore we fold - * the first few characters of it so that we can make that - * check */ - U8 *d = folded; - int i; - - for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { - if (isASCII(*s)) { - *(d++) = (U8) toFOLD(*s); - s++; - } - else { - STRLEN len; - to_utf8_fold(s, d, &len); - d += len; - s += UTF8SKIP(s); - } - } - - /* And set up so the code below that looks in this folded - * buffer instead of the node's string */ - e = d; - foldlen = UTF8SKIP(folded); - s = folded; - } - - /* When we reach here 's' points to the fold of the first - * character(s) of the node; and 'e' points to far enough along - * the folded string to be just past any possible multi-char - * fold. 'foldlen' is the length in bytes of the first - * character in 's' - * - * Unlike the non-UTF-8 case, the macro for determining if a - * string is a multi-char fold requires all the characters to - * already be folded. This is because of all the complications - * if not. Note that they are folded anyway, except in EXACTFL - * nodes. Like the non-UTF case above, we punt if the node - * begins with a multi-char fold */ - - if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { - EXACTF_invlist = - _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + assert(EXACTF_invlist); + if (flags & SCF_DO_STCLASS_AND) { + if (OP(scan) != EXACTFL) + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } - else { /* Single char fold */ - - /* It matches all the things that fold to it, which are - * found in PL_utf8_foldclosures (including itself) */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); - if (! PL_utf8_foldclosures) { - _load_PL_utf8_foldclosures(); - } - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) s, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_tindex(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - assert(c_p); - - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non- */ - if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) - && isASCII(c) != isASCII(uc)) - { - continue; - } + else { /* SCF_DO_STCLASS_OR */ + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); - } - } + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } - if (flags & SCF_DO_STCLASS_AND) { - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - ANYOF_POSIXL_ZERO(data->start_class); - ssc_intersection(data->start_class, EXACTF_invlist, FALSE); - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; @@ -9501,6 +9368,152 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) } #endif +/* + * As best we can, determine the characters that can match the start of + * the given EXACTF-ish node. + * + * Returns the invlist as a new SV*; it is the caller's responsibility to + * call SvREFCNT_dec() when done with it. + */ +STATIC SV* +S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) +{ + const U8 * s = (U8*)STRING(node); + SSize_t bytelen = STR_LEN(node); + UV uc; + /* Start out big enough for 2 separate code points */ + SV* invlist = _new_invlist(4); + + PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; + + if (! UTF) { + uc = *s; + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(node) == EXACTFL) { + _invlist_union(invlist, PL_Latin1, &invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + invlist = add_cp_to_invlist(invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) + invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(node) != EXACTFA + && OP(node) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + bytelen; + SV** listp; + + uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + invlist = add_cp_to_invlist(invlist, uc); + if (! PL_utf8_foldclosures) + _load_PL_utf8_foldclosures(); + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + invlist = add_cp_to_invlist(invlist, c); + } + } + } + } + + return invlist; +} + #undef HEADER_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE -- Perl5 Master Repository
