In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/0b13e5291ebd9c786dea21905e17886c5a310454?hp=1e2cfe157cae98578de3c274bc64b8ea032b91e0>
- Log -----------------------------------------------------------------
commit 0b13e5291ebd9c786dea21905e17886c5a310454
Author: Karl Williamson <k...@cpan.org>
Date: Sun Feb 4 19:15:00 2018 -0700

    regcomp.c: Comment, white-space only

commit 03a2aaa3c921884eebd5bc46c16b9ed523d8d7fd
Author: Karl Williamson <k...@cpan.org>
Date: Sun Feb 4 18:48:26 2018 -0700

    regcomp.c: Simplify handling of varying loop increments

    Prior to this commit, this loop added 1 to a variable each iteration as
    part of the for(;;). This created some issues for the few cases where
    that increment should be something else. Now, the addition is removed
    from the for(;;), and defaults to 1, so that the code inside doesn't
    have to account for an automatic 1 in the for().

-----------------------------------------------------------------------

Summary of changes:
 regcomp.c | 91 ++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 48 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 8cfe6a1b38..6f89a8ec30 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3706,14 +3706,16 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
  * input nodes.
  *
  * And *unfolded_multi_char is set to indicate whether or not the node contains
- * an unfolded multi-char fold. This happens when whether the fold is valid or
- * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
- * SMALL LETTER SHARP S, as only if the target string being matched against
- * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
- * folding rules depend on the locale in force at runtime. (Multi-char folds
- * whose components are all above the Latin1 range are not run-time locale
- * dependent, and have already been folded by the time this function is
- * called.)
+ * an unfolded multi-char fold.
This happens when it won't be known until + * runtime whether the fold is valid or not; namely + * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the + * target string being matched against turns out to be UTF-8 is that fold + * valid; or + * 2) for EXACTFL nodes whose folding rules depend on the locale in force at + * runtime. + * (Multi-char folds whose components are all above the Latin1 range are not + * run-time locale dependent, and have already been folded by the time this + * function is called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long @@ -13318,6 +13320,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * faster to match */ bool maybe_exact; + /* The node_type may change below, but since the size of the node + * doesn't change, it works */ ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -13347,17 +13351,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * ones, in which case we just leave the node fully filled, and * hope that it doesn't match the string in just the wrong place */ - assert( ! UTF /* Is at the beginning of a character */ + assert( ! UTF /* Is at the beginning of a character */ || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); /* Here, we have a literal character. Find the maximal string of * them in the input that we can fit into a single EXACTish node. * We quit at the first non-literal or when the node gets full */ - for (p = RExC_parse; - len < upper_parse && p < RExC_end; - len++) - { + for (p = RExC_parse; len < upper_parse && p < RExC_end; ) { + + /* In most cases each iteration adds one byte to the output. 
+ * The exceptions override this */ + Size_t added_len = 1; + oldp = p; /* White space has already been ignored */ @@ -13647,8 +13653,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; } /* End of switch on the literal */ - /* Here, have looked at the literal character and <ender> - * contains its ordinal, <p> points to the character after it. + /* Here, have looked at the literal character, and <ender> + * contains its ordinal; <p> points to the character after it. * We need to check if the next non-ignored thing is a * quantifier. Move <p> to after anything that should be * ignored, which, as a side effect, positions <p> for the next @@ -13683,15 +13689,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (UTF && ! UVCHR_IS_INVARIANT(ender)) { const STRLEN unilen = UVCHR_SKIP(ender); s += unilen; - - /* We have to subtract 1 just below (and again in - * the corresponding PASS2 code) because the loop - * increments <len> each time, as all but this path - * (and one other) through it add a single byte to - * the EXACTish node. But these paths would change - * len to be the correct final value, so cancel out - * the increment that follows */ - len += unilen - 1; + added_len = unilen; } else { s++; @@ -13700,7 +13698,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) not_fold_common: if (UTF && ! UVCHR_IS_INVARIANT(ender)) { U8 * new_s = uvchr_to_utf8((U8*)s, ender); - len += (char *) new_s - s - 1; + added_len = (char *) new_s - s; s = (char *) new_s; } else { @@ -13713,6 +13711,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Here are folding under /l, and the code point is * problematic. 
First, we know we can't simplify things */ maybe_exact = FALSE; + + /* This code point means we can't simplify things */ maybe_exactfu = FALSE; /* A problematic code point in this context means that its @@ -13742,7 +13742,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) #endif )) { /* Here, are folding and are not UTF-8 encoded; therefore - * the character must be in the range 0-255, and is not /l + * the character must be in the range 0-255, and is not /l. * (Not /l because we already handled these under /l in * is_PROBLEMATIC_LOCALE_FOLD_cp) */ if (IS_IN_SOME_FOLD_L1(ender)) { @@ -13769,25 +13769,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } - /* Even when folding, we store just the input character, as - * we have an array that finds its fold quickly */ - *(s++) = (char) ender; + /* Even when folding, we store just the input + * character, as we have an array that finds its fold + * quickly */ + *(s++) = (char) ender; } else { /* FOLD, and UTF (or sharp s) */ /* Unlike the non-fold case, we do actually have to - * calculate the results here in pass 1. This is for two - * reasons, the folded length may be longer than the - * unfolded, and we have to calculate how many EXACTish - * nodes it will take; and we may run out of room in a node - * in the middle of a potential multi-char fold, and have - * to back off accordingly. */ + * calculate the fold in pass 1. This is for two reasons, + * the folded length may be longer than the unfolded, and + * we have to calculate how many EXACTish nodes it will + * take; and we may run out of room in a node in the middle + * of a potential multi-char fold, and have to back off + * accordingly. */ UV folded; if (isASCII_uni(ender)) { folded = toFOLD(ender); *(s)++ = (U8) folded; } - else { + else { /* Not ASCII */ STRLEN foldlen; folded = _to_uni_fold_flags( @@ -13798,13 +13799,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ? 
FOLD_FLAGS_NOMIX_ASCII : 0)); s += foldlen; - - /* The loop increments <len> each time, as all but this - * path (and one other) through it add a single byte to - * the EXACTish node. But this one has changed len to - * be the correct final value, so subtract one to - * cancel out the increment that follows */ - len += foldlen - 1; + added_len = foldlen; } /* If this node only contains non-folding code points so * far, see if this new one is also non-folding */ @@ -13825,13 +13820,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ender = folded; } + len += added_len; + if (next_is_quantifier) { /* Here, the next input is a quantifier, and to get here, - * the current character is the only one in the node. - * Also, here <len> doesn't include the final byte for this - * character */ - len++; + * the current character is the only one in the node. */ goto loopdone; } @@ -13889,7 +13883,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) s = (char *) utf8_hop((U8 *) s, -1); while (s >= s0) { /* Search backwards until find - non-problematic char */ + a non-problematic char */ if (UTF8_IS_INVARIANT(*s)) { /* There are no ascii characters that participate @@ -14026,6 +14020,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) : EXACTFU; } } + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, FALSE /* Don't look to see if could be turned into an EXACT -- Perl5 Master Repository