In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/0b13e5291ebd9c786dea21905e17886c5a310454?hp=1e2cfe157cae98578de3c274bc64b8ea032b91e0>

- Log -----------------------------------------------------------------
commit 0b13e5291ebd9c786dea21905e17886c5a310454
Author: Karl Williamson <k...@cpan.org>
Date:   Sun Feb 4 19:15:00 2018 -0700

    regcomp.c: Comment, white-space only

commit 03a2aaa3c921884eebd5bc46c16b9ed523d8d7fd
Author: Karl Williamson <k...@cpan.org>
Date:   Sun Feb 4 18:48:26 2018 -0700

    regcomp.c: Simplify handling of varying loop increments
    
    Prior to this commit, this loop added 1 to a variable each iteration as
    part of the for(;;).  This created some issues for the few cases where
    that increment should be something else.
    
    Now, the addition is removed from the for(;;), and defaults to 1, so
    that the code inside doesn't have to account for an automatic 1 in the
    for().

-----------------------------------------------------------------------

Summary of changes:
 regcomp.c | 91 ++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 48 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 8cfe6a1b38..6f89a8ec30 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3706,14 +3706,16 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t 
*pRExC_state, regnode *sour
  * input nodes.
  *
  * And *unfolded_multi_char is set to indicate whether or not the node contains
- * an unfolded multi-char fold.  This happens when whether the fold is valid or
- * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
- * SMALL LETTER SHARP S, as only if the target string being matched against
- * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
- * folding rules depend on the locale in force at runtime.  (Multi-char folds
- * whose components are all above the Latin1 range are not run-time locale
- * dependent, and have already been folded by the time this function is
- * called.)
+ * an unfolded multi-char fold.  This happens when it won't be known until
+ * runtime whether the fold is valid or not; namely
+ *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
+ *      target string being matched against turns out to be UTF-8 is that fold
+ *      valid; or
+ *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
+ *      runtime.
+ * (Multi-char folds whose components are all above the Latin1 range are not
+ * run-time locale dependent, and have already been folded by the time this
+ * function is called.)
  *
  * This is as good a place as any to discuss the design of handling these
  * multi-character fold sequences.  It's been wrong in Perl for a very long
@@ -13318,6 +13320,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
              * faster to match */
             bool maybe_exact;
 
+            /* The node_type may change below, but since the size of the node
+             * doesn't change, it works */
            ret = reg_node(pRExC_state, node_type);
 
             /* In pass1, folded, we use a temporary buffer instead of the
@@ -13347,17 +13351,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth)
              * ones, in which case we just leave the node fully filled, and
              * hope that it doesn't match the string in just the wrong place */
 
-            assert(   ! UTF     /* Is at the beginning of a character */
+            assert( ! UTF     /* Is at the beginning of a character */
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
             /* Here, we have a literal character.  Find the maximal string of
              * them in the input that we can fit into a single EXACTish node.
              * We quit at the first non-literal or when the node gets full */
-           for (p = RExC_parse;
-                len < upper_parse && p < RExC_end;
-                len++)
-           {
+           for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+
+                /* In most cases each iteration adds one byte to the output.
+                 * The exceptions override this */
+                Size_t added_len = 1;
+
                oldp = p;
 
                 /* White space has already been ignored */
@@ -13647,8 +13653,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                    break;
                } /* End of switch on the literal */
 
-               /* Here, have looked at the literal character and <ender>
-                 * contains its ordinal, <p> points to the character after it.
+               /* Here, have looked at the literal character, and <ender>
+                 * contains its ordinal; <p> points to the character after it.
                  * We need to check if the next non-ignored thing is a
                  * quantifier.  Move <p> to after anything that should be
                  * ignored, which, as a side effect, positions <p> for the next
@@ -13683,15 +13689,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                         if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
                             const STRLEN unilen = UVCHR_SKIP(ender);
                             s += unilen;
-
-                            /* We have to subtract 1 just below (and again in
-                             * the corresponding PASS2 code) because the loop
-                             * increments <len> each time, as all but this path
-                             * (and one other) through it add a single byte to
-                             * the EXACTish node.  But these paths would change
-                             * len to be the correct final value, so cancel out
-                             * the increment that follows */
-                            len += unilen - 1;
+                            added_len = unilen;
                         }
                         else {
                             s++;
@@ -13700,7 +13698,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                       not_fold_common:
                         if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
-                            len += (char *) new_s - s - 1;
+                            added_len = (char *) new_s - s;
                             s = (char *) new_s;
                         }
                         else {
@@ -13713,6 +13711,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                     /* Here are folding under /l, and the code point is
                      * problematic.  First, we know we can't simplify things */
                     maybe_exact = FALSE;
+
+                    /* This code point means we can't simplify things */
                     maybe_exactfu = FALSE;
 
                     /* A problematic code point in this context means that its
@@ -13742,7 +13742,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
 #endif
                 )) {
                     /* Here, are folding and are not UTF-8 encoded; therefore
-                     * the character must be in the range 0-255, and is not /l
+                     * the character must be in the range 0-255, and is not /l.
                      * (Not /l because we already handled these under /l in
                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
                     if (IS_IN_SOME_FOLD_L1(ender)) {
@@ -13769,25 +13769,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth)
                         }
                     }
 
-                    /* Even when folding, we store just the input character, as
-                     * we have an array that finds its fold quickly */
-                    *(s++) = (char) ender;
+                        /* Even when folding, we store just the input
+                         * character, as we have an array that finds its fold
+                         * quickly */
+                        *(s++) = (char) ender;
                 }
                 else {  /* FOLD, and UTF (or sharp s) */
                     /* Unlike the non-fold case, we do actually have to
-                     * calculate the results here in pass 1.  This is for two
-                     * reasons, the folded length may be longer than the
-                     * unfolded, and we have to calculate how many EXACTish
-                     * nodes it will take; and we may run out of room in a node
-                     * in the middle of a potential multi-char fold, and have
-                     * to back off accordingly.  */
+                     * calculate the fold in pass 1.  This is for two reasons,
+                     * the folded length may be longer than the unfolded, and
+                     * we have to calculate how many EXACTish nodes it will
+                     * take; and we may run out of room in a node in the middle
+                     * of a potential multi-char fold, and have to back off
+                     * accordingly.  */
 
                     UV folded;
                     if (isASCII_uni(ender)) {
                         folded = toFOLD(ender);
                         *(s)++ = (U8) folded;
                     }
-                    else {
+                    else {  /* Not ASCII */
                         STRLEN foldlen;
 
                         folded = _to_uni_fold_flags(
@@ -13798,13 +13799,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                                                         ? 
FOLD_FLAGS_NOMIX_ASCII
                                                         : 0));
                         s += foldlen;
-
-                        /* The loop increments <len> each time, as all but this
-                         * path (and one other) through it add a single byte to
-                         * the EXACTish node.  But this one has changed len to
-                         * be the correct final value, so subtract one to
-                         * cancel out the increment that follows */
-                        len += foldlen - 1;
+                        added_len = foldlen;
                     }
                     /* If this node only contains non-folding code points so
                      * far, see if this new one is also non-folding */
@@ -13825,13 +13820,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth)
                     ender = folded;
                }
 
+                len += added_len;
+
                if (next_is_quantifier) {
 
                     /* Here, the next input is a quantifier, and to get here,
-                     * the current character is the only one in the node.
-                     * Also, here <len> doesn't include the final byte for this
-                     * character */
-                    len++;
+                     * the current character is the only one in the node. */
                     goto loopdone;
                }
 
@@ -13889,7 +13883,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                     s = (char *) utf8_hop((U8 *) s, -1);
 
                     while (s >= s0) {   /* Search backwards until find
-                                           non-problematic char */
+                                           a non-problematic char */
                         if (UTF8_IS_INVARIANT(*s)) {
 
                             /* There are no ascii characters that participate
@@ -14026,6 +14020,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                                   : EXACTFU;
                     }
                 }
+
                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
                                            FALSE /* Don't look to see if could
                                                     be turned into an EXACT

-- 
Perl5 Master Repository

Reply via email to