[perl.git] branch blead updated. v5.29.8-145-gbc50875544

Karl Williamson Tue, 19 Mar 2019 09:30:25 -0700

In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/bc508755449a899a1f962877248064475fb91770?hp=52bcf2657b8c6ee52d2eeb841fc2607db893f58f>


- Log -----------------------------------------------------------------
commit bc508755449a899a1f962877248064475fb91770
Merge: 52bcf2657b 695717ad22
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 19 10:25:06 2019 -0600

    Merge branch 'safer' into blead
    
    I undertook an audit to see where there might be places where malformed
    UTF-8 input could cause us to exceed the boundaries of buffers.  I
    looked for places where we hop to other characters in the string, and
    where we skip to the next character based on the current start byte.
    
    This branch is the result of that.  Wherever I couldn't see that
    exceeding the bounds was harmless, I changed the code to use the safer
    versions of the hop and skip operations that we already have.  An
    example of where it isn't a problem is when the operation is done at
    the end of a loop iteration and the loop condition tests whether we've
    exceeded the bounds, with the result that the loop just stops executing.
    
    I may have missed things, and may have changed some places to use the
    safer operations unnecessarily.

commit 695717ad22405f9f3b40e16e6f2ff2b2d2028bca
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 11:02:23 2019 -0600

    regexec.c: Use safe UTF8SKIP

commit 8875b6def42a6a9cc4d0f0ef4b01ba406f5c5e47
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 11:09:22 2019 -0600

    regexec.c: Use safer utf8_hop

commit ec3889a71ffae71ee81108f5ff2b72ed56db0157
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 10:38:56 2019 -0600

    regcomp.c: Use safer utf8_hop

commit e33043825a7d2f1b577f3c8174bba4868d3a5ee3
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 20:16:39 2019 -0600

    regcomp.c: Use safe UTF8SKIP

commit 40c725d15854de75b71b44c83af13d67b9112d53
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 19:48:48 2019 -0600

    pp_ctl.c: Use safe UTF8SKIP

commit 18c47def0203efccddd268e8d0635c40247ce2fa
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 19:27:57 2019 -0600

    pp_pack.c: Use safe UTF8SKIP

commit cf70d9e66cc428b97f59d1cca621579346948015
Author: Karl Williamson <[email protected]>
Date:   Mon Mar 18 10:29:46 2019 -0600

    pp.c: Use safer utf8_hop

-----------------------------------------------------------------------

Summary of changes:
 pp.c      |  4 ++--
 pp_ctl.c  |  3 ++-
 pp_pack.c |  2 +-
 regcomp.c | 43 ++++++++++++++++++++++++++++++-------------
 regexec.c | 61 ++++++++++++++++++++++++++++++++++++-------------------------
 5 files changed, 71 insertions(+), 42 deletions(-)

diff --git a/pp.c b/pp.c
index 42b111ea32..1d83b08e9b 100644
--- a/pp.c
+++ b/pp.c
@@ -6246,7 +6246,7 @@ PP(pp_split)
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
                if (do_utf8)
-                   s = (char*)utf8_hop((U8*)m, len);
+                   s = (char*)utf8_hop_forward((U8*) m, len, (U8*) strend);
                else
                    s = m + len; /* Fake \n at the end */
            }
@@ -6270,7 +6270,7 @@ PP(pp_split)
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
                if (do_utf8)
-                   s = (char*)utf8_hop((U8*)m, len);
+                   s = (char*)utf8_hop_forward((U8*)m, len, (U8 *) strend);
                else
                    s = m + len; /* Fake \n at the end */
            }
diff --git a/pp_ctl.c b/pp_ctl.c
index 17d4f0d14a..a38b9c19b2 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -781,7 +781,8 @@ PP(pp_formline)
                         * for safety */
                        grow = linemax;
                        while (linemark--)
-                           s += UTF8SKIP(s);
+                           s += UTF8_SAFE_SKIP(s,
+                                            (U8 *) SvEND(PL_formtarget));
                        linemark = s - (U8*)SvPVX(PL_formtarget);
                    }
                    /* Easy. They agree.  */
diff --git a/pp_pack.c b/pp_pack.c
index 726f7438a3..33cb086db2 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -290,7 +290,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, 
const char *buf, SSize_t
        if (from >= end) return FALSE;
        val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
        if (retlen == (STRLEN) -1) {
-           from += UTF8SKIP(from);
+           from += UTF8_SAFE_SKIP(from, end);
            bad |= 1;
        } else from += retlen;
        if (val >= 0x100) {
diff --git a/regcomp.c b/regcomp.c
index 275945c388..f44ec79bd1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -706,7 +706,7 @@ static const scan_data_t zero_scan_data = {
 
 /* Used to point after bad bytes for an error message, but avoid skipping
  * past a nul byte. */
-#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 
 /* Set up to clean up after our imminent demise */
 #define PREPARE_TO_DIE                                                      \
@@ -5619,9 +5619,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        STRLEN l;
                        const char * const s = SvPV_const(data->last_found, l);
                        SSize_t old = b - data->last_start_min;
+                        assert(old >= 0);
 
                        if (UTF)
-                           old = utf8_hop((U8*)s, old) - (U8*)s;
+                           old = utf8_hop_forward((U8*)s, old,
+                                               (U8 *) SvEND(data->last_found))
+                                - (U8*)s;
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -10929,7 +10932,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t 
*pRExC_state)
                 return;
             default:
               fail_modifiers:
-                RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                /* diag_listed_as: Sequence (?%s...) not recognized in regex; 
marked by <-- HERE in m/%s/ */
                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
@@ -11341,7 +11344,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp, U32 depth)
 
            } /* End of switch */
            if ( ! op ) {
-               RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+               RExC_parse += UTF
+                              ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                              : 1;
                 if (has_upper || verb_len == 0) {
                     vFAIL2utf8f(
                     "Unknown verb pattern '%" UTF8f "'",
@@ -11421,7 +11426,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp, U32 depth)
                     return handle_named_backref(pRExC_state, flagp,
                                                 parse_start, ')');
                 }
-                RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; 
marked by <-- HERE in m/%s/ */
                vFAIL3("Sequence (%.*s...) not recognized",
                                 RExC_parse-seqstart, seqstart);
@@ -11696,7 +11701,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp, U32 depth)
            case '?':           /* (??...) */
                is_logical = 1;
                if (*RExC_parse != '{') {
-                    RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                    RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                     /* diag_listed_as: Sequence (?%s...) not recognized in 
regex; marked by <-- HERE in m/%s/ */
                     vFAIL2utf8f(
                         "Sequence (%" UTF8f "...) not recognized",
@@ -11894,7 +11899,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp, U32 depth)
 
                  insert_if_check_paren:
                    if (UCHARAT(RExC_parse) != ')') {
-                        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                        RExC_parse += UTF
+                                      ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                      : 1;
                        vFAIL("Switch condition not recognized");
                    }
                    nextchar(pRExC_state);
@@ -11956,7 +11963,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
*flagp, U32 depth)
 #endif
                    return ret;
                }
-                RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                RExC_parse += UTF
+                              ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                              : 1;
                 vFAIL("Unknown switch condition (?(...))");
            }
            case '[':           /* (?[ ... ]) */
@@ -14522,7 +14531,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                 else {
 
                     /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop((U8 *) s, -1);
+                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
 
                     while (s >= s0) {   /* Search backwards until find
                                            a non-problematic char */
@@ -15870,7 +15879,9 @@ redo_curchar:
                             RExC_parse = RExC_end;
                         }
                         else if (RExC_parse != save_parse) {
-                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                            RExC_parse += (UTF)
+                                          ? UTF8_SAFE_SKIP(RExC_parse, 
RExC_end)
+                                          : 1;
                         }
                         vFAIL("Expecting '(?flags:(?[...'");
                     }
@@ -17057,7 +17068,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
 
                }   /* The \p isn't immediately followed by a '{' */
                else if (! isALPHA(*RExC_parse)) {
-                    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                    RExC_parse += (UTF)
+                                  ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                  : 1;
                     vFAIL2("Character following \\%c must be '{' or a "
                            "single-character Unicode property name",
                            (U8) value);
@@ -17226,7 +17239,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                    RExC_parse += numlen;
                     if (numlen != 3) {
                         if (strict) {
-                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                            RExC_parse += (UTF)
+                                          ? UTF8_SAFE_SKIP(RExC_parse, 
RExC_end)
+                                          : 1;
                             vFAIL("Need exactly 3 octal digits");
                         }
                         else if (   numlen < 3 /* like \08, \178 */
@@ -19435,7 +19450,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
                || UTF8_IS_INVARIANT(*RExC_parse)
                || UTF8_IS_START(*RExC_parse));
 
-        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+        RExC_parse += (UTF)
+                      ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                      : 1;
 
         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
                                 FALSE /* Don't force /x */ );
diff --git a/regexec.c b/regexec.c
index cd0a94fa5f..87d02fbd37 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1720,7 +1720,7 @@ STMT_START {
         } else {                                                               
     \
             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, 
&foldlen,    \
                                                                             
flags); \
-            len = UTF8SKIP(uc);                                                
     \
+            len = UTF8_SAFE_SKIP(uc, uc_end);                                  
     \
             skiplen = UVCHR_SKIP( uvc );                                       
     \
             foldlen -= skiplen;                                                
     \
             uscan = foldbuf + skiplen;                                         
     \
@@ -3305,7 +3305,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char 
*stringarg, char *strend,
             RXp_MATCH_UTF8_set(prog, utf8_target);
             prog->offs[0].start = s - strbeg;
             prog->offs[0].end = utf8_target
-                ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+                ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) 
strend) - strbeg
                 : s - strbeg + prog->minlenret;
             if ( !(flags & REXEC_NOT_FIRST) )
                 S_reg_set_capture_string(aTHX_ rx,
@@ -6921,7 +6921,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                         }
                         break;
                 }
-                locinput += UTF8SKIP(locinput);
+                locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
             }
             break;
 
@@ -8242,8 +8242,10 @@ NULL
                             * having to worry about one being shorter than the
                             * other, since the first byte of each gives the
                             * length of the character) */
-                    if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
-                        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                    if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
+                                                              reginfo->strend))
+                        && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
+                                                             reginfo->strend)))
                     {
                         /* simulate B failing */
                         DEBUG_OPTIMISE_r(
@@ -8505,20 +8507,26 @@ NULL
                    n = (ST.oldloc == locinput) ? 0 : 1;
                    if (ST.c1 == ST.c2) {
                        /* set n to utf8_distance(oldloc, locinput) */
-                       while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, 
UTF8SKIP(locinput)))
+                       while (    locinput <= ST.maxpos
+                               &&  locinput < loceol
+                               &&  memNE(locinput, ST.c1_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-                           locinput += UTF8SKIP(locinput);
+                           locinput += UTF8_SAFE_SKIP(locinput,
+                                                       reginfo->strend);
                            n++;
                        }
                    }
                    else {
                        /* set n to utf8_distance(oldloc, locinput) */
-                       while (locinput <= ST.maxpos
-                              && memNE(locinput, ST.c1_utf8, 
UTF8SKIP(locinput))
-                              && memNE(locinput, ST.c2_utf8, 
UTF8SKIP(locinput)))
+                       while (   locinput <= ST.maxpos
+                               && locinput < loceol
+                               && memNE(locinput, ST.c1_utf8,
+                                     UTF8_SAFE_SKIP(locinput, reginfo->strend))
+                               && memNE(locinput, ST.c2_utf8,
+                                    UTF8_SAFE_SKIP(locinput, reginfo->strend)))
                         {
-                           locinput += UTF8SKIP(locinput);
+                           locinput += UTF8_SAFE_SKIP(locinput, 
reginfo->strend);
                            n++;
                        }
                    }
@@ -8596,16 +8604,16 @@ NULL
                 if (ST.c1 != CHRTEST_VOID && could_match) {
                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
                     {
-                        could_match = memEQ(locinput,
-                                            ST.c1_utf8,
-                                            UTF8SKIP(locinput))
-                                    || memEQ(locinput,
-                                             ST.c2_utf8,
-                                             UTF8SKIP(locinput));
+                        could_match =  memEQ(locinput, ST.c1_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend))
+                                    || memEQ(locinput, ST.c2_utf8,
+                                             UTF8_SAFE_SKIP(locinput,
+                                                            reginfo->strend));
                     }
                     else {
-                        could_match = UCHARAT(locinput) == ST.c1
-                                      || UCHARAT(locinput) == ST.c2;
+                        could_match =   UCHARAT(locinput) == ST.c1
+                                     || UCHARAT(locinput) == ST.c2;
                     }
                 }
                 if (ST.c1 == CHRTEST_VOID || could_match) {
@@ -9377,19 +9385,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
                 if (c1 == c2) {
                     while (scan < this_eol
                            && hardcount < max
-                           && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+                           && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                  loceol)))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8SKIP(c1_utf8);
                         hardcount++;
                     }
                 }
                 else {
                     while (scan < this_eol
                            && hardcount < max
-                           && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
-                               || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+                           && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))
+                               || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
+                                                                     loceol))))
                     {
-                        scan += UTF8SKIP(scan);
+                        scan += UTF8_SAFE_SKIP(scan, loceol);
                         hardcount++;
                     }
                 }

-- 
Perl5 Master Repository

[perl.git] branch blead updated. v5.29.8-145-gbc50875544

Reply via email to