In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/bc508755449a899a1f962877248064475fb91770?hp=52bcf2657b8c6ee52d2eeb841fc2607db893f58f>
- Log ----------------------------------------------------------------- commit bc508755449a899a1f962877248064475fb91770 Merge: 52bcf2657b 695717ad22 Author: Karl Williamson <[email protected]> Date: Tue Mar 19 10:25:06 2019 -0600 Merge branch 'safer' into blead I undertook an audit to see where there might be places where malformed UTF-8 input could cause us to exceed the boundaries of buffers. I looked for places where we hop to other characters in the string or skip to the next character based on the current start byte. This branch is the result of that. Wherever I couldn't see that exceeding the bounds wasn't a problem, I changed the code to use the safer versions of the hop and skip operations that we already have. An example of where it isn't a problem is when the operation is done at the end of a loop iteration, and the loop tests whether we've exceeded the bounds, with the result that the loop just stops executing. I may have missed things, and may have unnecessarily changed to the safer operations in places. commit 695717ad22405f9f3b40e16e6f2ff2b2d2028bca Author: Karl Williamson <[email protected]> Date: Mon Mar 18 11:02:23 2019 -0600 regexec.c: Use safe UTF8SKIP commit 8875b6def42a6a9cc4d0f0ef4b01ba406f5c5e47 Author: Karl Williamson <[email protected]> Date: Mon Mar 18 11:09:22 2019 -0600 regexec.c: Use safer utf8_hop commit ec3889a71ffae71ee81108f5ff2b72ed56db0157 Author: Karl Williamson <[email protected]> Date: Mon Mar 18 10:38:56 2019 -0600 regcomp.c: Use safer utf8_hop commit e33043825a7d2f1b577f3c8174bba4868d3a5ee3 Author: Karl Williamson <[email protected]> Date: Mon Mar 18 20:16:39 2019 -0600 regcomp.c: Use safe UTF8SKIP commit 40c725d15854de75b71b44c83af13d67b9112d53 Author: Karl Williamson <[email protected]> Date: Mon Mar 18 19:48:48 2019 -0600 pp_ctl.c: Use safe UTF8SKIP commit 18c47def0203efccddd268e8d0635c40247ce2fa Author: Karl Williamson <[email protected]> Date: Mon Mar 18 19:27:57 2019 -0600 pp_pack.c: Use safe UTF8SKIP commit 
cf70d9e66cc428b97f59d1cca621579346948015 Author: Karl Williamson <[email protected]> Date: Mon Mar 18 10:29:46 2019 -0600 pp.c: Use safer utf8_hop ----------------------------------------------------------------------- Summary of changes: pp.c | 4 ++-- pp_ctl.c | 3 ++- pp_pack.c | 2 +- regcomp.c | 43 ++++++++++++++++++++++++++++++------------- regexec.c | 61 ++++++++++++++++++++++++++++++++++++------------------------- 5 files changed, 71 insertions(+), 42 deletions(-) diff --git a/pp.c b/pp.c index 42b111ea32..1d83b08e9b 100644 --- a/pp.c +++ b/pp.c @@ -6246,7 +6246,7 @@ PP(pp_split) /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ if (do_utf8) - s = (char*)utf8_hop((U8*)m, len); + s = (char*)utf8_hop_forward((U8*) m, len, (U8*) strend); else s = m + len; /* Fake \n at the end */ } @@ -6270,7 +6270,7 @@ PP(pp_split) /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ if (do_utf8) - s = (char*)utf8_hop((U8*)m, len); + s = (char*)utf8_hop_forward((U8*)m, len, (U8 *) strend); else s = m + len; /* Fake \n at the end */ } diff --git a/pp_ctl.c b/pp_ctl.c index 17d4f0d14a..a38b9c19b2 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -781,7 +781,8 @@ PP(pp_formline) * for safety */ grow = linemax; while (linemark--) - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, + (U8 *) SvEND(PL_formtarget)); linemark = s - (U8*)SvPVX(PL_formtarget); } /* Easy. They agree. 
*/ diff --git a/pp_pack.c b/pp_pack.c index 726f7438a3..33cb086db2 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -290,7 +290,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t if (from >= end) return FALSE; val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); if (retlen == (STRLEN) -1) { - from += UTF8SKIP(from); + from += UTF8_SAFE_SKIP(from, end); bad |= 1; } else from += retlen; if (val >= 0x100) { diff --git a/regcomp.c b/regcomp.c index 275945c388..f44ec79bd1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -706,7 +706,7 @@ static const scan_data_t zero_scan_data = { /* Used to point after bad bytes for an error message, but avoid skipping * past a nul byte. */ -#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1) +#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) /* Set up to clean up after our imminent demise */ #define PREPARE_TO_DIE \ @@ -5619,9 +5619,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, STRLEN l; const char * const s = SvPV_const(data->last_found, l); SSize_t old = b - data->last_start_min; + assert(old >= 0); if (UTF) - old = utf8_hop((U8*)s, old) - (U8*)s; + old = utf8_hop_forward((U8*)s, old, + (U8 *) SvEND(data->last_found)) + - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); @@ -10929,7 +10932,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) return; default: fail_modifiers: - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); @@ -11341,7 +11344,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } /* End of switch */ if ( ! op ) { - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? 
UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; if (has_upper || verb_len == 0) { vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'", @@ -11421,7 +11426,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) return handle_named_backref(pRExC_state, flagp, parse_start, ')'); } - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); @@ -11696,7 +11701,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) case '?': /* (??...) */ is_logical = 1; if (*RExC_parse != '{') { - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f( "Sequence (%" UTF8f "...) not recognized", @@ -11894,7 +11899,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) insert_if_check_paren: if (UCHARAT(RExC_parse) != ')') { - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Switch condition not recognized"); } nextchar(pRExC_state); @@ -11956,7 +11963,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) #endif return ret; } - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Unknown switch condition (?(...))"); } case '[': /* (?[ ... 
]) */ @@ -14522,7 +14531,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { /* Point to the first byte of the final character */ - s = (char *) utf8_hop((U8 *) s, -1); + s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); while (s >= s0) { /* Search backwards until find a non-problematic char */ @@ -15870,7 +15879,9 @@ redo_curchar: RExC_parse = RExC_end; } else if (RExC_parse != save_parse) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; } vFAIL("Expecting '(?flags:(?[...'"); } @@ -17057,7 +17068,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* The \p isn't immediately followed by a '{' */ else if (! isALPHA(*RExC_parse)) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL2("Character following \\%c must be '{' or a " "single-character Unicode property name", (U8) value); @@ -17226,7 +17239,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse += numlen; if (numlen != 3) { if (strict) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Need exactly 3 octal digits"); } else if ( numlen < 3 /* like \08, \178 */ @@ -19435,7 +19450,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) || UTF8_IS_INVARIANT(*RExC_parse) || UTF8_IS_START(*RExC_parse)); - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? 
UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force /x */ ); diff --git a/regexec.c b/regexec.c index cd0a94fa5f..87d02fbd37 100644 --- a/regexec.c +++ b/regexec.c @@ -1720,7 +1720,7 @@ STMT_START { } else { \ uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ flags); \ - len = UTF8SKIP(uc); \ + len = UTF8_SAFE_SKIP(uc, uc_end); \ skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ @@ -3305,7 +3305,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; prog->offs[0].end = utf8_target - ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg : s - strbeg + prog->minlenret; if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, @@ -6921,7 +6921,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; } - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); } break; @@ -8242,8 +8242,10 @@ NULL * having to worry about one being shorter than the * other, since the first byte of each gives the * length of the character) */ - if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend))) { /* simulate B failing */ DEBUG_OPTIMISE_r( @@ -8505,20 +8507,26 @@ NULL n = (ST.oldloc == locinput) ? 
0 : 1; if (ST.c1 == ST.c2) { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, + reginfo->strend); n++; } } else { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend)) + && memNE(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); n++; } } @@ -8596,16 +8604,16 @@ NULL if (ST.c1 != CHRTEST_VOID && could_match) { if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) { - could_match = memEQ(locinput, - ST.c1_utf8, - UTF8SKIP(locinput)) - || memEQ(locinput, - ST.c2_utf8, - UTF8SKIP(locinput)); + could_match = memEQ(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + || memEQ(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)); } else { - could_match = UCHARAT(locinput) == ST.c1 - || UCHARAT(locinput) == ST.c2; + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; } } if (ST.c1 == CHRTEST_VOID || could_match) { @@ -9377,19 +9385,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == c2) { while (scan < this_eol && hardcount < max - && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol))) { - scan += UTF8SKIP(scan); + scan += UTF8SKIP(c1_utf8); hardcount++; } } else { while (scan < this_eol && hardcount < max - && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) - || memEQ(scan, c2_utf8, 
UTF8SKIP(scan)))) + && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol)) + || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan, + loceol)))) { - scan += UTF8SKIP(scan); + scan += UTF8_SAFE_SKIP(scan, loceol); hardcount++; } } -- Perl5 Master Repository
