In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/2d66f61e3a78c413bbb02ffa1bab2ad8aa6aed5a?hp=6e89a33a76ad97097e5b456387f1289b6addf741>
- Log ----------------------------------------------------------------- commit 2d66f61e3a78c413bbb02ffa1bab2ad8aa6aed5a Author: Karl Williamson <[email protected]> Date: Wed Jan 1 11:36:17 2014 -0700 regexec.c: Clarify comment M regexec.c commit 3db24e1e3c8c10bf892a8f48ff3d780fdd1f88a0 Author: Karl Williamson <[email protected]> Date: Wed Jan 1 09:59:20 2014 -0700 regexec.c: Guard against malformed UTF-8 in [...] The code that handles bracketed character classes assumed that the string being matched against did not have the too-short malformation; this could lead to reading beyond-the-end-of-buffer. (It did check for other malformations.) This is solved by changing the function that operates on bracketed character classes to take and use an extra parameter, the actaul buffer end. M embed.fnc M embed.h M proto.h M regexec.c commit f3943cf2ca1b7a02583ef8bbeb1ced9414bf1fac Author: Karl Williamson <[email protected]> Date: Wed Jan 1 09:52:55 2014 -0700 pp.c: Remove unnecessary mask operation. An unsigned character (U8) should not have more than 8 bits of data, so no need to force that by masking with 0xFF. M pp.c commit 4f6386b6c255e97472036daac543efef3399b495 Author: Karl Williamson <[email protected]> Date: Wed Jan 1 09:49:04 2014 -0700 pp.c: Guard against malformed UTF-8 input in ord() This code got the actual length of the input scalar, but discarded it. If that scalar contains malformed UTF-8 that has fewer bytes than is indicated, a read beyond-buffer-end could happen. Simply use the actual length. M pp.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 5 ++++- embed.h | 2 +- pp.c | 5 +++-- proto.h | 7 ++++--- regexec.c | 26 +++++++++++++++----------- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/embed.fnc b/embed.fnc index dd9e6cf..18610ae 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2144,7 +2144,10 @@ ERs |I32 |regrepeat |NN regexp *prog|NN char **startposp \ |I32 max \ |int depth ERs |I32 |regtry |NN regmatch_info *reginfo|NN char **startposp -ERs |bool |reginclass |NULLOK regexp * const prog|NN const regnode * const n|NN const U8 * const p\ +ERs |bool |reginclass |NULLOK regexp * const prog \ + |NN const regnode * const n \ + |NN const U8 * const p \ + |NN const U8 * const p_end \ |bool const utf8_target Es |CHECKPOINT|regcppush |NN const regexp *rex|I32 parenfloor\ |U32 maxopenparen diff --git a/embed.h b/embed.h index b84b5e4..e918bcb 100644 --- a/embed.h +++ b/embed.h @@ -1002,7 +1002,7 @@ #define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c) #define reghop3 S_reghop3 #define reghopmaybe3 S_reghopmaybe3 -#define reginclass(a,b,c,d) S_reginclass(aTHX_ a,b,c,d) +#define reginclass(a,b,c,d,e) S_reginclass(aTHX_ a,b,c,d,e) #define regmatch(a,b,c) S_regmatch(aTHX_ a,b,c) #define regrepeat(a,b,c,d,e,f) S_regrepeat(aTHX_ a,b,c,d,e,f) #define regtry(a,b) S_regtry(aTHX_ a,b) diff --git a/pp.c b/pp.c index dd4d89a..6925f49 100644 --- a/pp.c +++ b/pp.c @@ -3324,12 +3324,13 @@ PP(pp_ord) if (PL_encoding && SvPOK(argsv) && !DO_UTF8(argsv)) { SV * const tmpsv = sv_2mortal(newSVsv(argsv)); s = (U8*)sv_recode_to_utf8(tmpsv, PL_encoding); + len = UTF8SKIP(s); /* Should be well-formed; so this is its length */ argsv = tmpsv; } XPUSHu(DO_UTF8(argsv) - ? utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV) - : (UV)(*s & 0xff)); + ? utf8n_to_uvchr(s, len, 0, UTF8_ALLOW_ANYUV) + : (UV)(*s)); RETURN; } diff --git a/proto.h b/proto.h index c486045..647d357 100644 --- a/proto.h +++ b/proto.h @@ -7158,12 +7158,13 @@ STATIC U8* S_reghopmaybe3(U8 *s, SSize_t off, const U8 *lim) #define PERL_ARGS_ASSERT_REGHOPMAYBE3 \ assert(s); assert(lim) -STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8 * const p, bool const utf8_target) +STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8 * const p, const U8 * const p_end, bool const utf8_target) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_2) - __attribute__nonnull__(pTHX_3); + __attribute__nonnull__(pTHX_3) + __attribute__nonnull__(pTHX_4); #define PERL_ARGS_ASSERT_REGINCLASS \ - assert(n); assert(p) + assert(n); assert(p); assert(p_end) STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) __attribute__warn_unused_result__ diff --git a/regexec.c b/regexec.c index d0e0fd2..3e82bc4 100644 --- a/regexec.c +++ b/regexec.c @@ -101,10 +101,10 @@ static const char* const non_utf8_target_but_utf8_required #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -1392,12 +1392,13 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1491,7 +1492,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOF_SYNTHETIC: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -4323,7 +4324,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4388,7 +4390,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -7002,7 +7005,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -7401,6 +7404,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7412,7 +7416,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7425,7 +7429,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for -- Perl5 Master Repository
