In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8842e230bdaeef0ab4af48b7d7a9b29ee40b3890?hp=5d9574c10ca04dcefc1cac5441cb321f7bb4cc37>
- Log ----------------------------------------------------------------- commit 8842e230bdaeef0ab4af48b7d7a9b29ee40b3890 Author: Karl Williamson <[email protected]> Date: Wed Mar 12 14:11:58 2014 -0600 regcomp.c: Don't read past string-end In doing an audit of regcomp.c, and experimenting using Encode::_utf8_on(), I found this one instance of a regen/regcharclass.pl macro that could read beyond the end of the string if given malformed UTF-8. Hence we convert to use the 'safe' form. There are no other uses of the non-safe version, so don't need to generate them. ----------------------------------------------------------------------- Summary of changes: regcharclass.h | 26 -------------------------- regcomp.c | 2 +- regen/regcharclass.pl | 2 +- 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index b0f635d..5e34ec0 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -723,15 +723,6 @@ \p{PatWS} */ /*** GENERATED CODE ***/ -#define is_PATWS(s,is_utf8) \ -( ( ( 0x09 <= NATIVE_TO_LATIN1(((U8*)s)[0]) && NATIVE_TO_LATIN1(((U8*)s)[0]) <= 0x0D ) || 0x20 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? 1\ -: ( is_utf8 ) ? \ - ( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ - ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ - : ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == ... [20 chars truncated] -: ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ) - -/*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ ( ((e) > (s)) ? \ ( ( ( 0x09 <= NATIVE_TO_LATIN1(((U8*)s)[0]) && NATIVE_TO_LATIN1(((U8*)s)[0]) <= 0x0D ) || 0x20 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? 1\ @@ -745,14 +736,6 @@ : 0 ) /*** GENERATED CODE ***/ -#define is_PATWS_non_low(s,is_utf8) \ -( ( is_utf8 ) ? \ - ( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ - ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ - : ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == ... [20 chars truncated] -: ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ) - -/*** GENERATED CODE ***/ #define is_PATWS_non_low_safe(s,e,is_utf8) \ ( ((e) > (s)) ? \ ( (! is_utf8) ? \ @@ -773,15 +756,6 @@ ( 0x200F == NATIVE_TO_UNI(cp) || ( 0x200F < NATIVE_TO_UNI(cp) && \ ( 0x2028 == NATIVE_TO_UNI(cp) || 0x2029 == NATIVE_TO_UNI(cp) ) ) ) ) ) ) ) ) ) ) ) -/*** GENERATED CODE ***/ -#define is_PATWS_cp(cp) \ -( ( 0x09 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x0D ) || ( 0x0D < NATIVE_TO_UNI(cp) &&\ -( 0x20 == NATIVE_TO_UNI(cp) || ( 0x20 < NATIVE_TO_UNI(cp) && \ -( 0x85 == NATIVE_TO_UNI(cp) || ( 0x85 < NATIVE_TO_UNI(cp) && \ -( 0x200E == NATIVE_TO_UNI(cp) || ( 0x200E < NATIVE_TO_UNI(cp) && \ -( 0x200F == NATIVE_TO_UNI(cp) || ( 0x200F < NATIVE_TO_UNI(cp) && \ -( 0x2028 == NATIVE_TO_UNI(cp) || 0x2029 == NATIVE_TO_UNI(cp) ) ) ) ) ) ) ) ) ) ) ) - #endif /* H_REGCHARCLASS */ diff --git a/regcomp.c b/regcomp.c index d72d344..ca2ffb8 100644 --- a/regcomp.c +++ b/regcomp.c @@ -11920,7 +11920,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index fa947a3..187023a 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1652,5 +1652,5 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are \p{_Perl_Problematic_Locale_Foldeds_Start} PATWS: pattern white space -=> generic generic_non_low cp : fast safe +=> generic generic_non_low cp : safe \p{PatWS} -- Perl5 Master Repository
