In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/3e2d3818e517e0037c1ab6a482f31d50271f9e27?hp=65cccc5e92d46159b5887e72094aab44ee260ea3>
- Log ----------------------------------------------------------------- commit 3e2d3818e517e0037c1ab6a482f31d50271f9e27 Author: Nicholas Clark <[email protected]> Date: Sun Jul 11 20:11:10 2010 +0100 Avoid UTF-8 cache panics with offsets beyond the string. Fixes RT #75898. Change S_sv_pos_u2b_forwards() to take a pointer to the (requested) UTF-8 offset, and return the actual UTF-8 offset for the byte position returned. This ensures that the cache is consistent with reality. M embed.fnc M proto.h M sv.c M t/op/index.t commit 48f9cf718354a5326f9e9d40d02a063952160024 Author: Nicholas Clark <[email protected]> Date: Sun Jul 11 17:17:37 2010 +0100 In Perl_sv_pos_u2b_flags and S_sv_pos_u2b_cached, return early for offset 0. 0 Unicode characters are always 0 octets long. Returning early ensures that any offsets we calculate later will always be non-zero. M sv.c commit 503752a16bf16b90ff6c073c4bd5d818e68a2e2a Author: Nicholas Clark <[email protected]> Date: Sun Jul 11 16:49:29 2010 +0100 In S_sv_pos_u2b_midway, inline the call to S_sv_pos_u2b_forwards. 
M embed.fnc M proto.h M sv.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 6 +++--- proto.h | 11 ++++++----- sv.c | 40 ++++++++++++++++++++++++++++++---------- t/op/index.t | 11 ++++++++++- 4 files changed, 49 insertions(+), 19 deletions(-) diff --git a/embed.fnc b/embed.fnc index 1ba9041..d3f14b1 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1883,12 +1883,12 @@ s |int |sv_2iuv_non_preserve |NN SV *const sv sR |I32 |expect_number |NN char **const pattern # sn |STRLEN |sv_pos_u2b_forwards|NN const U8 *const start \ - |NN const U8 *const send|STRLEN uoffset + |NN const U8 *const send|NN STRLEN *const uoffset sn |STRLEN |sv_pos_u2b_midway|NN const U8 *const start \ - |NN const U8 *send|const STRLEN uoffset|const STRLEN uend + |NN const U8 *send|STRLEN uoffset|const STRLEN uend s |STRLEN |sv_pos_u2b_cached|NN SV *const sv|NN MAGIC **const mgp \ |NN const U8 *const start|NN const U8 *const send \ - |const STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0 + |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0 s |void |utf8_mg_pos_cache_update|NN SV *const sv|NN MAGIC **const mgp \ |const STRLEN byte|const STRLEN utf8|const STRLEN blen s |STRLEN |sv_pos_b2u_midway|NN const U8 *const s|NN const U8 *const target \ diff --git a/proto.h b/proto.h index f25b40c..c1c0f05 100644 --- a/proto.h +++ b/proto.h @@ -5809,19 +5809,20 @@ STATIC I32 S_expect_number(pTHX_ char **const pattern) assert(pattern) # -STATIC STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, STRLEN uoffset) +STATIC STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, STRLEN *const uoffset) __attribute__nonnull__(1) - __attribute__nonnull__(2); + __attribute__nonnull__(2) + __attribute__nonnull__(3); #define PERL_ARGS_ASSERT_SV_POS_U2B_FORWARDS \ - assert(start); assert(send) + assert(start); assert(send); assert(uoffset) -STATIC STRLEN S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, const STRLEN uoffset, const 
STRLEN uend) +STATIC STRLEN S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, STRLEN uoffset, const STRLEN uend) __attribute__nonnull__(1) __attribute__nonnull__(2); #define PERL_ARGS_ASSERT_SV_POS_U2B_MIDWAY \ assert(start); assert(send) -STATIC STRLEN S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start, const U8 *const send, const STRLEN uoffset, STRLEN uoffset0, STRLEN boffset0) +STATIC STRLEN S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start, const U8 *const send, STRLEN uoffset, STRLEN uoffset0, STRLEN boffset0) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) diff --git a/sv.c b/sv.c index c38a318..2f13091 100644 --- a/sv.c +++ b/sv.c @@ -6089,19 +6089,23 @@ Perl_sv_len_utf8(pTHX_ register SV *const sv) offset. */ static STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, - STRLEN uoffset) + STRLEN *const uoffset_p) { const U8 *s = start; + STRLEN uoffset = *uoffset_p; PERL_ARGS_ASSERT_SV_POS_U2B_FORWARDS; - while (s < send && uoffset--) + while (s < send && uoffset) { + --uoffset; s += UTF8SKIP(s); + } if (s > send) { /* This is the existing behaviour. Possibly it should be a croak, as it's actually a bounds error */ s = send; } + *uoffset_p -= uoffset; return s - start; } @@ -6110,7 +6114,7 @@ S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, the passed in UTF-8 offset. */ static STRLEN S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, - const STRLEN uoffset, const STRLEN uend) + STRLEN uoffset, const STRLEN uend) { STRLEN backw = uend - uoffset; @@ -6120,7 +6124,14 @@ S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, /* The assumption is that going forwards is twice the speed of going forward (that's where the 2 * backw comes from). (The real figure of course depends on the UTF-8 data.) 
*/ - return sv_pos_u2b_forwards(start, send, uoffset); + const U8 *s = start; + + while (s < send && uoffset--) + s += UTF8SKIP(s); + assert (s <= send); + if (s > send) + s = send; + return s - start; } while (backw--) { @@ -6141,7 +6152,7 @@ S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, created if necessary, and the found value offered to it for update. */ static STRLEN S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start, - const U8 *const send, const STRLEN uoffset, + const U8 *const send, STRLEN uoffset, STRLEN uoffset0, STRLEN boffset0) { STRLEN boffset = 0; /* Actually always set, but let's keep gcc happy. */ @@ -6151,6 +6162,9 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start assert (uoffset >= uoffset0); + if (!uoffset) + return 0; + if (!SvREADONLY(sv) && PL_utf8cache && (*mgp || (SvTYPE(sv) >= SVt_PVMG && @@ -6180,9 +6194,11 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start uoffset - uoffset0, (*mgp)->mg_len - uoffset0); } else { + uoffset -= uoffset0; boffset = boffset0 + sv_pos_u2b_forwards(start + boffset0, - send, uoffset - uoffset0); + send, &uoffset); + uoffset += uoffset0; } } else if (cache[2] < uoffset) { @@ -6220,9 +6236,11 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start } if (!found || PL_utf8cache < 0) { - const STRLEN real_boffset - = boffset0 + sv_pos_u2b_forwards(start + boffset0, - send, uoffset - uoffset0); + STRLEN real_boffset; + uoffset -= uoffset0; + real_boffset = boffset0 + sv_pos_u2b_forwards(start + boffset0, + send, &uoffset); + uoffset += uoffset0; if (found && PL_utf8cache < 0) { if (real_boffset != boffset) { @@ -6280,7 +6298,9 @@ Perl_sv_pos_u2b_flags(pTHX_ SV *const sv, STRLEN uoffset, STRLEN *const lenp, MAGIC *mg = NULL; boffset = sv_pos_u2b_cached(sv, &mg, start, send, uoffset, 0, 0); - if (lenp) { + if (lenp + && *lenp /* don't bother doing work for 0, as its bytes equivalent + 
is 0, and *lenp is already set to that. */) { /* Convert the relative offset to absolute. */ const STRLEN uoffset2 = uoffset + *lenp; const STRLEN boffset2 diff --git a/t/op/index.t b/t/op/index.t index 59b5542..5ef69fc 100644 --- a/t/op/index.t +++ b/t/op/index.t @@ -7,7 +7,7 @@ BEGIN { } use strict; -plan( tests => 111 ); +plan( tests => 113 ); run_tests() unless caller; @@ -194,4 +194,13 @@ SKIP: { } } +{ + # RT#75898 + is(eval { utf8::upgrade($_ = " "); index $_, " ", 72 }, -1, + 'UTF-8 cache handles offset beyond the end of the string'); + $_ = "\x{100}BC"; + is(index($_, "C", 4), -1, + 'UTF-8 cache handles offset beyond the end of the string'); +} + } -- Perl5 Master Repository
