In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/9df83ffded21f5c1d01d1ef847c3447b6553810f?hp=634d6919699655c843f8d8c3ea64922d0403c499>
- Log ----------------------------------------------------------------- commit 9df83ffded21f5c1d01d1ef847c3447b6553810f Author: Nicholas Clark <[email protected]> Date: Mon Jul 12 13:39:19 2010 +0100 Create S_assert_uft8_cache_coherent() with one copy of the cache panic code. Replacing 4 copies of this debugging-only routine with 1 reduces source and object code size. M embed.fnc M embed.h M proto.h M sv.c commit 6ef2ab89d2567e144b289574a2e087dd7eec7894 Author: Nicholas Clark <[email protected]> Date: Mon Jul 12 13:09:28 2010 +0100 Perl_sv_len_utf8 can use the UTF-8 offset cache to reduce its linear scan. Previously, if the scalar's character length wasn't yet known, but an offset midway was, the offset would be ignored, and the linear scan of UTF-8 was for the entire length of the scalar. M sv.c M t/op/length.t commit 0d7caf4cc74eb29a95f71af5a91fef30ca848e41 Author: Nicholas Clark <[email protected]> Date: Mon Jul 12 11:56:59 2010 +0100 Perl_sv_pos_b2u now calls utf8_mg_len_cache_update for the string end offset. Previously it would not take special action if the offset requested happened to be the end of the string, meaning that the (fixed size) UTF-8 offset cache would be used for a value which could (and should) be stored elsewhere. M sv.c M t/op/pos.t commit 79d2d448469df7917fb728ca8a674f771610599c Author: Nicholas Clark <[email protected]> Date: Mon Jul 12 11:38:31 2010 +0100 S_sv_pos_u2b_cached now updates the UTF-8 length cache if at the end of string. Pass in a boolean to S_sv_pos_u2b_forwards, which sets it to true if it discovers that the UTF-8 offset is at (or after) the end of the string. This can only happen if we don't already know the SV's length (in Unicode characters), because if we know it, we always call S_sv_pos_u2b_midway(). M embed.fnc M proto.h M sv.c commit ec49a12ce17d116f4e9bda1c3d385aad560ec655 Author: Nicholas Clark <[email protected]> Date: Mon Jul 12 11:16:41 2010 +0100 Break S_utf8_mg_len_cache_update() out from Perl_sv_len_utf8(). M embed.fnc M embed.h M proto.h M sv.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 7 ++- embed.h | 4 ++ proto.h | 19 ++++++- sv.c | 144 ++++++++++++++++++++++++++++++++------------------------- t/op/length.t | 9 +++- t/op/pos.t | 10 ++++- 6 files changed, 124 insertions(+), 69 deletions(-) diff --git a/embed.fnc b/embed.fnc index 15bd938..85beec1 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1884,16 +1884,21 @@ s |int |sv_2iuv_non_preserve |NN SV *const sv sR |I32 |expect_number |NN char **const pattern # sn |STRLEN |sv_pos_u2b_forwards|NN const U8 *const start \ - |NN const U8 *const send|NN STRLEN *const uoffset + |NN const U8 *const send|NN STRLEN *const uoffset \ + |NN bool *const at_end sn |STRLEN |sv_pos_u2b_midway|NN const U8 *const start \ |NN const U8 *send|STRLEN uoffset|const STRLEN uend s |STRLEN |sv_pos_u2b_cached|NN SV *const sv|NN MAGIC **const mgp \ |NN const U8 *const start|NN const U8 *const send \ |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0 +s |void |utf8_mg_len_cache_update|NN SV *const sv|NN MAGIC **const mgp \ + |const STRLEN ulen s |void |utf8_mg_pos_cache_update|NN SV *const sv|NN MAGIC **const mgp \ |const STRLEN byte|const STRLEN utf8|const STRLEN blen s |STRLEN |sv_pos_b2u_midway|NN const U8 *const s|NN const U8 *const target \ |NN const U8 *end|STRLEN endu +s |void |assert_uft8_cache_coherent|NN const char *const func \ + |STRLEN from_cache|STRLEN real|NN SV *const sv sn |char * |F0convert |NV nv|NN char *const endbuf|NN STRLEN *const len # if defined(PERL_OLD_COPY_ON_WRITE) sM |void |sv_release_COW |NN SV *sv|NN const char *pvx|NN SV *after diff --git a/embed.h b/embed.h index 5e79e58..d7a62dd 100644 --- a/embed.h +++ b/embed.h @@ -1594,8 +1594,10 @@ #define sv_pos_u2b_forwards S_sv_pos_u2b_forwards #define sv_pos_u2b_midway S_sv_pos_u2b_midway #define sv_pos_u2b_cached S_sv_pos_u2b_cached +#define utf8_mg_len_cache_update S_utf8_mg_len_cache_update #define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update #define sv_pos_b2u_midway S_sv_pos_b2u_midway +#define assert_uft8_cache_coherent S_assert_uft8_cache_coherent #define F0convert S_F0convert #endif # if defined(PERL_OLD_COPY_ON_WRITE) @@ -4044,8 +4046,10 @@ #define sv_pos_u2b_forwards S_sv_pos_u2b_forwards #define sv_pos_u2b_midway S_sv_pos_u2b_midway #define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g) +#define utf8_mg_len_cache_update(a,b,c) S_utf8_mg_len_cache_update(aTHX_ a,b,c) #define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e) #define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d) +#define assert_uft8_cache_coherent(a,b,c,d) S_assert_uft8_cache_coherent(aTHX_ a,b,c,d) #define F0convert S_F0convert #endif # if defined(PERL_OLD_COPY_ON_WRITE) diff --git a/proto.h b/proto.h index 535dc78..1824377 100644 --- a/proto.h +++ b/proto.h @@ -5814,12 +5814,13 @@ STATIC I32 S_expect_number(pTHX_ char **const pattern) assert(pattern) # -STATIC STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, STRLEN *const uoffset) +STATIC STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, STRLEN *const uoffset, bool *const at_end) __attribute__nonnull__(1) __attribute__nonnull__(2) - __attribute__nonnull__(3); + __attribute__nonnull__(3) + __attribute__nonnull__(4); #define PERL_ARGS_ASSERT_SV_POS_U2B_FORWARDS \ - assert(start); assert(send); assert(uoffset) + assert(start); assert(send); assert(uoffset); assert(at_end) STATIC STRLEN S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, STRLEN uoffset, const STRLEN uend) __attribute__nonnull__(1) @@ -5835,6 +5836,12 @@ STATIC STRLEN S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U #define PERL_ARGS_ASSERT_SV_POS_U2B_CACHED \ assert(sv); assert(mgp); assert(start); assert(send) +STATIC void S_utf8_mg_len_cache_update(pTHX_ SV *const sv, MAGIC **const mgp, const STRLEN ulen) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT_UTF8_MG_LEN_CACHE_UPDATE \ + assert(sv); assert(mgp) + STATIC void S_utf8_mg_pos_cache_update(pTHX_ SV *const sv, MAGIC **const mgp, const STRLEN byte, const STRLEN utf8, const STRLEN blen) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); @@ -5848,6 +5855,12 @@ STATIC STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *const s, const U8 *const targe #define PERL_ARGS_ASSERT_SV_POS_B2U_MIDWAY \ assert(s); assert(target); assert(end) +STATIC void S_assert_uft8_cache_coherent(pTHX_ const char *const func, STRLEN from_cache, STRLEN real, SV *const sv) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_4); +#define PERL_ARGS_ASSERT_ASSERT_UFT8_CACHE_COHERENT \ + assert(func); assert(sv) + STATIC char * S_F0convert(NV nv, char *const endbuf, STRLEN *const len) __attribute__nonnull__(2) __attribute__nonnull__(3); diff --git a/sv.c b/sv.c index 3e99d9c..39689fd 100644 --- a/sv.c +++ b/sv.c @@ -6047,37 +6047,26 @@ Perl_sv_len_utf8(pTHX_ register SV *const sv) STRLEN ulen; MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; - if (mg && mg->mg_len != -1) { - ulen = mg->mg_len; + if (mg && (mg->mg_len != -1 || mg->mg_ptr)) { + if (mg->mg_len != -1) + ulen = mg->mg_len; + else { + /* We can use the offset cache for a headstart. + The longer value is stored in the first pair. */ + STRLEN *cache = (STRLEN *) mg->mg_ptr; + + ulen = cache[0] + Perl_utf8_length(aTHX_ s + cache[1], + s + len); + } + if (PL_utf8cache < 0) { const STRLEN real = Perl_utf8_length(aTHX_ s, s + len); - if (real != ulen) { - /* Need to turn the assertions off otherwise we may - recurse infinitely while printing error messages. - */ - SAVEI8(PL_utf8cache); - PL_utf8cache = 0; - Perl_croak(aTHX_ "panic: sv_len_utf8 cache %"UVuf - " real %"UVuf" for %"SVf, - (UV) ulen, (UV) real, SVfARG(sv)); - } + assert_uft8_cache_coherent("sv_len_utf8", ulen, real, sv); } } else { ulen = Perl_utf8_length(aTHX_ s, s + len); - if (!SvREADONLY(sv)) { - if (!mg && (SvTYPE(sv) < SVt_PVMG || - !(mg = mg_find(sv, PERL_MAGIC_utf8)))) { - mg = sv_magicext(sv, 0, PERL_MAGIC_utf8, - &PL_vtbl_utf8, 0, 0); - } - assert(mg); - mg->mg_len = ulen; - /* For now, treat "overflowed" as "still unknown". - See RT #72924. */ - if (ulen != (STRLEN) mg->mg_len) - mg->mg_len = -1; - } + utf8_mg_len_cache_update(sv, &mg, ulen); } return ulen; } @@ -6089,7 +6078,7 @@ Perl_sv_len_utf8(pTHX_ register SV *const sv) offset. */ static STRLEN S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, - STRLEN *const uoffset_p) + STRLEN *const uoffset_p, bool *const at_end) { const U8 *s = start; STRLEN uoffset = *uoffset_p; @@ -6100,7 +6089,11 @@ S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send, --uoffset; s += UTF8SKIP(s); } - if (s > send) { + if (s == send) { + *at_end = TRUE; + } + else if (s > send) { + *at_end = TRUE; /* This is the existing behaviour. Possibly it should be a croak, as it's actually a bounds error */ s = send; @@ -6157,6 +6150,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start { STRLEN boffset = 0; /* Actually always set, but let's keep gcc happy. */ bool found = FALSE; + bool at_end = FALSE; PERL_ARGS_ASSERT_SV_POS_U2B_CACHED; @@ -6197,7 +6191,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start uoffset -= uoffset0; boffset = boffset0 + sv_pos_u2b_forwards(start + boffset0, - send, &uoffset); + send, &uoffset, &at_end); uoffset += uoffset0; } } @@ -6239,25 +6233,21 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start STRLEN real_boffset; uoffset -= uoffset0; real_boffset = boffset0 + sv_pos_u2b_forwards(start + boffset0, - send, &uoffset); + send, &uoffset, &at_end); uoffset += uoffset0; - if (found && PL_utf8cache < 0) { - if (real_boffset != boffset) { - /* Need to turn the assertions off otherwise we may recurse - infinitely while printing error messages. */ - SAVEI8(PL_utf8cache); - PL_utf8cache = 0; - Perl_croak(aTHX_ "panic: sv_pos_u2b_cache cache %"UVuf - " real %"UVuf" for %"SVf, - (UV) boffset, (UV) real_boffset, SVfARG(sv)); - } - } + if (found && PL_utf8cache < 0) + assert_uft8_cache_coherent("sv_pos_u2b_cache", boffset, + real_boffset, sv); boffset = real_boffset; } - if (PL_utf8cache) - utf8_mg_pos_cache_update(sv, mgp, boffset, uoffset, send - start); + if (PL_utf8cache) { + if (at_end) + utf8_mg_len_cache_update(sv, mgp, uoffset); + else + utf8_mg_pos_cache_update(sv, mgp, boffset, uoffset, send - start); + } return boffset; } @@ -6358,6 +6348,26 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp } } +static void +S_utf8_mg_len_cache_update(pTHX_ SV *const sv, MAGIC **const mgp, + const STRLEN ulen) +{ + PERL_ARGS_ASSERT_UTF8_MG_LEN_CACHE_UPDATE; + if (SvREADONLY(sv)) + return; + + if (!*mgp && (SvTYPE(sv) < SVt_PVMG || + !(*mgp = mg_find(sv, PERL_MAGIC_utf8)))) { + *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, &PL_vtbl_utf8, 0, 0); + } + assert(*mgp); + + (*mgp)->mg_len = ulen; + /* For now, treat "overflowed" as "still unknown". See RT #72924. */ + if (ulen != (STRLEN) (*mgp)->mg_len) + (*mgp)->mg_len = -1; +} + /* Create and update the UTF8 magic offset cache, with the proffered utf8/ byte length pairing. The (byte) length of the total SV is passed in too, as blen, because for some (more esoteric) SVs, the call to SvPV_const() @@ -6416,14 +6426,8 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *const sv, MAGIC **const mgp, const STRLEN b const U8 *start = (const U8 *) SvPVX_const(sv); const STRLEN realutf8 = utf8_length(start, start + byte); - if (realutf8 != utf8) { - /* Need to turn the assertions off otherwise we may recurse - infinitely while printing error messages. */ - SAVEI8(PL_utf8cache); - PL_utf8cache = 0; - Perl_croak(aTHX_ "panic: utf8_mg_pos_cache_update cache %"UVuf - " real %"UVuf" for %"SVf, (UV) utf8, (UV) realutf8, SVfARG(sv)); - } + assert_uft8_cache_coherent("utf8_mg_pos_cache_update", utf8, realutf8, + sv); } /* Cache is held with the later position first, to simplify the code @@ -6644,23 +6648,37 @@ Perl_sv_pos_b2u(pTHX_ register SV *const sv, I32 *const offsetp) if (!found || PL_utf8cache < 0) { const STRLEN real_len = utf8_length(s, send); - if (found && PL_utf8cache < 0) { - if (len != real_len) { - /* Need to turn the assertions off otherwise we may recurse - infinitely while printing error messages. */ - SAVEI8(PL_utf8cache); - PL_utf8cache = 0; - Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVuf - " real %"UVuf" for %"SVf, - (UV) len, (UV) real_len, SVfARG(sv)); - } - } + if (found && PL_utf8cache < 0) + assert_uft8_cache_coherent("sv_pos_b2u", len, real_len, sv); len = real_len; } *offsetp = len; - if (PL_utf8cache) - utf8_mg_pos_cache_update(sv, &mg, byte, len, blen); + if (PL_utf8cache) { + if (blen == byte) + utf8_mg_len_cache_update(sv, &mg, len); + else + utf8_mg_pos_cache_update(sv, &mg, byte, len, blen); + } +} + +static void +S_assert_uft8_cache_coherent(pTHX_ const char *const func, STRLEN from_cache, + STRLEN real, SV *const sv) +{ + PERL_ARGS_ASSERT_ASSERT_UFT8_CACHE_COHERENT; + + /* As this is debugging only code, save space by keeping this test here, + rather than inlining it in all the callers. */ + if (from_cache == real) + return; + + /* Need to turn the assertions off otherwise we may recurse infinitely + while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: %s cache %"UVuf" real %"UVuf" for %"SVf, + func, (UV) from_cache, (UV) real, SVfARG(sv)); } /* diff --git a/t/op/length.t b/t/op/length.t index eb35720..c73d4c5 100644 --- a/t/op/length.t +++ b/t/op/length.t @@ -6,7 +6,7 @@ BEGIN { @INC = '../lib'; } -plan (tests => 28); +plan (tests => 30); print "not " unless length("") == 0; print "ok 1\n"; @@ -196,3 +196,10 @@ is(length($uo), undef, "Length of overloaded reference"); # ok(!defined $uo); Turns you can't test this. FIXME for pp_defined? is($warnings, 0, "There were no warnings"); + +{ + my $y = "\x{100}BC"; + is(index($y, "B"), 1, 'adds an intermediate position to the offset cache'); + is(length $y, 3, + 'Check that sv_len_utf8() can take advantage of the offset cache'); +} diff --git a/t/op/pos.t b/t/op/pos.t index c3abfbe..04263e1 100644 --- a/t/op/pos.t +++ b/t/op/pos.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 6; +plan tests => 7; $x='banana'; $x=~/.a/g; @@ -28,3 +28,11 @@ $x = "123 56"; $x =~ / /g; is(pos($x), 4); { local $x } is(pos($x), 4); + +# Explict test that triggers the utf8_mg_len_cache_update() code path in +# Perl_sv_pos_b2u(). + +$x = "\x{100}BC"; +$x =~ m/.*/g; +is(pos $x, 3); + -- Perl5 Master Repository
