In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/9df83ffded21f5c1d01d1ef847c3447b6553810f?hp=634d6919699655c843f8d8c3ea64922d0403c499>

- Log -----------------------------------------------------------------
commit 9df83ffded21f5c1d01d1ef847c3447b6553810f
Author: Nicholas Clark <[email protected]>
Date:   Mon Jul 12 13:39:19 2010 +0100

    Create S_assert_uft8_cache_coherent() with one copy of the cache panic code.
    
    Replacing 4 copies of this debugging-only routine with 1 reduces source and
    object code size.

M       embed.fnc
M       embed.h
M       proto.h
M       sv.c

commit 6ef2ab89d2567e144b289574a2e087dd7eec7894
Author: Nicholas Clark <[email protected]>
Date:   Mon Jul 12 13:09:28 2010 +0100

    Perl_sv_len_utf8 can use the UTF-8 offset cache to reduce its linear scan.
    
    Previously, if the scalar's character length wasn't yet known, but an offset
    midway was, the offset would be ignored, and the linear scan of UTF-8 was 
for
    the entire length of the scalar.

M       sv.c
M       t/op/length.t

commit 0d7caf4cc74eb29a95f71af5a91fef30ca848e41
Author: Nicholas Clark <[email protected]>
Date:   Mon Jul 12 11:56:59 2010 +0100

    Perl_sv_pos_b2u now calls utf8_mg_len_cache_update for the string end 
offset.
    
    Previously it would not take special action if the offset requested 
happened to
    be the end of the string, meaning that the (fixed size) UTF-8 offset cache
    would be used for a value which could (and should) be stored elsewhere.

M       sv.c
M       t/op/pos.t

commit 79d2d448469df7917fb728ca8a674f771610599c
Author: Nicholas Clark <[email protected]>
Date:   Mon Jul 12 11:38:31 2010 +0100

    S_sv_pos_u2b_cached now updates the UTF-8 length cache if at the end of 
string.
    
    Pass in a boolean to S_sv_pos_u2b_forwards, which sets it to true if it
    discovers that the UTF-8 offset is at (or after) the end of the string.
    This can only happen if we don't already know the SV's length (in Unicode
    characters), because if we know it, we always call S_sv_pos_u2b_midway().

M       embed.fnc
M       proto.h
M       sv.c

commit ec49a12ce17d116f4e9bda1c3d385aad560ec655
Author: Nicholas Clark <[email protected]>
Date:   Mon Jul 12 11:16:41 2010 +0100

    Break S_utf8_mg_len_cache_update() out from Perl_sv_len_utf8().

M       embed.fnc
M       embed.h
M       proto.h
M       sv.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc     |    7 ++-
 embed.h       |    4 ++
 proto.h       |   19 ++++++-
 sv.c          |  144 ++++++++++++++++++++++++++++++++-------------------------
 t/op/length.t |    9 +++-
 t/op/pos.t    |   10 ++++-
 6 files changed, 124 insertions(+), 69 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 15bd938..85beec1 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1884,16 +1884,21 @@ s       |int    |sv_2iuv_non_preserve   |NN SV *const sv
 sR     |I32    |expect_number  |NN char **const pattern
 #
 sn     |STRLEN |sv_pos_u2b_forwards|NN const U8 *const start \
-               |NN const U8 *const send|NN STRLEN *const uoffset
+               |NN const U8 *const send|NN STRLEN *const uoffset \
+               |NN bool *const at_end
 sn     |STRLEN |sv_pos_u2b_midway|NN const U8 *const start \
                |NN const U8 *send|STRLEN uoffset|const STRLEN uend
 s      |STRLEN |sv_pos_u2b_cached|NN SV *const sv|NN MAGIC **const mgp \
                |NN const U8 *const start|NN const U8 *const send \
                |STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
+s      |void   |utf8_mg_len_cache_update|NN SV *const sv|NN MAGIC **const mgp \
+               |const STRLEN ulen
 s      |void   |utf8_mg_pos_cache_update|NN SV *const sv|NN MAGIC **const mgp \
                |const STRLEN byte|const STRLEN utf8|const STRLEN blen
 s      |STRLEN |sv_pos_b2u_midway|NN const U8 *const s|NN const U8 *const 
target \
                |NN const U8 *end|STRLEN endu
+s      |void   |assert_uft8_cache_coherent|NN const char *const func \
+               |STRLEN from_cache|STRLEN real|NN SV *const sv
 sn     |char * |F0convert      |NV nv|NN char *const endbuf|NN STRLEN *const 
len
 #  if defined(PERL_OLD_COPY_ON_WRITE)
 sM     |void   |sv_release_COW |NN SV *sv|NN const char *pvx|NN SV *after
diff --git a/embed.h b/embed.h
index 5e79e58..d7a62dd 100644
--- a/embed.h
+++ b/embed.h
@@ -1594,8 +1594,10 @@
 #define sv_pos_u2b_forwards    S_sv_pos_u2b_forwards
 #define sv_pos_u2b_midway      S_sv_pos_u2b_midway
 #define sv_pos_u2b_cached      S_sv_pos_u2b_cached
+#define utf8_mg_len_cache_update       S_utf8_mg_len_cache_update
 #define utf8_mg_pos_cache_update       S_utf8_mg_pos_cache_update
 #define sv_pos_b2u_midway      S_sv_pos_b2u_midway
+#define assert_uft8_cache_coherent     S_assert_uft8_cache_coherent
 #define F0convert              S_F0convert
 #endif
 #  if defined(PERL_OLD_COPY_ON_WRITE)
@@ -4044,8 +4046,10 @@
 #define sv_pos_u2b_forwards    S_sv_pos_u2b_forwards
 #define sv_pos_u2b_midway      S_sv_pos_u2b_midway
 #define sv_pos_u2b_cached(a,b,c,d,e,f,g)       S_sv_pos_u2b_cached(aTHX_ 
a,b,c,d,e,f,g)
+#define utf8_mg_len_cache_update(a,b,c)        
S_utf8_mg_len_cache_update(aTHX_ a,b,c)
 #define utf8_mg_pos_cache_update(a,b,c,d,e)    
S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
 #define sv_pos_b2u_midway(a,b,c,d)     S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
+#define assert_uft8_cache_coherent(a,b,c,d)    
S_assert_uft8_cache_coherent(aTHX_ a,b,c,d)
 #define F0convert              S_F0convert
 #endif
 #  if defined(PERL_OLD_COPY_ON_WRITE)
diff --git a/proto.h b/proto.h
index 535dc78..1824377 100644
--- a/proto.h
+++ b/proto.h
@@ -5814,12 +5814,13 @@ STATIC I32      S_expect_number(pTHX_ char **const 
pattern)
        assert(pattern)
 
 #
-STATIC STRLEN  S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const 
send, STRLEN *const uoffset)
+STATIC STRLEN  S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const 
send, STRLEN *const uoffset, bool *const at_end)
                        __attribute__nonnull__(1)
                        __attribute__nonnull__(2)
-                       __attribute__nonnull__(3);
+                       __attribute__nonnull__(3)
+                       __attribute__nonnull__(4);
 #define PERL_ARGS_ASSERT_SV_POS_U2B_FORWARDS   \
-       assert(start); assert(send); assert(uoffset)
+       assert(start); assert(send); assert(uoffset); assert(at_end)
 
 STATIC STRLEN  S_sv_pos_u2b_midway(const U8 *const start, const U8 *send, 
STRLEN uoffset, const STRLEN uend)
                        __attribute__nonnull__(1)
@@ -5835,6 +5836,12 @@ STATIC STRLEN    S_sv_pos_u2b_cached(pTHX_ SV *const sv, 
MAGIC **const mgp, const U
 #define PERL_ARGS_ASSERT_SV_POS_U2B_CACHED     \
        assert(sv); assert(mgp); assert(start); assert(send)
 
+STATIC void    S_utf8_mg_len_cache_update(pTHX_ SV *const sv, MAGIC **const 
mgp, const STRLEN ulen)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_UTF8_MG_LEN_CACHE_UPDATE      \
+       assert(sv); assert(mgp)
+
 STATIC void    S_utf8_mg_pos_cache_update(pTHX_ SV *const sv, MAGIC **const 
mgp, const STRLEN byte, const STRLEN utf8, const STRLEN blen)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
@@ -5848,6 +5855,12 @@ STATIC STRLEN    S_sv_pos_b2u_midway(pTHX_ const U8 
*const s, const U8 *const targe
 #define PERL_ARGS_ASSERT_SV_POS_B2U_MIDWAY     \
        assert(s); assert(target); assert(end)
 
+STATIC void    S_assert_uft8_cache_coherent(pTHX_ const char *const func, 
STRLEN from_cache, STRLEN real, SV *const sv)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_4);
+#define PERL_ARGS_ASSERT_ASSERT_UFT8_CACHE_COHERENT    \
+       assert(func); assert(sv)
+
 STATIC char *  S_F0convert(NV nv, char *const endbuf, STRLEN *const len)
                        __attribute__nonnull__(2)
                        __attribute__nonnull__(3);
diff --git a/sv.c b/sv.c
index 3e99d9c..39689fd 100644
--- a/sv.c
+++ b/sv.c
@@ -6047,37 +6047,26 @@ Perl_sv_len_utf8(pTHX_ register SV *const sv)
            STRLEN ulen;
            MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
 
-           if (mg && mg->mg_len != -1) {
-               ulen = mg->mg_len;
+           if (mg && (mg->mg_len != -1 || mg->mg_ptr)) {
+               if (mg->mg_len != -1)
+                   ulen = mg->mg_len;
+               else {
+                   /* We can use the offset cache for a headstart.
+                      The longer value is stored in the first pair.  */
+                   STRLEN *cache = (STRLEN *) mg->mg_ptr;
+
+                   ulen = cache[0] + Perl_utf8_length(aTHX_ s + cache[1],
+                                                      s + len);
+               }
+               
                if (PL_utf8cache < 0) {
                    const STRLEN real = Perl_utf8_length(aTHX_ s, s + len);
-                   if (real != ulen) {
-                       /* Need to turn the assertions off otherwise we may
-                          recurse infinitely while printing error messages.
-                       */
-                       SAVEI8(PL_utf8cache);
-                       PL_utf8cache = 0;
-                       Perl_croak(aTHX_ "panic: sv_len_utf8 cache %"UVuf
-                                  " real %"UVuf" for %"SVf,
-                                  (UV) ulen, (UV) real, SVfARG(sv));
-                   }
+                   assert_uft8_cache_coherent("sv_len_utf8", ulen, real, sv);
                }
            }
            else {
                ulen = Perl_utf8_length(aTHX_ s, s + len);
-               if (!SvREADONLY(sv)) {
-                   if (!mg && (SvTYPE(sv) < SVt_PVMG ||
-                               !(mg = mg_find(sv, PERL_MAGIC_utf8)))) {
-                       mg = sv_magicext(sv, 0, PERL_MAGIC_utf8,
-                                        &PL_vtbl_utf8, 0, 0);
-                   }
-                   assert(mg);
-                   mg->mg_len = ulen;
-                   /* For now, treat "overflowed" as "still unknown".
-                      See RT #72924.  */
-                   if (ulen != (STRLEN) mg->mg_len)
-                       mg->mg_len = -1;
-               }
+               utf8_mg_len_cache_update(sv, &mg, ulen);
            }
            return ulen;
        }
@@ -6089,7 +6078,7 @@ Perl_sv_len_utf8(pTHX_ register SV *const sv)
    offset.  */
 static STRLEN
 S_sv_pos_u2b_forwards(const U8 *const start, const U8 *const send,
-                     STRLEN *const uoffset_p)
+                     STRLEN *const uoffset_p, bool *const at_end)
 {
     const U8 *s = start;
     STRLEN uoffset = *uoffset_p;
@@ -6100,7 +6089,11 @@ S_sv_pos_u2b_forwards(const U8 *const start, const U8 
*const send,
        --uoffset;
        s += UTF8SKIP(s);
     }
-    if (s > send) {
+    if (s == send) {
+       *at_end = TRUE;
+    }
+    else if (s > send) {
+       *at_end = TRUE;
        /* This is the existing behaviour. Possibly it should be a croak, as
           it's actually a bounds error  */
        s = send;
@@ -6157,6 +6150,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const 
mgp, const U8 *const start
 {
     STRLEN boffset = 0; /* Actually always set, but let's keep gcc happy.  */
     bool found = FALSE;
+    bool at_end = FALSE;
 
     PERL_ARGS_ASSERT_SV_POS_U2B_CACHED;
 
@@ -6197,7 +6191,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const 
mgp, const U8 *const start
                    uoffset -= uoffset0;
                    boffset = boffset0
                        + sv_pos_u2b_forwards(start + boffset0,
-                                               send, &uoffset);
+                                             send, &uoffset, &at_end);
                    uoffset += uoffset0;
                }
            }
@@ -6239,25 +6233,21 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const 
mgp, const U8 *const start
        STRLEN real_boffset;
        uoffset -= uoffset0;
        real_boffset = boffset0 + sv_pos_u2b_forwards(start + boffset0,
-                                                     send, &uoffset);
+                                                     send, &uoffset, &at_end);
        uoffset += uoffset0;
 
-       if (found && PL_utf8cache < 0) {
-           if (real_boffset != boffset) {
-               /* Need to turn the assertions off otherwise we may recurse
-                  infinitely while printing error messages.  */
-               SAVEI8(PL_utf8cache);
-               PL_utf8cache = 0;
-               Perl_croak(aTHX_ "panic: sv_pos_u2b_cache cache %"UVuf
-                          " real %"UVuf" for %"SVf,
-                          (UV) boffset, (UV) real_boffset, SVfARG(sv));
-           }
-       }
+       if (found && PL_utf8cache < 0)
+           assert_uft8_cache_coherent("sv_pos_u2b_cache", boffset,
+                                      real_boffset, sv);
        boffset = real_boffset;
     }
 
-    if (PL_utf8cache)
-       utf8_mg_pos_cache_update(sv, mgp, boffset, uoffset, send - start);
+    if (PL_utf8cache) {
+       if (at_end)
+           utf8_mg_len_cache_update(sv, mgp, uoffset);
+       else
+           utf8_mg_pos_cache_update(sv, mgp, boffset, uoffset, send - start);
+    }
     return boffset;
 }
 
@@ -6358,6 +6348,26 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const 
offsetp, I32 *const lenp
     }
 }
 
+static void
+S_utf8_mg_len_cache_update(pTHX_ SV *const sv, MAGIC **const mgp,
+                          const STRLEN ulen)
+{
+    PERL_ARGS_ASSERT_UTF8_MG_LEN_CACHE_UPDATE;
+    if (SvREADONLY(sv))
+       return;
+
+    if (!*mgp && (SvTYPE(sv) < SVt_PVMG ||
+                 !(*mgp = mg_find(sv, PERL_MAGIC_utf8)))) {
+       *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, &PL_vtbl_utf8, 0, 0);
+    }
+    assert(*mgp);
+
+    (*mgp)->mg_len = ulen;
+    /* For now, treat "overflowed" as "still unknown". See RT #72924.  */
+    if (ulen != (STRLEN) (*mgp)->mg_len)
+       (*mgp)->mg_len = -1;
+}
+
 /* Create and update the UTF8 magic offset cache, with the proffered utf8/
    byte length pairing. The (byte) length of the total SV is passed in too,
    as blen, because for some (more esoteric) SVs, the call to SvPV_const()
@@ -6416,14 +6426,8 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *const sv, MAGIC 
**const mgp, const STRLEN b
        const U8 *start = (const U8 *) SvPVX_const(sv);
        const STRLEN realutf8 = utf8_length(start, start + byte);
 
-       if (realutf8 != utf8) {
-           /* Need to turn the assertions off otherwise we may recurse
-              infinitely while printing error messages.  */
-           SAVEI8(PL_utf8cache);
-           PL_utf8cache = 0;
-           Perl_croak(aTHX_ "panic: utf8_mg_pos_cache_update cache %"UVuf
-                      " real %"UVuf" for %"SVf, (UV) utf8, (UV) realutf8, 
SVfARG(sv));
-       }
+       assert_uft8_cache_coherent("utf8_mg_pos_cache_update", utf8, realutf8,
+                                  sv);
     }
 
     /* Cache is held with the later position first, to simplify the code
@@ -6644,23 +6648,37 @@ Perl_sv_pos_b2u(pTHX_ register SV *const sv, I32 *const 
offsetp)
     if (!found || PL_utf8cache < 0) {
        const STRLEN real_len = utf8_length(s, send);
 
-       if (found && PL_utf8cache < 0) {
-           if (len != real_len) {
-               /* Need to turn the assertions off otherwise we may recurse
-                  infinitely while printing error messages.  */
-               SAVEI8(PL_utf8cache);
-               PL_utf8cache = 0;
-               Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVuf
-                          " real %"UVuf" for %"SVf,
-                          (UV) len, (UV) real_len, SVfARG(sv));
-           }
-       }
+       if (found && PL_utf8cache < 0)
+           assert_uft8_cache_coherent("sv_pos_b2u", len, real_len, sv);
        len = real_len;
     }
     *offsetp = len;
 
-    if (PL_utf8cache)
-       utf8_mg_pos_cache_update(sv, &mg, byte, len, blen);
+    if (PL_utf8cache) {
+       if (blen == byte)
+           utf8_mg_len_cache_update(sv, &mg, len);
+       else
+           utf8_mg_pos_cache_update(sv, &mg, byte, len, blen);
+    }
+}
+
+static void
+S_assert_uft8_cache_coherent(pTHX_ const char *const func, STRLEN from_cache,
+                            STRLEN real, SV *const sv)
+{
+    PERL_ARGS_ASSERT_ASSERT_UFT8_CACHE_COHERENT;
+
+    /* As this is debugging only code, save space by keeping this test here,
+       rather than inlining it in all the callers.  */
+    if (from_cache == real)
+       return;
+
+    /* Need to turn the assertions off otherwise we may recurse infinitely
+       while printing error messages.  */
+    SAVEI8(PL_utf8cache);
+    PL_utf8cache = 0;
+    Perl_croak(aTHX_ "panic: %s cache %"UVuf" real %"UVuf" for %"SVf,
+              func, (UV) from_cache, (UV) real, SVfARG(sv));
 }
 
 /*
diff --git a/t/op/length.t b/t/op/length.t
index eb35720..c73d4c5 100644
--- a/t/op/length.t
+++ b/t/op/length.t
@@ -6,7 +6,7 @@ BEGIN {
     @INC = '../lib';
 }
 
-plan (tests => 28);
+plan (tests => 30);
 
 print "not " unless length("")    == 0;
 print "ok 1\n";
@@ -196,3 +196,10 @@ is(length($uo), undef, "Length of overloaded reference");
 # ok(!defined $uo); Turns you can't test this. FIXME for pp_defined?
 
 is($warnings, 0, "There were no warnings");
+
+{
+    my $y = "\x{100}BC";
+    is(index($y, "B"), 1, 'adds an intermediate position to the offset cache');
+    is(length $y, 3,
+       'Check that sv_len_utf8() can take advantage of the offset cache');
+}
diff --git a/t/op/pos.t b/t/op/pos.t
index c3abfbe..04263e1 100644
--- a/t/op/pos.t
+++ b/t/op/pos.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 6;
+plan tests => 7;
 
 $x='banana';
 $x=~/.a/g;
@@ -28,3 +28,11 @@ $x = "123 56"; $x =~ / /g;
 is(pos($x), 4);
 { local $x }
 is(pos($x), 4);
+
+# Explict test that triggers the utf8_mg_len_cache_update() code path in
+# Perl_sv_pos_b2u().
+
+$x = "\x{100}BC";
+$x =~ m/.*/g;
+is(pos $x, 3);
+

--
Perl5 Master Repository

Reply via email to