In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/841a5e1869a65f80379c03832eaf9887546a9622?hp=d93e3b8ce477b5ca3fdd199244bc7d258ae60fdd>
- Log ----------------------------------------------------------------- commit 841a5e1869a65f80379c03832eaf9887546a9622 Author: Father Chrysostomos <[email protected]> Date: Wed Jan 1 06:06:30 2014 -0800 pp.c: Simplify lc and uc stringification code Originally, lc and uc would not warn about undef, due to an implemen- tation detail. The implementation changed in 673061948, and extra code was added to keep the behaviour the same. Commit 0a0ffbced enabled the warnings about undef, but did so by added even more code in the midst of the blocks that existed solely to avoid the warning. We can just delete those blocks and put in a simple stringification. M pp.c commit 6006ebd02a4b62ba7535f6398bbb2d33a9ef46cc Author: Father Chrysostomos <[email protected]> Date: Wed Jan 1 05:56:15 2014 -0800 pp.c: Improve self-referential comment pp.c:pp_lc has this: /* Here is where we would do context-sensitive actions. See the * commit message for this comment for why there isn't any */ If I try to look up the commit that added the comment, I get this: commit 06b5486afd6f58eb7fdf8c5c8cdb8520a4c87f40 Author: Karl Williamson <[email protected]> Date: Fri Nov 11 10:13:28 2011 -0700 pp.c: White-space only This outdents and reflows comments as a result of the removal of a surrounding block 86510fb15 was the commit that added the comment, whose commit message contains the explanation, so cite that directly. M pp.c commit 5cd5e2d6301836ca9b0f94e9a100e697bd374cd8 Author: Father Chrysostomos <[email protected]> Date: Wed Jan 1 05:51:36 2014 -0800 Reënable in-place lc/uc It used to be that this code: for("$foo") { lc $_; ... } would modify $_, allowing other code in the âforâ block to see the changes (bug #43207). Commit 17fa077605 fixed that by changing the logic that determined whether lc/uc(first) could modify the sca- lar in place. In doing so, it stopped in-place modification from happening at all, because the condition became SvPADTMP && SvTEMP, which never happens. (SvPADTMP unually indicates an operator return value stored in a pad; i.e., a scalar that will next be used by the same operator again to return another value. SvTEMP indicates that the REFCNT will go down shortly, usually a temporary value created solely for the sake of returning something.) Now that bug #78194 is fixed, for("$foo") no longer exposes a PADTMP to the following code, so we *can* now assume (as was done erroneously before) that PADTMP indicates something like lc("$foo$bar") and modify pp_stringifyâs return value in place. Also, we can extend this to apply to TEMP variables that have a ref- erence count of 1, since they cannot be in use elsewhere. We skip TEMP variables with set-magic, because they could be tied, and SvSETMAGIC would have a side effect. (That could happen with lc(delete $h{tied_elem}).) Previously, this was skipped for uc and lc for overloaded references, since stringification could change the utf8ness. That is no longer sufficient. As of Perl 5.16, typeglobs and non-overloaded blessed references can also enable their utf8 flag upon stringification, if the stash or glob names contains wide characters. So I changed the !SvAMAGIC (not overloaded) to SvPOK (is a string already), which will cover most cases where this optimisation helps. The two tests added to the end of lc.t fail with !SvAMAGIC. M pp.c M t/op/lc.t commit 4b3427080c3b11e528bd84c7509af7fc8a60dcb1 Author: Father Chrysostomos <[email protected]> Date: Mon Dec 30 21:29:49 2013 -0800 lc.t: More tests for #43207 M t/op/lc.t ----------------------------------------------------------------------- Summary of changes: pp.c | 64 +++++++++++++++++++-------------------------------------------- t/op/lc.t | 34 ++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 52 deletions(-) diff --git a/pp.c b/pp.c index 4175808..dd4d89a 100644 --- a/pp.c +++ b/pp.c @@ -3480,20 +3480,15 @@ PP(pp_ucfirst) * UTF-8 or not, but in either case is the number of bytes */ bool tainted = FALSE; - SvGETMAGIC(source); - if (SvOK(source)) { - s = (const U8*)SvPV_nomg_const(source, slen); - } else { - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit(source); - s = (const U8*)""; - slen = 0; - } + s = (const U8*)SvPV_const(source, slen); /* We may be able to get away with changing only the first character, in * place, but not if read-only, etc. Later we may discover more reasons to * not convert in-place. */ - inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source); + inplace = !SvREADONLY(source) + && ( SvPADTMP(source) + || ( SvTEMP(source) && !SvSMAGICAL(source) + && SvREFCNT(source) == 1)); /* First calculate what the changed first character should be. This affects * whether we can just swap it out, leaving the rest of the string unchanged, @@ -3706,8 +3701,11 @@ PP(pp_uc) SvGETMAGIC(source); - if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source) + if ((SvPADTMP(source) + || + (SvTEMP(source) && !SvSMAGICAL(source) && SvREFCNT(source) == 1)) + && !SvREADONLY(source) && SvPOK(source) + && !DO_UTF8(source) && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) { /* We can convert in place. The reason we can't if in UNI_8_BIT is to @@ -3725,21 +3723,7 @@ PP(pp_uc) dest = TARG; - /* The old implementation would copy source into TARG at this point. - This had the side effect that if source was undef, TARG was now - an undefined SV with PADTMP set, and they don't warn inside - sv_2pv_flags(). However, we're now getting the PV direct from - source, which doesn't have PADTMP set, so it would warn. Hence the - little games. */ - - if (SvOK(source)) { - s = (const U8*)SvPV_nomg_const(source, len); - } else { - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit(source); - s = (const U8*)""; - len = 0; - } + s = (const U8*)SvPV_nomg_const(source, len); min = len + 1; SvUPGRADE(dest, SVt_PV); @@ -3952,8 +3936,12 @@ PP(pp_lc) SvGETMAGIC(source); - if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source)) { + if ( ( SvPADTMP(source) + || ( SvTEMP(source) && !SvSMAGICAL(source) + && SvREFCNT(source) == 1 ) + ) + && !SvREADONLY(source) && SvPOK(source) + && !DO_UTF8(source)) { /* We can convert in place, as lowercasing anything in the latin1 range * (or else DO_UTF8 would have been on) doesn't lengthen it */ @@ -3965,21 +3953,7 @@ PP(pp_lc) dest = TARG; - /* The old implementation would copy source into TARG at this point. - This had the side effect that if source was undef, TARG was now - an undefined SV with PADTMP set, and they don't warn inside - sv_2pv_flags(). However, we're now getting the PV direct from - source, which doesn't have PADTMP set, so it would warn. Hence the - little games. */ - - if (SvOK(source)) { - s = (const U8*)SvPV_nomg_const(source, len); - } else { - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit(source); - s = (const U8*)""; - len = 0; - } + s = (const U8*)SvPV_nomg_const(source, len); min = len + 1; SvUPGRADE(dest, SVt_PV); @@ -4005,7 +3979,7 @@ PP(pp_lc) cBOOL(IN_LOCALE_RUNTIME), &tainted); /* Here is where we would do context-sensitive actions. See the - * commit message for this comment for why there isn't any */ + * commit message for 86510fb15 for why there isn't any */ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { diff --git a/t/op/lc.t b/t/op/lc.t index ae15625..66f365b 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -10,7 +10,7 @@ BEGIN { use feature qw( fc ); -plan tests => 129; +plan tests => 134; is(lc(undef), "", "lc(undef) is ''"); is(lcfirst(undef), "", "lcfirst(undef) is ''"); @@ -256,17 +256,26 @@ for (1, 4, 9, 16, 25) { } # bug #43207 -my $temp = "Hello"; +my $temp = "HellO"; for ("$temp") { lc $_; - is($_, "Hello"); + is($_, "HellO", '[perl #43207] lc($_) modifying $_'); } - -# bug #43207 -my $temp = "Hello"; for ("$temp") { fc $_; - is($_, "Hello"); + is($_, "HellO", '[perl #43207] fc($_) modifying $_'); +} +for ("$temp") { + uc $_; + is($_, "HellO", '[perl #43207] uc($_) modifying $_'); +} +for ("$temp") { + ucfirst $_; + is($_, "HellO", '[perl #43207] ucfirst($_) modifying $_'); +} +for ("$temp") { + lcfirst $_; + is($_, "HellO", '[perl #43207] lcfirst($_) modifying $_'); } # new in Unicode 5.1.0 @@ -293,3 +302,14 @@ fresh_perl_like(<<'constantfolding', qr/^(\d+),\1\z/, {}, } constantfolding 'folded uc() in string eval uses the right hints'); + +# In-place lc/uc should not corrupt string buffers when given a non-utf8- +# flagged thingy that stringifies to utf8 +$h{k} = bless[], "\x{3b0}\x{3b0}\x{3b0}bcde"; # U+03B0 grows with uc() + # using delete marks it as TEMP, so uc-in-place is permitted +like uc delete $h{k}, qr "^(?:\x{3a5}\x{308}\x{301}){3}BCDE=ARRAY\(.*\)", + 'uc(TEMP ref) does not produce a corrupt string'; +$h{k} = bless[], "\x{130}bcde"; # U+0130 grows with lc() + # using delete marks it as TEMP, so uc-in-place is permitted +like lc delete $h{k}, qr "^i\x{307}bcde=array\(.*\)", + 'lc(TEMP ref) does not produce a corrupt string'; -- Perl5 Master Repository
