In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/85375852c1b4f84701cfe21f1161546c33a20d42?hp=c9701768842137264d0245944b038a5c23e589fe>
- Log ----------------------------------------------------------------- commit 85375852c1b4f84701cfe21f1161546c33a20d42 Author: Father Chrysostomos <[email protected]> Date: Tue Jul 23 14:38:30 2013 -0700 pos.t: test something I almost broke M t/op/pos.t commit e5b5ece22456ae38c5fbae9b5630d074b5a23080 Author: Father Chrysostomos <[email protected]> Date: Tue Jul 23 14:37:29 2013 -0700 fix typo in sv.c apidocs sv_pos_u2b_flags has no offsetp parameter. This was copied from sv_pos_u2b. M sv.c commit ab0894e548ee5755214913c09ee98d7335bcb698 Author: Father Chrysostomos <[email protected]> Date: Sun Jul 21 06:56:07 2013 -0700 sv.c: Remove overflow check in utf8 length cache In order to make large values of pos() possible, the previous commit enlarged the mg_len field of the magic struct. That field is also used for cached utf8 length values. Since it is now large enough to store any length the OS/compiler supports, the overflow never happens. M sv.c commit 6174b39a88cd48740c024cfb6035edb6ffed9f2d Author: Father Chrysostomos <[email protected]> Date: Sun Jul 21 00:38:28 2013 -0700 [perl #72766] Allow huge pos() settings This is part of #116907, too. It also fixes #72924 as a side effect; the next commit will explain. The value of pos($foo) was being stored as an I32, not allowing values above I32_MAX. Change it to SSize_t (the signed equivalent of size_t, representing the maximum string length the OS/compiler supports). This is accomplished by changing the size of the entry in the magic struct, which is the simplest fix. Other parts of the code base can benefit from this, too. We actually cast the pos value to STRLEN (size_t) when reading it, to allow *very* long strings. Only the value -1 is special, meaning there is no pos. So the maximum supported offset is 2**sizeof(size_t)-2. The regexp engine itself still cannot handle large strings, so being able to set pos to large values is useless right now. This is but one piece in a larger puzzle. Changing the size of mg->mg_len also requires that Perl_hv_placeholders_p change its type. This function should in fact not be in the API, since it exists solely to implement the HvPLACEHOLDERS macro. See <https://rt.perl.org/rt3/Ticket/Display.html?id=116907#txn-1237043>. M MANIFEST M embed.fnc M hv.c M mg.c M mg.h M pp.c M proto.h A t/bigmem/pos.t commit ef54055c17bc9effcdf6a8135d2b375b7c35dd62 Author: Father Chrysostomos <[email protected]> Date: Sun Jul 21 00:30:20 2013 -0700 Add sv_pos_b2u_flags This, similar to sv_pos_u2b_flags, is a more friendly variant of sv_pos_u2b that works with 2GB strings and actually returns a value instead of modifying a passed-in value in place through a pointer. The next commit will use this. M embed.fnc M embed.h M proto.h M sv.c ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + embed.fnc | 4 ++- embed.h | 1 + hv.c | 2 +- mg.c | 8 ++--- mg.h | 2 +- pod/perl5180delta.pod | 2 +- pp.c | 8 ++--- proto.h | 7 +++- sv.c | 92 ++++++++++++++++++++++++++++++++------------------- t/bigmem/pos.t | 25 ++++++++++++++ t/op/pos.t | 4 ++- 12 files changed, 108 insertions(+), 48 deletions(-) create mode 100644 t/bigmem/pos.t diff --git a/MANIFEST b/MANIFEST index 8fb53e0..4b68b19 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4848,6 +4848,7 @@ t/base/rs.t See if record-read works t/base/term.t See if various terms work t/base/while.t See if while work t/benchmark/rt26188-speed-up-keys-on-empty-hash.t Benchmark if keys on empty hashes is fast enough +t/bigmem/pos.t Check that pos() handles large offsets t/bigmem/read.t Check read() handles large offsets t/bigmem/vec.t Check vec() handles large offsets t/cmd/elsif.t See if else-if works diff --git a/embed.fnc b/embed.fnc index 18919f1..93944ba 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1375,6 +1375,8 @@ Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *con Apd |STRLEN |sv_pos_u2b_flags|NN SV *const sv|STRLEN uoffset \ |NULLOK STRLEN *const lenp|U32 flags Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp +Apd |STRLEN |sv_pos_b2u_flags|NN SV *const sv|STRLEN const offset \ + |U32 flags Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp Apd |char* |sv_pvbyten_force|NN SV *const sv|NULLOK STRLEN *const lp @@ -2458,7 +2460,7 @@ poM |AV** |hv_backreferences_p |NN HV *hv poM |void |hv_kill_backrefs |NN HV *hv #endif Apd |void |hv_clear_placeholders |NN HV *hv -ApoR |I32* |hv_placeholders_p |NN HV *hv +XpoR |SSize_t*|hv_placeholders_p |NN HV *hv ApoR |I32 |hv_placeholders_get |NN const HV *hv Apo |void |hv_placeholders_set |NN HV *hv|I32 ph diff --git a/embed.h b/embed.h index 795dd8c..4a0d28b 100644 --- a/embed.h +++ b/embed.h @@ -607,6 +607,7 @@ #define sv_nv(a) Perl_sv_nv(aTHX_ a) #define sv_peek(a) Perl_sv_peek(aTHX_ a) #define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b) +#define sv_pos_b2u_flags(a,b,c) Perl_sv_pos_b2u_flags(aTHX_ a,b,c) #define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c) #define sv_pos_u2b_flags(a,b,c,d) Perl_sv_pos_u2b_flags(aTHX_ a,b,c,d) #define sv_pvbyten(a,b) Perl_sv_pvbyten(aTHX_ a,b) diff --git a/hv.c b/hv.c index a2db86a..22d5603 100644 --- a/hv.c +++ b/hv.c @@ -2895,7 +2895,7 @@ S_share_hek_flags(pTHX_ const char *str, I32 len, U32 hash, int flags) return HeKEY_hek(entry); } -I32 * +SSize_t * Perl_hv_placeholders_p(pTHX_ HV *hv) { dVAR; diff --git a/mg.c b/mg.c index 0dd23f6..4ef6c25 100644 --- a/mg.c +++ b/mg.c @@ -2096,11 +2096,11 @@ Perl_magic_getpos(pTHX_ SV *sv, MAGIC *mg) PERL_ARGS_ASSERT_MAGIC_GETPOS; PERL_UNUSED_ARG(mg); - if (found && found->mg_len >= 0) { - I32 i = found->mg_len; + if (found && found->mg_len != -1) { + STRLEN i = found->mg_len; if (DO_UTF8(lsv)) - sv_pos_b2u(lsv, &i); - sv_setiv(sv, i); + i = sv_pos_b2u_flags(lsv, i, SV_GMAGIC|SV_CONST_RETURN); + sv_setuv(sv, i); return 0; } SvOK_off(sv); diff --git a/mg.h b/mg.h index 4114941..de673d4 100644 --- a/mg.h +++ b/mg.h @@ -26,7 +26,7 @@ struct magic { U16 mg_private; char mg_type; U8 mg_flags; - I32 mg_len; + SSize_t mg_len; SV* mg_obj; char* mg_ptr; }; diff --git a/pod/perl5180delta.pod b/pod/perl5180delta.pod index 86c4289..6ce41ef 100644 --- a/pod/perl5180delta.pod +++ b/pod/perl5180delta.pod @@ -2032,7 +2032,7 @@ accepts the default value. This helps B<perldoc> when handling ANSI escapes The test suite now has a section for tests that require very large amounts of memory. These tests won't run by default; they can be enabled by setting the C<PERL_TEST_MEMORY> environment variable to the number of -gibibytes of memory that may be safely used. +gibabytes of memory that may be safely used. =back diff --git a/pp.c b/pp.c index 1aaeefc..cadfe96 100644 --- a/pp.c +++ b/pp.c @@ -439,12 +439,12 @@ PP(pp_pos) } else { const MAGIC * const mg = mg_find_mglob(sv); - if (mg && mg->mg_len >= 0) { + if (mg && mg->mg_len != -1) { dTARGET; - I32 i = mg->mg_len; + STRLEN i = mg->mg_len; if (DO_UTF8(sv)) - sv_pos_b2u(sv, &i); - PUSHi(i); + i = sv_pos_b2u_flags(sv, i, SV_GMAGIC|SV_CONST_RETURN); + PUSHu(i); RETURN; } RETPUSHUNDEF; diff --git a/proto.h b/proto.h index 32607ff..e1425d2 100644 --- a/proto.h +++ b/proto.h @@ -1631,7 +1631,7 @@ PERL_CALLCONV I32 Perl_hv_placeholders_get(pTHX_ const HV *hv) #define PERL_ARGS_ASSERT_HV_PLACEHOLDERS_GET \ assert(hv) -PERL_CALLCONV I32* Perl_hv_placeholders_p(pTHX_ HV *hv) +PERL_CALLCONV SSize_t* Perl_hv_placeholders_p(pTHX_ HV *hv) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_HV_PLACEHOLDERS_P \ @@ -4157,6 +4157,11 @@ PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) #define PERL_ARGS_ASSERT_SV_POS_B2U \ assert(offsetp) +PERL_CALLCONV STRLEN Perl_sv_pos_b2u_flags(pTHX_ SV *const sv, STRLEN const offset, U32 flags) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_SV_POS_B2U_FLAGS \ + assert(sv) + PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 *const lenp) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_SV_POS_U2B \ diff --git a/sv.c b/sv.c index 0f682ea..0dfff05 100644 --- a/sv.c +++ b/sv.c @@ -6984,7 +6984,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start /* =for apidoc sv_pos_u2b_flags -Converts the value pointed to by offsetp from a count of UTF-8 chars from +Converts the offset from a count of UTF-8 chars from the start of the string, to a count of the equivalent number of bytes; if lenp is non-zero, it does the same to lenp, but this time starting from the offset, rather than from the start @@ -7093,9 +7093,6 @@ S_utf8_mg_len_cache_update(pTHX_ SV *const sv, MAGIC **const mgp, assert(*mgp); (*mgp)->mg_len = ulen; - /* For now, treat "overflowed" as "still unknown". See RT #72924. */ - if (ulen != (STRLEN) (*mgp)->mg_len) - (*mgp)->mg_len = -1; } /* Create and update the UTF8 magic offset cache, with the proffered utf8/ @@ -7273,44 +7270,41 @@ S_sv_pos_b2u_midway(pTHX_ const U8 *const s, const U8 *const target, } /* -=for apidoc sv_pos_b2u +=for apidoc sv_pos_b2u_flags -Converts the value pointed to by offsetp from a count of bytes from the -start of the string, to a count of the equivalent number of UTF-8 chars. -Handles magic and type coercion. +Converts the offset from a count of bytes from the start of the string, to +a count of the equivalent number of UTF-8 chars. Handles type coercion. +I<flags> is passed to C<SvPV_flags>, and usually should be +C<SV_GMAGIC|SV_CONST_RETURN> to handle magic. =cut */ /* - * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential - * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and - * byte offsets. + * sv_pos_b2u_flags() uses, like sv_pos_u2b_flags(), the mg_ptr of the + * potential PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 + * and byte offsets. * */ -void -Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) +STRLEN +Perl_sv_pos_b2u_flags(pTHX_ SV *const sv, STRLEN const offset, U32 flags) { const U8* s; - const STRLEN byte = *offsetp; STRLEN len = 0; /* Actually always set, but let's keep gcc happy. */ STRLEN blen; MAGIC* mg = NULL; const U8* send; bool found = FALSE; - PERL_ARGS_ASSERT_SV_POS_B2U; - - if (!sv) - return; + PERL_ARGS_ASSERT_SV_POS_B2U_FLAGS; - s = (const U8*)SvPV_const(sv, blen); + s = (const U8*)SvPV_flags(sv, blen, flags); - if (blen < byte) + if (blen < offset) Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset, blen=%"UVuf - ", byte=%"UVuf, (UV)blen, (UV)byte); + ", byte=%"UVuf, (UV)blen, (UV)offset); - send = s + byte; + send = s + offset; if (!SvREADONLY(sv) && PL_utf8cache @@ -7319,18 +7313,16 @@ Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) { if (mg->mg_ptr) { STRLEN * const cache = (STRLEN *) mg->mg_ptr; - if (cache[1] == byte) { + if (cache[1] == offset) { /* An exact match. */ - *offsetp = cache[0]; - return; + return cache[0]; } - if (cache[3] == byte) { + if (cache[3] == offset) { /* An exact match. */ - *offsetp = cache[2]; - return; + return cache[2]; } - if (cache[1] < byte) { + if (cache[1] < offset) { /* We already know part of the way. */ if (mg->mg_len != -1) { /* Actually, we know the end too. */ @@ -7341,7 +7333,7 @@ Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) len = cache[0] + utf8_length(s + cache[1], send); } } - else if (cache[3] < byte) { + else if (cache[3] < offset) { /* We're between the two cached pairs, so we do the calculation offset by the byte/utf-8 positions for the earlier pair, then add the utf-8 characters from the string start to @@ -7351,7 +7343,7 @@ Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) + cache[2]; } - else { /* cache[3] > byte */ + else { /* cache[3] > offset */ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[3], cache[2]); @@ -7370,14 +7362,46 @@ Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) assert_uft8_cache_coherent("sv_pos_b2u", len, real_len, sv); len = real_len; } - *offsetp = len; if (PL_utf8cache) { - if (blen == byte) + if (blen == offset) utf8_mg_len_cache_update(sv, &mg, len); else - utf8_mg_pos_cache_update(sv, &mg, byte, len, blen); + utf8_mg_pos_cache_update(sv, &mg, offset, len, blen); } + + return len; +} + +/* +=for apidoc sv_pos_b2u + +Converts the value pointed to by offsetp from a count of bytes from the +start of the string, to a count of the equivalent number of UTF-8 chars. +Handles magic and type coercion. + +Use C<sv_pos_b2u_flags> in preference, which correctly handles strings +longer than 2Gb. + +=cut +*/ + +/* + * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential + * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and + * byte offsets. + * + */ +void +Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) +{ + PERL_ARGS_ASSERT_SV_POS_B2U; + + if (!sv) + return; + + *offsetp = (I32)sv_pos_b2u_flags(sv, (STRLEN)*offsetp, + SV_GMAGIC|SV_CONST_RETURN); } static void diff --git a/t/bigmem/pos.t b/t/bigmem/pos.t new file mode 100644 index 0000000..aad93f6 --- /dev/null +++ b/t/bigmem/pos.t @@ -0,0 +1,25 @@ +#!perl +BEGIN { + chdir 't'; + unshift @INC, "../lib"; + require './test.pl'; +} + +use Config qw(%Config); + +$ENV{PERL_TEST_MEMORY} >= 2 + or skip_all("Need ~2Gb for this test"); +$Config{ptrsize} >= 8 + or skip_all("Need 64-bit pointers for this test"); + +plan(3); + +# [perl #116907] +# ${\2} to defeat constant folding, which in this case actually slows +# things down +my $x=" "x(${\2}**31+20); +pos $x = 2**31-5; +is pos $x, 2147483643, 'setting pos on large string'; +pos $x += 10; +is pos $x, 2147483653, 'reading lvalue pos after setting it > 2**31'; +is scalar(pos $x), 2147483653, 'reading it with pos() in rvalue context'; diff --git a/t/op/pos.t b/t/op/pos.t index 4eca3a6..b9691ad 100644 --- a/t/op/pos.t +++ b/t/op/pos.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 21; +plan tests => 22; $x='banana'; $x=~/.a/g; @@ -36,6 +36,8 @@ $x = "\x{100}BC"; $x =~ m/.*/g; is(pos $x, 3, "utf8_mg_len_cache_update() test"); +is(scalar pos $x, 3, "rvalue pos() utf8 test"); + my $destroyed; { package Class; DESTROY { ++$destroyed; } } -- Perl5 Master Repository
