In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/bbc87e33f84c794a9dad986cd2ae8d6bf83c494f?hp=162b417c061ec9190135629d421e3685e8d31dc0>
- Log ----------------------------------------------------------------- commit bbc87e33f84c794a9dad986cd2ae8d6bf83c494f Author: David Mitchell <[email protected]> Date: Tue Jan 13 14:58:22 2015 +0000 update AUTHORS M AUTHORS commit 9ce1a4d5ec32720328a5dce6ee796ce4b79d6faf Author: Rostislav Skudnov <[email protected]> Date: Wed Dec 24 08:12:52 2014 +0200 make re_intuit_string() return correct string Fix #123469 - Bug in split function, with utf8 strings Each regex has two SV pointers, check_substr and check_utf8, which hold a constant string (if any) corresponding to the longest constant string in the regexp. When the regex is first compiled, only one pointer is set, depending on whether the pattern is utf8 or not; but subsequent usage of the regex can instantiate the other pointer too. So the choice of which of the two strings re_intuit_string() returns should be based on the UTF8ness of the pattern, not on whether check_substr is set. M regcomp.c M t/op/split.t ----------------------------------------------------------------------- Summary of changes: AUTHORS | 1 + regcomp.c | 9 +++++---- t/op/split.t | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/AUTHORS b/AUTHORS index f4d429d..7f2b8f0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1037,6 +1037,7 @@ Rodolfo Carvalho <[email protected]> Ronald F. Guilmette <[email protected]> Ronald J. Kimball <[email protected]> Ronald Schmidt <[email protected]> +Rostislav Skudnov <[email protected]> Ruben Schattevoy <[email protected]> Rudolph Todd Maceyko <[email protected]> Rujith S. de Silva <[email protected]> diff --git a/regcomp.c b/regcomp.c index 12ba778..cb6322a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -16606,21 +16606,22 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) DEBUG_COMPILE_r( { - const char * const s = SvPV_nolen_const(prog->check_substr - ? prog->check_substr : prog->check_utf8); + const char * const s = SvPV_nolen_const(RX_UTF8(r) + ? 
prog->check_utf8 : prog->check_substr); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", PL_colors[4], - prog->check_substr ? "" : "utf8 ", + RX_UTF8(r) ? "utf8 " : "", PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr ? prog->check_substr : prog->check_utf8; + /* use UTF8 check substring if regexp pattern itself is in UTF8 */ + return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; } /* diff --git a/t/op/split.t b/t/op/split.t index 9afdd6e..5d5c19d 100644 --- a/t/op/split.t +++ b/t/op/split.t @@ -6,7 +6,7 @@ BEGIN { set_up_inc('../lib'); } -plan tests => 125; +plan tests => 131; $FS = ':'; @@ -374,6 +374,21 @@ is($cnt, scalar(@ary)); } { + # LATIN SMALL LETTER A WITH DIAERESIS, CYRILLIC SMALL LETTER ZE + for my $pattern ("\x{e4}", "\x{0437}") { + utf8::upgrade $pattern; + my @res; + for my $str ("a${pattern}b", "axb", "a${pattern}b") { + @split = split /$pattern/, $str; + push @res, scalar(@split); + } + is($res[0], 2); + is($res[1], 1); + is($res[2], 2, '#123469 - split with utf8 pattern after handling non-utf8 EXPR'); + } +} + +{ is (\@a, \@{"a"}, '@a must be global for following test'); $p=""; $n = @a = split /,/,$p; -- Perl5 Master Repository
