In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/fe2ba0a2de216bca4582bfb493b196d2eb4c94ae?hp=cbf40e71df30fba4761230a8b62a34d7bb247495>
- Log ----------------------------------------------------------------- commit fe2ba0a2de216bca4582bfb493b196d2eb4c94ae Author: Karl Williamson <[email protected]> Date: Tue Jan 31 11:15:08 2017 -0700 PATCH: [perl #130656] tr// failue with UTF-8 across lines This bug happend under things like tr/\x{101}-\x{200}/ \x{201}-\x{301}/ The newline in the middle was crucial. As a result the second line got parsed already knowing that the result was UTF-8, and as a result setting a variable got skipped which happens only when we discover we need to flip into UTF-8. The solution adopted here is to set the variable under other conditions, which leads to it getting set multiple times. But this extra branch and setting is confined to somehwat rare circumstances, leaving the mainline code untouched. ----------------------------------------------------------------------- Summary of changes: t/op/tr.t | 12 +++++++++++- toke.c | 19 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/t/op/tr.t b/t/op/tr.t index 25c397dec3..323a5c3fa7 100644 --- a/t/op/tr.t +++ b/t/op/tr.t @@ -13,7 +13,7 @@ BEGIN { use utf8; -plan tests => 215; +plan tests => 216; # Test this first before we extend the stack with other operations. # This caused an asan failure due to a bad write past the end of the stack. @@ -702,5 +702,15 @@ for ("", nullrocow) { } +{ # [perl #130656] This bug happens when the tr is split across lines, so + # that the first line causes it to go into UTF-8, and the 2nd is only + # things like \x + my $x = "\x{E235}"; + $x =~ tr + [\x{E234}-\x{E342}\x{E5B5}-\x{E5DF}] + [\x{E5CD}-\x{E5DF}\x{EA80}-\x{EAFA}\x{EB0E}-\x{EB8E}\x{EAFB}-\x{EB0D}\x{E5B5}-\x{E5CC}]; + + is $x, "\x{E5CE}", '[perl #130656]'; +} 1; diff --git a/toke.c b/toke.c index 7dcdd5afa1..9972b97418 100644 --- a/toke.c +++ b/toke.c @@ -2866,8 +2866,6 @@ S_scan_const(pTHX_ char *start) bool didrange = FALSE; /* did we just finish a range? */ bool in_charclass = FALSE; /* within /[...]/ */ bool has_utf8 = FALSE; /* Output constant is UTF8 */ - bool has_above_latin1 = FALSE; /* does something require special - handling in tr/// ? */ bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be UTF8? But, this can show as true when the source isn't utf8, as for @@ -2882,6 +2880,14 @@ S_scan_const(pTHX_ char *start) STRLEN offset_to_max; /* The offset in the output to where the range high-end character is temporarily placed */ + /* Does something require special handling in tr/// ? This avoids extra + * work in a less likely case. As such, khw didn't feel it was worth + * adding any branches to the more mainline code to handle this, which + * means that this doesn't get set in some circumstances when things like + * \x{100} get expanded out. As a result there needs to be extra testing + * done in the tr code */ + bool has_above_latin1 = FALSE; + /* Note on sizing: The scanned constant is placed into sv, which is * initialized by newSV() assuming one byte of output for every byte of * input. This routine expects newSV() to allocate an extra byte for a @@ -2962,7 +2968,7 @@ S_scan_const(pTHX_ char *start) /* The tests here for being above Latin1 and similar ones * in the following 'else' suffice to find all such * occurences in the constant, except those added by a - * backslash escape sequence, like \x{100}. And all those + * backslash escape sequence, like \x{100}. Mostly, those * set 'has_above_latin1' as appropriate */ if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { has_above_latin1 = TRUE; @@ -3026,6 +3032,13 @@ S_scan_const(pTHX_ char *start) min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1); range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL); range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL); + + /* This compensates for not all code setting + * 'has_above_latin1', so that we don't skip stuff that + * should be executed */ + if (range_max > 255) { + has_above_latin1 = TRUE; + } } else { min_ptr = max_ptr - 1; -- Perl5 Master Repository
