In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/65ab9279784aa811d78b2903b57bc0e7947dec78?hp=e57ed4ecd4d7de38a79a316da8d657dad656f93f>
- Log -----------------------------------------------------------------
commit 65ab9279784aa811d78b2903b57bc0e7947dec78
Author: Tony Cook <t...@develop-help.com>
Date:   Tue Mar 16 23:46:48 2010 +1100

    handle perl extended utf8 start bytes

    perl uses UTF8_IS_START() to test if a byte is a valid start byte, this
    didn't take perl's extended UTF-8 range into account.
-----------------------------------------------------------------------

Summary of changes:
 t/op/chop.t |   21 ++++++++++++++++++++-
 utf8.h      |    4 +++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/t/op/chop.t b/t/op/chop.t
index 30f7bff..36f8cad 100644
--- a/t/op/chop.t
+++ b/t/op/chop.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 139;
+plan tests => 143;
 
 $_ = 'abc';
 $c = foo();
@@ -243,3 +243,22 @@ foreach my $start (@chars) {
     map chomp(+()), ('')x68;
     ok(1, "extend sp in pp_chomp");
 }
+
+{
+    # [perl #73246] chop doesn't support utf8
+    # the problem was UTF8_IS_START() didn't handle perl's extended UTF8
+    my $utf = "\x{80000001}\x{80000000}";
+    my $result = chop($utf);
+    is($utf, "\x{80000001}", "chopping high 'unicode'- remnant");
+    is($result, "\x{80000000}", "chopping high 'unicode' - result");
+
+    SKIP: {
+        use Config;
+        $Config{ivsize} >= 8
+            or skip("this build can't handle very large characters", 2);
+        my $utf = "\x{ffffffffffffffff}\x{fffffffffffffffe}";
+        my $result = chop $utf;
+        is($utf, "\x{ffffffffffffffff}", "chop even higher 'unicode' - remnant");
+        is($result, "\x{fffffffffffffffe}", "chop even higher 'unicode' - result");
+    }
+}
diff --git a/utf8.h b/utf8.h
index e58dded..b0cfedf 100644
--- a/utf8.h
+++ b/utf8.h
@@ -104,13 +104,15 @@ As you can see, the continuation bytes all begin with C<10>, and the
 leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
+Perl's extended UTF-8 means we can have start bytes up to FF.
+
 */
 
 
 #define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
 /* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
  * below might ought to be C2 */
-#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+#define UTF8_IS_START(c) (((U8)c) >= 0xc0)
 #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
 #define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)

-- 
Perl5 Master Repository