In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/52b9aa85a8c28ddc591b0a7f2b1f8c729075d9a1?hp=8f3d5996a665cf70e12a836b95e184e9ab628251>
- Log ----------------------------------------------------------------- commit 52b9aa85a8c28ddc591b0a7f2b1f8c729075d9a1 Author: Nicholas Clark <[email protected]> Date: Sun Oct 18 22:09:14 2009 +0100 In utf16_to_utf8(), fix off-by-one errors for the range of valid surrogates. Both high ends were one too low. M ext/XS-APItest/t/utf16_to_utf8.t M utf8.c commit dbde19516d139ef4237fc56ac1a14665a9f13c0b Author: Nicholas Clark <[email protected]> Date: Sun Oct 18 22:01:49 2009 +0100 utf16_to_utf8() should croak on encountering a bare low surrogate. M ext/XS-APItest/t/utf16_to_utf8.t M utf8.c commit 01ea242be7d23d3bfac7a37c0cdfaec0a8eb7e33 Author: Nicholas Clark <[email protected]> Date: Sun Oct 18 21:55:52 2009 +0100 utf16_to_utf8() should croak if the buffer ends without the second surrogate. M ext/XS-APItest/t/utf16_to_utf8.t M utf8.c commit e0ea5e2d50a479e160d39f481e02abd7c0c9cf91 Author: Nicholas Clark <[email protected]> Date: Sun Oct 18 21:30:41 2009 +0100 utf16_to_utf8_reversed() should croak early when passed an odd byte length. Rather than transposing n + 1 bytes, including 1 it was not passed, before calling utf16_to_utf8() and having that croak. e 69422~ M ext/XS-APItest/t/utf16_to_utf8.t M pod/perldiag.pod M utf8.c commit 30685b5659009a95642202219acc6ded18f74dbc Author: Nicholas Clark <[email protected]> Date: Sun Oct 18 21:06:06 2009 +0100 Expose utf16_to_utf8{,reversed} via XS::APItest, and provide some basic tests. M MANIFEST M ext/XS-APItest/APItest.pm M ext/XS-APItest/APItest.xs A ext/XS-APItest/t/utf16_to_utf8.t ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + ext/XS-APItest/APItest.pm | 4 +- ext/XS-APItest/APItest.xs | 29 ++++++++++++++++ ext/XS-APItest/t/utf16_to_utf8.t | 67 ++++++++++++++++++++++++++++++++++++++ pod/perldiag.pod | 5 +++ utf8.c | 20 ++++++++--- 6 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 ext/XS-APItest/t/utf16_to_utf8.t diff --git a/MANIFEST b/MANIFEST index 8b0f5d8..7d247df 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3204,6 +3204,7 @@ ext/XS-APItest/t/push.t XS::APItest extension ext/XS-APItest/t/rmagical.t XS::APItest extension ext/XS-APItest/t/svpeek.t XS::APItest extension ext/XS-APItest/t/svsetsv.t Test behaviour of sv_setsv with/without PERL_CORE +ext/XS-APItest/t/utf16_to_utf8.t Test behaviour of utf16_to_utf8{,reversed} ext/XS-APItest/t/xs_special_subs_require.t for require too ext/XS-APItest/t/xs_special_subs.t Test that XS BEGIN/CHECK/INIT/END work ext/XS-Typemap/Makefile.PL XS::Typemap extension diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm index 12d0a03..c40e4b8 100644 --- a/ext/XS-APItest/APItest.pm +++ b/ext/XS-APItest/APItest.pm @@ -23,10 +23,10 @@ our @EXPORT = qw( print_double print_int print_long my_cxt_getint my_cxt_getsv my_cxt_setint my_cxt_setsv sv_setsv_cow_hashkey_core sv_setsv_cow_hashkey_notcore rmagical_cast rmagical_flags - DPeek + DPeek utf16_to_utf8 utf16_to_utf8_reversed ); -our $VERSION = '0.15'; +our $VERSION = '0.16'; use vars '$WARNINGS_ON_BOOTSTRAP'; use vars map "\$${_}_called_PP", qw(BEGIN UNITCHECK CHECK INIT END); diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 7e7f78b..4eac4a6 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -892,3 +892,32 @@ void END() CODE: sv_inc(get_sv("XS::APItest::END_called", GV_ADD|GV_ADDMULTI)); + +void +utf16_to_utf8 (sv, ...) + SV* sv + ALIAS: + utf16_to_utf8_reversed = 1 + PREINIT: + STRLEN len; + U8 *source; + SV *dest; + I32 got; /* Gah, badly thought out APIs */ + CODE: + source = (U8 *)SvPVbyte(sv, len); + /* Optionally only convert part of the buffer. */ + if (items > 1) { + len = SvUV(ST(1)); + } + /* Mortalise this right now, as we'll be testing croak()s */ + dest = sv_2mortal(newSV(len * 3 / 2 + 1)); + if (ix) { + utf16_to_utf8_reversed(source, SvPVX(dest), len, &got); + } else { + utf16_to_utf8(source, SvPVX(dest), len, &got); + } + SvCUR_set(dest, got); + SvPVX(dest)[got] = '\0'; + SvPOK_on(dest); + ST(0) = dest; + XSRETURN(1); diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t new file mode 100644 index 0000000..592d0b1 --- /dev/null +++ b/ext/XS-APItest/t/utf16_to_utf8.t @@ -0,0 +1,67 @@ +#!perl -w + +use strict; +use Test::More 'no_plan'; +use Encode; + +use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed); + +for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD, + 0x10000, 0x10FC00, 0x103FF, 0x10FFFD) { + my $chr = chr $ord; + for my $prefix ('', "\0", 'Perl rules') { + for my $suffix ('', "\0", "Moo!") { + my $string = $prefix . $chr . $suffix; + my $name = sprintf "for chr $ord prefix %d, suffix %d", + length $prefix, length $suffix; + my $as_utf8 = encode('UTF-8', $string); + is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8, + "utf16_to_utf8 $name"); + is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8, + "utf16_to_utf8_reversed $name"); + } + } +} + +# Currently this is special-cased to work. Should it? + +is(utf16_to_utf8("\0"), "\0", 'Short string to utf16_to_utf8'); + +# But anything else is fatal + +my $got = eval {utf16_to_utf8('N')}; +like($@, qr/^panic: utf16_to_utf8: odd bytelen 1 at/, 'Odd byte length panics'); +is($got, undef, 'hence eval returns undef'); + +for (["\xD8\0\0\0", 'NULs'], + ["\xD8\0\xD8\0", '2 Lows'], + ["\xDC\0\0\0", 'High NUL'], + ["\xDC\0\xD8\0", 'High Low'], + ["\xDC\0\xDC\0", 'High High'], + ) { + my ($malformed, $name) = @$_; + $got = eval {utf16_to_utf8($malformed)}; + like($@, qr/^Malformed UTF-16 surrogate at/, + "Malformed surrogate $name croaks for utf16_to_utf8"); + is($got, undef, 'hence eval returns undef'); + + $malformed =~ s/(.)(.)/$2$1/gs; + $got = eval {utf16_to_utf8_reversed($malformed)}; + like($@, qr/^Malformed UTF-16 surrogate at/, + "Malformed surrogate $name croaks for utf16_to_utf8_reversed"); + is($got, undef, 'hence eval returns undef'); +} + +my $in = "NA"; +$got = eval {utf16_to_utf8_reversed($in, 1)}; +like($@, qr/^panic: utf16_to_utf8_reversed: odd bytelen 1 at/, + 'Odd byte length panics'); +is($got, undef, 'hence eval returns undef'); +is($in, "NA", 'and input unchanged'); + +$in = "\xD8\0\xDC\0"; +$got = eval {utf16_to_utf8($in, 2)}; +like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks'); +(ok(!defined $got, 'hence eval returns undef')) or + diag(join ', ', map {ord $_} split //, $got); + diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 255eb53..f1f081d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3220,6 +3220,11 @@ at run time. (P) Something tried to call utf16_to_utf8 with an odd (as opposed to even) byte length. +=item panic: utf16_to_utf8_reversed: odd bytelen + +(P) Something tried to call utf16_to_utf8_reversed with an odd (as opposed +to even) byte length. + =item panic: yylex (P) The lexer got into a bad state while processing a case modifier. diff --git a/utf8.c b/utf8.c index 7b7fd57..3de02ed 100644 --- a/utf8.c +++ b/utf8.c @@ -985,12 +985,18 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = (U8)(( uv & 0x3f) | 0x80); continue; } - if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ - UV low = (p[0] << 8) + p[1]; - p += 2; - if (low < 0xdc00 || low >= 0xdfff) + if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */ + if (p >= pend) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); - uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } else { + UV low = (p[0] << 8) + p[1]; + p += 2; + if (low < 0xdc00 || low > 0xdfff) + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); + uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } + } else if (uv >= 0xdc00 && uv <= 0xdfff) { + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } if (uv < 0x10000) { *d++ = (U8)(( uv >> 12) | 0xe0); @@ -1020,6 +1026,10 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; + if (bytelen & 1) + Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, + (UV)bytelen); + while (s < send) { const U8 tmp = s[0]; s[0] = s[1]; -- Perl5 Master Repository
