In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/52b9aa85a8c28ddc591b0a7f2b1f8c729075d9a1?hp=8f3d5996a665cf70e12a836b95e184e9ab628251>

- Log -----------------------------------------------------------------
commit 52b9aa85a8c28ddc591b0a7f2b1f8c729075d9a1
Author: Nicholas Clark <[email protected]>
Date:   Sun Oct 18 22:09:14 2009 +0100

    In utf16_to_utf8(), fix off-by-one errors for the range of valid surrogates.
    
    Both high ends were one too low.

M       ext/XS-APItest/t/utf16_to_utf8.t
M       utf8.c

commit dbde19516d139ef4237fc56ac1a14665a9f13c0b
Author: Nicholas Clark <[email protected]>
Date:   Sun Oct 18 22:01:49 2009 +0100

    utf16_to_utf8() should croak on encountering a bare low surrogate.

M       ext/XS-APItest/t/utf16_to_utf8.t
M       utf8.c

commit 01ea242be7d23d3bfac7a37c0cdfaec0a8eb7e33
Author: Nicholas Clark <[email protected]>
Date:   Sun Oct 18 21:55:52 2009 +0100

    utf16_to_utf8() should croak if the buffer ends without the second 
surrogate.

M       ext/XS-APItest/t/utf16_to_utf8.t
M       utf8.c

commit e0ea5e2d50a479e160d39f481e02abd7c0c9cf91
Author: Nicholas Clark <[email protected]>
Date:   Sun Oct 18 21:30:41 2009 +0100

    utf16_to_utf8_reversed() should croak early when passed an odd byte length.
    
    Rather than transposing n + 1 bytes, including 1 it was not passed, before
    calling utf16_to_utf8() and having that croak.
    e   69422~

M       ext/XS-APItest/t/utf16_to_utf8.t
M       pod/perldiag.pod
M       utf8.c

commit 30685b5659009a95642202219acc6ded18f74dbc
Author: Nicholas Clark <[email protected]>
Date:   Sun Oct 18 21:06:06 2009 +0100

    Expose utf16_to_utf8{,reversed} via XS::APItest, and provide some basic 
tests.

M       MANIFEST
M       ext/XS-APItest/APItest.pm
M       ext/XS-APItest/APItest.xs
A       ext/XS-APItest/t/utf16_to_utf8.t
-----------------------------------------------------------------------

Summary of changes:
 MANIFEST                         |    1 +
 ext/XS-APItest/APItest.pm        |    4 +-
 ext/XS-APItest/APItest.xs        |   29 ++++++++++++++++
 ext/XS-APItest/t/utf16_to_utf8.t |   67 ++++++++++++++++++++++++++++++++++++++
 pod/perldiag.pod                 |    5 +++
 utf8.c                           |   20 ++++++++---
 6 files changed, 119 insertions(+), 7 deletions(-)
 create mode 100644 ext/XS-APItest/t/utf16_to_utf8.t

diff --git a/MANIFEST b/MANIFEST
index 8b0f5d8..7d247df 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3204,6 +3204,7 @@ ext/XS-APItest/t/push.t           XS::APItest extension
 ext/XS-APItest/t/rmagical.t    XS::APItest extension
 ext/XS-APItest/t/svpeek.t      XS::APItest extension
 ext/XS-APItest/t/svsetsv.t     Test behaviour of sv_setsv with/without 
PERL_CORE
+ext/XS-APItest/t/utf16_to_utf8.t       Test behaviour of 
utf16_to_utf8{,reversed}
 ext/XS-APItest/t/xs_special_subs_require.t     for require too
 ext/XS-APItest/t/xs_special_subs.t     Test that XS BEGIN/CHECK/INIT/END work
 ext/XS-Typemap/Makefile.PL     XS::Typemap extension
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm
index 12d0a03..c40e4b8 100644
--- a/ext/XS-APItest/APItest.pm
+++ b/ext/XS-APItest/APItest.pm
@@ -23,10 +23,10 @@ our @EXPORT = qw( print_double print_int print_long
                  my_cxt_getint my_cxt_getsv my_cxt_setint my_cxt_setsv
                  sv_setsv_cow_hashkey_core sv_setsv_cow_hashkey_notcore
                  rmagical_cast rmagical_flags
-                 DPeek
+                 DPeek utf16_to_utf8 utf16_to_utf8_reversed
 );
 
-our $VERSION = '0.15';
+our $VERSION = '0.16';
 
 use vars '$WARNINGS_ON_BOOTSTRAP';
 use vars map "\$${_}_called_PP", qw(BEGIN UNITCHECK CHECK INIT END);
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 7e7f78b..4eac4a6 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -892,3 +892,32 @@ void
 END()
     CODE:
        sv_inc(get_sv("XS::APItest::END_called", GV_ADD|GV_ADDMULTI));
+
+void
+utf16_to_utf8 (sv, ...)
+    SV* sv
+       ALIAS:
+           utf16_to_utf8_reversed = 1
+    PREINIT:
+        STRLEN len;
+       U8 *source;
+       SV *dest;
+       I32 got; /* Gah, badly thought out APIs */
+    CODE:
+       source = (U8 *)SvPVbyte(sv, len);
+       /* Optionally only convert part of the buffer.  */      
+       if (items > 1) {
+           len = SvUV(ST(1));
+       }
+       /* Mortalise this right now, as we'll be testing croak()s  */
+       dest = sv_2mortal(newSV(len * 3 / 2 + 1));
+       if (ix) {
+           utf16_to_utf8_reversed(source, SvPVX(dest), len, &got);
+       } else {
+           utf16_to_utf8(source, SvPVX(dest), len, &got);
+       }
+       SvCUR_set(dest, got);
+       SvPVX(dest)[got] = '\0';
+       SvPOK_on(dest);
+       ST(0) = dest;
+       XSRETURN(1);
diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t
new file mode 100644
index 0000000..592d0b1
--- /dev/null
+++ b/ext/XS-APItest/t/utf16_to_utf8.t
@@ -0,0 +1,67 @@
+#!perl -w
+
+use strict;
+use Test::More 'no_plan';
+use Encode;
+
+use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed);
+
+for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
+            0x10000, 0x10FC00, 0x103FF, 0x10FFFD) {
+    my $chr = chr $ord;
+    for my $prefix ('', "\0", 'Perl rules') {
+       for my $suffix ('', "\0", "Moo!") {
+           my $string = $prefix . $chr . $suffix;
+           my $name = sprintf "for chr $ord prefix %d, suffix %d",
+               length $prefix, length $suffix;
+           my $as_utf8 = encode('UTF-8', $string);
+           is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8,
+              "utf16_to_utf8 $name");
+           is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8,
+              "utf16_to_utf8_reversed $name");
+       }
+    }
+}
+
+# Currently this is special-cased to work. Should it?
+
+is(utf16_to_utf8("\0"), "\0", 'Short string to utf16_to_utf8');
+
+# But anything else is fatal
+
+my $got = eval {utf16_to_utf8('N')};
+like($@, qr/^panic: utf16_to_utf8: odd bytelen 1 at/, 'Odd byte length 
panics');
+is($got, undef, 'hence eval returns undef');
+
+for (["\xD8\0\0\0", 'NULs'],
+     ["\xD8\0\xD8\0", '2 Lows'],
+     ["\xDC\0\0\0", 'High NUL'],
+     ["\xDC\0\xD8\0", 'High Low'],
+     ["\xDC\0\xDC\0", 'High High'],
+    ) {
+    my ($malformed, $name) = @$_;
+    $got = eval {utf16_to_utf8($malformed)};
+    like($@, qr/^Malformed UTF-16 surrogate at/,
+        "Malformed surrogate $name croaks for utf16_to_utf8");
+    is($got, undef, 'hence eval returns undef');
+
+    $malformed =~ s/(.)(.)/$2$1/gs;
+    $got = eval {utf16_to_utf8_reversed($malformed)};
+    like($@, qr/^Malformed UTF-16 surrogate at/,
+        "Malformed surrogate $name croaks for utf16_to_utf8_reversed");
+    is($got, undef, 'hence eval returns undef');
+}
+
+my $in = "NA";
+$got = eval {utf16_to_utf8_reversed($in, 1)};
+like($@, qr/^panic: utf16_to_utf8_reversed: odd bytelen 1 at/,
+     'Odd byte length panics');
+is($got, undef, 'hence eval returns undef');
+is($in, "NA", 'and input unchanged');
+
+$in = "\xD8\0\xDC\0";
+$got = eval {utf16_to_utf8($in, 2)};
+like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks');
+(ok(!defined $got, 'hence eval returns undef')) or
+    diag(join ', ', map {ord $_} split //, $got);
+
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 255eb53..f1f081d 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3220,6 +3220,11 @@ at run time.
 (P) Something tried to call utf16_to_utf8 with an odd (as opposed
 to even) byte length.
 
+=item panic: utf16_to_utf8_reversed: odd bytelen
+
+(P) Something tried to call utf16_to_utf8_reversed with an odd (as opposed
+to even) byte length.
+
 =item panic: yylex
 
 (P) The lexer got into a bad state while processing a case modifier.
diff --git a/utf8.c b/utf8.c
index 7b7fd57..3de02ed 100644
--- a/utf8.c
+++ b/utf8.c
@@ -985,12 +985,18 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 
*newlen)
            *d++ = (U8)(( uv        & 0x3f) | 0x80);
            continue;
        }
-       if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
-           UV low = (p[0] << 8) + p[1];
-           p += 2;
-           if (low < 0xdc00 || low >= 0xdfff)
+       if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
+           if (p >= pend) {
                Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
-           uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           } else {
+               UV low = (p[0] << 8) + p[1];
+               p += 2;
+               if (low < 0xdc00 || low > 0xdfff)
+                   Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+               uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           }
+       } else if (uv >= 0xdc00 && uv <= 0xdfff) {
+           Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
        }
        if (uv < 0x10000) {
            *d++ = (U8)(( uv >> 12)         | 0xe0);
@@ -1020,6 +1026,10 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 
bytelen, I32 *newlen)
 
     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
 
+    if (bytelen & 1)
+       Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
+                  (UV)bytelen);
+
     while (s < send) {
        const U8 tmp = s[0];
        s[0] = s[1];

--
Perl5 Master Repository

Reply via email to