In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/26cc780bb8911dcc1e2c3e971db95322db3e2d9f?hp=1bb4187059cebfeb80fd44d4a65c5daa04be9ff4>
- Log ----------------------------------------------------------------- commit 26cc780bb8911dcc1e2c3e971db95322db3e2d9f Author: Nicholas Clark <[email protected]> Date: Thu Oct 22 13:30:03 2009 +0100 Perl_utf16_to_utf8() should treat "\0" like any every other odd-length input. The "be understanding" bodge to not panic, introduced in 1de9afcdf18cf98b, is no longer needed now that c28d61051c446453 fixes the underlying problem. M ext/XS-APItest/t/utf16_to_utf8.t M utf8.c commit c28d61051c446453c532f387d478df78d6f95c55 Author: Nicholas Clark <[email protected]> Date: Thu Oct 22 11:50:40 2009 +0100 Re-write S_utf16_textfilter() to correctly handle partial reads of UTF-16. Treat any (and all) octects after the BOM (or all, if there was no BOM) as initial read data for the filter, and call it to convert them to the first line, reading more if necessary. This correctly handles the "problem" that UTF-16LE read as a line, on the assumption that it's ASCII/ISO-8859-*/UTF-8/etc will be truncated after the first octect of the "\n\0" pair that is "\n" encoded as UTF-16LE. This fixes bug #69678. Read from the upstream filter in block mode, rather than line mode. M t/comp/utf.t M toke.c commit 9fb03e618192b6b5d49274cc64422acee51fe198 Author: Nicholas Clark <[email protected]> Date: Thu Oct 22 09:26:58 2009 +0100 Remove the "hack" that removes SVt_UTF8 in the UTF16 filter, by fixing t/TEST Given that t/TEST already had code to add -I../lib when testing UTF-8 with -utf8, do likewise for testing UTF-16 with -utf16. M t/TEST M toke.c ----------------------------------------------------------------------- Summary of changes: ext/XS-APItest/t/utf16_to_utf8.t | 20 +++---- t/TEST | 2 +- t/comp/utf.t | 16 +++--- toke.c | 105 +++++++++++++++++++++++++------------ utf8.c | 6 -- 5 files changed, 90 insertions(+), 59 deletions(-) diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t index 592d0b1..40a6288 100644 --- a/ext/XS-APItest/t/utf16_to_utf8.t +++ b/ext/XS-APItest/t/utf16_to_utf8.t @@ -23,15 +23,13 @@ for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD, } } -# Currently this is special-cased to work. Should it? - -is(utf16_to_utf8("\0"), "\0", 'Short string to utf16_to_utf8'); - -# But anything else is fatal - -my $got = eval {utf16_to_utf8('N')}; -like($@, qr/^panic: utf16_to_utf8: odd bytelen 1 at/, 'Odd byte length panics'); -is($got, undef, 'hence eval returns undef'); +foreach ("\0", 'N', 'Perl rules!') { + my $length = length $_; + my $got = eval {utf16_to_utf8($_)}; + like($@, qr/^panic: utf16_to_utf8: odd bytelen $length at/, + "Odd byte length panics for '$_'"); + is($got, undef, 'hence eval returns undef'); +} for (["\xD8\0\0\0", 'NULs'], ["\xD8\0\xD8\0", '2 Lows'], @@ -40,7 +38,7 @@ for (["\xD8\0\0\0", 'NULs'], ["\xDC\0\xDC\0", 'High High'], ) { my ($malformed, $name) = @$_; - $got = eval {utf16_to_utf8($malformed)}; + my $got = eval {utf16_to_utf8($malformed)}; like($@, qr/^Malformed UTF-16 surrogate at/, "Malformed surrogate $name croaks for utf16_to_utf8"); is($got, undef, 'hence eval returns undef'); @@ -53,7 +51,7 @@ for (["\xD8\0\0\0", 'NULs'], } my $in = "NA"; -$got = eval {utf16_to_utf8_reversed($in, 1)}; +my $got = eval {utf16_to_utf8_reversed($in, 1)}; like($@, qr/^panic: utf16_to_utf8_reversed: odd bytelen 1 at/, 'Odd byte length panics'); is($got, undef, 'hence eval returns undef'); diff --git a/t/TEST b/t/TEST index 10a0e85..4dee636 100755 --- a/t/TEST +++ b/t/TEST @@ -246,7 +246,7 @@ sub _scan_test { } } - my $utf8 = $::with_utf8 ? "-I$lib -Mutf8" : ''; + my $utf8 = ($::with_utf8 || $::with_utf16) ? "-I$lib -Mutf8" : ''; my %options = ( perl => $perl, diff --git a/t/comp/utf.t b/t/comp/utf.t index 6f79d27..c1a3e82 100644 --- a/t/comp/utf.t +++ b/t/comp/utf.t @@ -1,6 +1,6 @@ #!./perl -w -print "1..18\n"; +print "1..36\n"; my $test = 0; my %templates = ( @@ -17,26 +17,28 @@ sub bytes_to_utf { } sub test { - my ($enc, $tag, $bom) = @_; + my ($enc, $tag, $bom, $nl) = @_; open my $fh, ">", "utf$$.pl" or die "utf.pl: $!"; binmode $fh; - print $fh bytes_to_utf($enc, "$tag\n", $bom); + print $fh bytes_to_utf($enc, $tag . ($nl ? "\n" : ''), $bom); close $fh or die $!; my $got = do "./utf$$.pl"; $test = $test + 1; if (!defined $got) { - print "not ok $test # $enc $tag $bom; got undef\n"; + print "not ok $test # $enc $tag $bom $nl; got undef\n"; } elsif ($got ne $tag) { - print "not ok $test # $enc $tag $bom; got '$got'\n"; + print "not ok $test # $enc $tag $bom $nl; got '$got'\n"; } else { - print "ok $test\n"; + print "ok $test # $enc $tag $bom $nl\n"; } } for my $bom (0, 1) { for my $enc (qw(utf16le utf16be utf8)) { for my $value (123, 1234, 12345) { - test($enc, $value, $bom); + for my $nl (1, 0) { + test($enc, $value, $bom, $nl); + } } } } diff --git a/toke.c b/toke.c index 7edccf4..f795707 100644 --- a/toke.c +++ b/toke.c @@ -12780,7 +12780,8 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) return. */ SV *const filter = FILTER_DATA(idx); SV *const utf16_buffer = MUTABLE_SV(IoTOP_GV(filter)); - const I32 count = FILTER_READ(idx+1, utf16_buffer, maxlen); + SV *const utf8_buffer = MUTABLE_SV(IoFMT_GV(filter)); + IV status = IoPAGE(filter); const bool reverse = IoLINES(filter); /* As we're automatically added, at the lowest level, and hence only called @@ -12789,61 +12790,97 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) if (maxlen) { Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen); } + if (status < 0) { + Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%"IVdf")", status); + } DEBUG_P(PerlIO_printf(Perl_debug_log, - "utf16%s_textfilter(%p): %d %d (%d)\n", - reverse ? "rev" : "", + "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", FPTR2DPTR(void *, S_utf16_textfilter), - idx, maxlen, (int) count)); - if (count > 0) { - const STRLEN old = SvCUR(sv); + reverse ? 'l' : 'b', idx, maxlen, status, + (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); + + while (1) { + STRLEN chars; + STRLEN have; I32 newlen; U8 *end; + /* First, look in our buffer of existing UTF-8 data: */ + char *nl = (char *)memchr(SvPVX(utf8_buffer), '\n', SvCUR(utf8_buffer)); + + if (nl) { + ++nl; + } else if (status == 0) { + /* EOF */ + IoPAGE(filter) = 0; + nl = SvEND(utf8_buffer); + } + if (nl) { + sv_catpvn(sv, SvPVX(utf8_buffer), nl - SvPVX(utf8_buffer)); + /* Everything else in this code works just fine if SVp_POK isn't + set. This, however, needs it, and we need it to work, else + we loop infinitely because the buffer is never consumed. */ + sv_chop(utf8_buffer, nl); + break; + } + /* OK, not a complete line there, so need to read some more UTF-16. + Read an extra octect if the buffer currently has an odd number. */ + + while(SvCUR(utf16_buffer) < 2 && status > 0) { + status = FILTER_READ(idx + 1, utf16_buffer, + 160 + (SvCUR(utf16_buffer) & 1)); + DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer))); + if (status < 0) { + /* Error */ + IoPAGE(filter) = status; + return status; + } + } + + chars = SvCUR(utf16_buffer) >> 1; + have = SvCUR(utf8_buffer); + SvGROW(utf8_buffer, have + chars * 3 + 1); - SvGROW(sv, old + SvCUR(sv) * 3 / 2 + 1); if (reverse) { - /* You would expect this to be utf16_to_utf8_reversed() - It was, prior to 1de9afcdf18cf98bbdecaa782da93e907be6fe4e - Effectively, right now, UTF-16LE is being read in off-by-one - See RT #69678 */ - end = utf16_to_utf8((U8*)SvPVX(utf16_buffer), - (U8*)SvPVX_const(sv) + old, - SvCUR(utf16_buffer), &newlen); + end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer), + (U8*)SvPVX_const(utf8_buffer) + have, + chars * 2, &newlen); } else { end = utf16_to_utf8((U8*)SvPVX(utf16_buffer), - (U8*)SvPVX_const(sv) + old, - SvCUR(utf16_buffer), &newlen); + (U8*)SvPVX_const(utf8_buffer) + have, + chars * 2, &newlen); } - SvCUR_set(sv, old + newlen); + SvCUR_set(utf8_buffer, have + newlen); *end = '\0'; + + sv_chop(utf16_buffer, SvPVX(utf16_buffer) + chars * 2); } - SvCUR_set(utf16_buffer, 0); - /* This is to be bug-for-bug faithful with the implementation we've just - replaced. Without this, ./TEST -utf16 base/lex.t fails, attempting to - load utf8.pm */ - SvUTF8_off(sv); - DEBUG_P({sv_dump(sv);}); + DEBUG_P(PerlIO_printf(Perl_debug_log, + "utf16_textfilter: returns, status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", + status, + (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); + DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);}); return SvCUR(sv); } static U8 * S_add_utf16_textfilter(pTHX_ U8 *const s, bool reversed) { - U8 *news; - I32 newlen; SV *filter = filter_add(S_utf16_textfilter, NULL); - IoTOP_GV(filter) = MUTABLE_GV(newSV(160)); + IoTOP_GV(filter) = MUTABLE_GV(newSVpvn((char *)s, PL_bufend - (char*)s)); + IoFMT_GV(filter) = MUTABLE_GV(newSVpvs("")); IoLINES(filter) = reversed; - Newx(news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - if (reversed) { - utf16_to_utf8_reversed(s, news, PL_bufend - (char*)s - 1, &newlen); + IoPAGE(filter) = 1; /* Not EOF */ + + /* Sadly, we have to return a valid pointer, come what may, so we have to + ignore any error return from this. */ + SvCUR_set(PL_linestr, 0); + if (FILTER_READ(0, PL_linestr, 0)) { + SvUTF8_on(PL_linestr); } else { - utf16_to_utf8(s, news, PL_bufend - (char*)s, &newlen); + SvUTF8_on(PL_linestr); } - sv_setpvn(PL_linestr, (const char*)news, newlen); - Safefree(news); - SvUTF8_on(PL_linestr); - PL_bufend = SvPVX(PL_linestr) + newlen; + PL_bufend = SvEND(PL_linestr); return (U8*)SvPVX(PL_linestr); } #endif diff --git a/utf8.c b/utf8.c index dc12df8..c504891 100644 --- a/utf8.c +++ b/utf8.c @@ -958,12 +958,6 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8; - if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ - d[0] = 0; - *newlen = 1; - return d + 1; - } - if (bytelen & 1) Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); -- Perl5 Master Repository
