Dan Kogai <[EMAIL PROTECTED]> writes: >On Sunday, Oct 20, 2002, at 22:49 Asia/Tokyo, Nick Ing-Simmons wrote: >> Attached is patch that implements ->decode and ->encode of >> Encode::utf8 as XS code that obeys all the rules that Encode::XS does. >> This allows :encoding(UTF-8) to handle partial chars at end of buffers >> correctly. > >Thanks! applied & tested. Bad news is that it fails at test #4 in >t/mime-header.t > > > perl -Mblib t/mime-header.t >not ok 4 - decode Q > >I am currently investigating the cause....
Cause seems to be incoming octet sequence string has SvUTF8_on() true i.e. the octets have got themselves UTF-8 encoded. Attached "patch" is relative to bleadperl Attached "incremental" is on top of last patch. Checked in as //depot/perlio/...@18040 It does a downgrade on the source string. It is not clear at the moment if Encode::XS::decode should do likewise, or if it already does via another route. This time I ran make test in Encode... -- Nick Ing-Simmons http://www.ni-s.u-net.com/
--- devperl/ext/Encode/Encode.pm Sun Oct 20 13:58:28 2002 +++ perlio/ext/Encode/Encode.pm Sun Oct 20 13:46:30 2002 @@ -243,21 +243,7 @@ # was in Encode::utf8 package Encode::utf8; push @Encode::utf8::ISA, 'Encode::Encoding'; - *decode = sub{ - my ($obj,$octets,$chk) = @_; - my $str = Encode::decode_utf8($octets); - if (defined $str) { - $_[1] = '' if $chk; - return $str; - } - return undef; - }; - *encode = sub { - my ($obj,$string,$chk) = @_; - my $octets = Encode::encode_utf8($string); - $_[1] = '' if $chk; - return $octets; - }; + # encode and decode methods now in Encode.xs $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } --- devperl/ext/PerlIO/t/encoding.t Thu Jun 20 08:15:31 2002 +++ perlio/ext/PerlIO/t/encoding.t Sun Oct 20 14:42:51 2002 @@ -12,13 +12,14 @@ } } -print "1..13\n"; +print "1..14\n"; my $grk = "grk$$"; my $utf = "utf$$"; my $fail1 = "fa$$"; my $fail2 = "fb$$"; my $russki = "koi8r$$"; +my $threebyte = "3byte$$"; if (open(GRK, ">$grk")) { binmode(GRK, ":bytes"); @@ -131,6 +132,21 @@ print "$warn"; } +# Create a string of chars that are 3 bytes in UTF-8 +my $str = "\x{1f80}" x 2048; + +# Write them to a file +open(F,'>:utf8',$threebyte) || die "Cannot open $threebyte:$!"; +print F $str; +close(F); + +# Read file back as UTF-8 +open(F,'<:encoding(utf-8)',$threebyte) || die "Cannot open $threebyte:$!"; +my $dstr = <F>; +close(F); +print "not " unless ($dstr eq $str); +print "ok 14\n"; + END { - unlink($grk, $utf, $fail1, $fail2, $russki); + unlink($grk, $utf, $fail1, $fail2, $russki, $threebyte); } --- devperl/ext/Encode/Encode.xs Wed May 29 16:18:55 2002 +++ perlio/ext/Encode/Encode.xs Sun Oct 20 18:16:48 2002 @@ -238,6 +238,134 @@ return dst; } +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +void +Method_decode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen); + SvPOK_only(dst); + SvCUR_set(dst,0); + if (SvUTF8(src)) { + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } + } + while (s < e) { + if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character - done */ + break; + } + else if (is_utf8_char(s)) { + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + else { + /* starts ok but isn't "good" */ + } + } + else { + /* Invalid start byte */ + } + /* If we get here there is something wrong with alleged UTF-8 */ + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); + XSRETURN(0); + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", (UV)*s); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = newSVpvf("\\x%02" UVXf, (UV)*s); + sv_catsv(dst, perlqq); + SvREFCNT_dec(perlqq); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s++; + } + *SvEND(dst) = '\0'; + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvUTF8_on(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + +void +Method_encode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen); + if (SvUTF8(src)) { + /* Already encoded - trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } + else { + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst,2*slen); + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; + } + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvPOK_only(dst); + SvUTF8_off(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE
--- Encode.xs.prev Sun Oct 20 18:21:32 2002 +++ Encode.xs Sun Oct 20 18:16:48 2002 @@ -253,6 +253,17 @@ SV *dst = newSV(slen); SvPOK_only(dst); SvCUR_set(dst,0); + if (SvUTF8(src)) { + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } + } while (s < e) { if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { U8 skip = UTF8SKIP(s);