In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/15f5e486022b574631307e6a27ca1b961591e561?hp=5aa240eab7dbaa91f98c2fee1f04b6c0b5a9b9e3>
- Log ----------------------------------------------------------------- commit 15f5e486022b574631307e6a27ca1b961591e561 Author: Steve Hay <[email protected]> Date: Tue Dec 6 08:41:46 2016 +0000 Upgrade Encode from version 2.86 to 2.88 (Unicode.pm is customized for a version-bump only, to silence t/porting/cmp_version.t since Unicode.xs has changed.) ----------------------------------------------------------------------- Summary of changes: MANIFEST | 6 + Porting/Maintainers.pl | 4 +- cpan/Encode/Encode.pm | 31 +- cpan/Encode/Encode.xs | 519 +++++++++++++++++----------------- cpan/Encode/Encode/_T.e2x | 6 +- cpan/Encode/Makefile.PL | 13 +- cpan/Encode/Unicode/Makefile.PL | 2 +- cpan/Encode/Unicode/Unicode.pm | 2 +- cpan/Encode/Unicode/Unicode.xs | 110 +++++-- cpan/Encode/bin/enc2xs | 58 +++- cpan/Encode/encoding.pm | 4 +- cpan/Encode/lib/Encode/Alias.pm | 14 +- cpan/Encode/lib/Encode/CN/HZ.pm | 5 +- cpan/Encode/lib/Encode/MIME/Header.pm | 471 +++++++++++++++++++----------- cpan/Encode/lib/Encode/MIME/Name.pm | 14 +- cpan/Encode/t/Aliases.t | 2 +- cpan/Encode/t/Encode.t | 54 +++- cpan/Encode/t/at-cn.t | 4 +- cpan/Encode/t/at-tw.t | 4 +- cpan/Encode/t/decode.t | 56 +++- cpan/Encode/t/enc_data.t | 8 +- cpan/Encode/t/enc_eucjp.t | 2 +- cpan/Encode/t/enc_module.t | 8 +- cpan/Encode/t/enc_utf8.t | 2 +- cpan/Encode/t/encoding-locale.t | 2 +- cpan/Encode/t/encoding.t | 6 +- cpan/Encode/t/fallback.t | 2 +- cpan/Encode/t/jperl.t | 6 +- cpan/Encode/t/magic.t | 144 ++++++++++ cpan/Encode/t/mime-header.t | 215 +++++++++++++- cpan/Encode/t/mime-name.t | 34 ++- cpan/Encode/t/rt113164.t | 38 +++ cpan/Encode/t/rt65541.t | 29 ++ cpan/Encode/t/rt76824.t | 60 ++++ cpan/Encode/t/rt85489.t | 48 ++++ cpan/Encode/t/rt86327.t | 33 +++ cpan/Encode/t/taint.t | 28 +- cpan/Encode/t/utf8ref.t | 21 +- cpan/Encode/t/utf8strict.t | 51 +++- t/porting/customized.dat | 2 +- 40 files changed, 1568 insertions(+), 550 deletions(-) create mode 100644 cpan/Encode/t/magic.t create mode 100644 cpan/Encode/t/rt113164.t create mode 100644 cpan/Encode/t/rt65541.t create mode 100644 cpan/Encode/t/rt76824.t create mode 100644 cpan/Encode/t/rt85489.t create mode 100644 cpan/Encode/t/rt86327.t diff --git a/MANIFEST b/MANIFEST index 69df013795..be93d824a7 100644 --- a/MANIFEST +++ b/MANIFEST @@ -968,6 +968,7 @@ cpan/Encode/t/jisx0212.utf test data cpan/Encode/t/jperl.t test script cpan/Encode/t/ksc5601.enc test data cpan/Encode/t/ksc5601.utf test data +cpan/Encode/t/magic.t test script cpan/Encode/t/mime-header.t test script cpan/Encode/t/mime-name.t test script cpan/Encode/t/mime_header_iso2022jp.t test script @@ -975,6 +976,11 @@ cpan/Encode/t/Mod_EUCJP.pm module that t/enc_module.enc uses cpan/Encode/t/perlio.t test script cpan/Encode/t/piconv.t Test for piconv.t cpan/Encode/t/rt.pl test script +cpan/Encode/t/rt113164.t test script +cpan/Encode/t/rt65541.t test script +cpan/Encode/t/rt76824.t test script +cpan/Encode/t/rt85489.t test script +cpan/Encode/t/rt86327.t test script cpan/Encode/t/taint.t cpan/Encode/t/unibench.pl benchmark script cpan/Encode/t/Unicode.t test script diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index 49bdc31474..c58ee5b951 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -397,9 +397,9 @@ use File::Glob qw(:case); }, 'Encode' => { - 'DISTRIBUTION' => 'DANKOGAI/Encode-2.86.tar.gz', + 'DISTRIBUTION' => 'DANKOGAI/Encode-2.88.tar.gz', 'FILES' => q[cpan/Encode], - 'CUSTOMIZED' => [ qw[ Encode.xs ] ], + 'CUSTOMIZED' => [ qw(Unicode/Unicode.pm) ], }, 'encoding::warnings' => { diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm index bda8e1b316..57b4292279 100644 --- a/cpan/Encode/Encode.pm +++ b/cpan/Encode/Encode.pm @@ -1,10 +1,10 @@ # -# $Id: Encode.pm,v 2.86 2016/08/10 18:08:01 dankogai Exp $ +# $Id: Encode.pm,v 2.88 2016/11/29 23:30:30 dankogai Exp dankogai $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.86 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.88 $ =~ /(\d+)/g; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; use XSLoader (); XSLoader::load( __PACKAGE__, $VERSION ); @@ -15,7 +15,7 @@ use Exporter 5.57 'import'; our @EXPORT = qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str - encodings find_encoding clone_encoding + encodings find_encoding find_mime_encoding clone_encoding ); our @FB_FLAGS = qw( DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC @@ -102,6 +102,8 @@ sub define_encoding { sub getEncoding { my ( $class, $name, $skip_external ) = @_; + defined($name) or return; + $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 ref($name) && $name->can('renew') and return $name; @@ -130,6 +132,14 @@ sub find_encoding($;$) { return __PACKAGE__->getEncoding( $name, $skip_external ); } +sub find_mime_encoding($;$) { + my ( $mime_name, $skip_external ) = @_; + eval { require Encode::MIME::Name; }; + $@ and return; + my $name = Encode::MIME::Name::get_encode_name( $mime_name ); + return find_encoding( $name, $skip_external ); +} + sub resolve_alias($) { my $obj = find_encoding(shift); defined $obj and return $obj->name; @@ -254,6 +264,7 @@ sub from_to($$$;$) { sub encode_utf8($) { my ($str) = @_; + return undef unless defined $str; utf8::encode($str); return $str; } @@ -576,6 +587,20 @@ name of the encoding object. See L<Encode::Encoding> for details. +=head3 find_mime_encoding + + [$obj =] find_mime_encoding(MIME_ENCODING) + +Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts +same as C<find_encoding()> but C<mime_name()> of returned object must +match to I<MIME_ENCODING>. So as opposite of C<find_encoding()> +canonical names and aliases are not used when searching for object. + + find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING> + find_mime_encoding("utf-8"); # returns encode object "utf-8-strict" + find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive + find_mime_encoding("utf-8-strict"); returns undef because "utf-8-strict" is not valid I<MIME_ENCODING> + =head3 from_to [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs index 222f39b2ea..b5160d2516 100644 --- a/cpan/Encode/Encode.xs +++ b/cpan/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.37 2016/08/10 18:08:45 dankogai Exp dankogai $ + $Id: Encode.xs,v 2.39 2016/11/29 23:29:23 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -31,6 +31,10 @@ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +#ifndef SvIV_nomg +#define SvIV_nomg SvIV +#endif + #ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE # define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE #else @@ -76,6 +80,37 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) PERL_UNUSED_VAR(orig); } +static void +utf8_safe_downgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) +{ + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)*s, *slen)); + SvUTF8_on(tmp); + if (SvTAINTED(*src)) + SvTAINTED_on(tmp); + *src = tmp; + *s = (U8 *)SvPVX(*src); + } + if (*slen) { + if (!utf8_to_bytes(*s, slen)) + croak("Wide character"); + SvCUR_set(*src, *slen); + } + SvUTF8_off(*src); +} + +static void +utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) +{ + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)*s, *slen)); + if (SvTAINTED(*src)) + SvTAINTED_on(tmp); + *src = tmp; + } + sv_utf8_upgrade_nomg(*src); + *s = (U8 *)SvPV_nomg(*src, *slen); +} #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" @@ -104,18 +139,16 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) } static SV * -encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, +encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen, int check, STRLEN * offset, SV * term, int * retcode, SV *fallback_cb) { - STRLEN slen; - U8 *s = (U8 *) SvPV(src, slen); STRLEN tlen = slen; STRLEN ddone = 0; STRLEN sdone = 0; /* We allocate slen+1. PerlIO dumps core if this value is smaller than this. */ - SV *dst = sv_2mortal(newSV(slen+1)); + SV *dst = newSV(slen+1); U8 *d = (U8 *)SvPVX(dst); STRLEN dlen = SvLEN(dst)-1; int code = 0; @@ -191,10 +224,10 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, if (dir == enc->f_utf8) { STRLEN clen; UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + utf8n_to_uvuni(s+slen, (tlen-sdone-slen), &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); /* if non-representable multibyte prefix at end of current buffer - break*/ - if (clen > tlen - sdone) break; + if (clen > tlen - sdone - slen) break; if (check & ENCODE_DIE_ON_ERR) { Perl_croak(aTHX_ ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); @@ -211,7 +244,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, SV* subchar = (fallback_cb != &PL_sv_undef) ? do_fallback_cb(aTHX_ ch, fallback_cb) - : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : + : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04" UVxf "}" : check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : "&#x%" UVxf ";", (UV)ch); SvUTF8_off(subchar); /* make sure no decoded string gets in */ @@ -279,6 +312,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, sv_setpvn(src, (char*)s+slen, sdone); } SvCUR_set(src, sdone); + SvSETMAGIC(src); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ @@ -318,6 +352,62 @@ strict_utf8(pTHX_ SV* sv) return SvTRUE(*svp); } +/* + * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126 + */ +#ifndef UNICODE_IS_NONCHAR +#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE) +#endif + +#ifndef UNICODE_IS_SUPER +#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX) +#endif + +#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c)) + +#ifndef UTF_ACCUMULATION_OVERFLOW_MASK +#ifndef CHARBITS +#define CHARBITS CHAR_BIT +#endif +#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT)) +#endif + +/* + * Convert non strict utf8 sequence of len >= 2 to unicode codepoint + */ +static UV +convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) +{ + UV uv; + U8 *ptr = s; + bool overflowed = 0; + + uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len); + + len--; + s++; + + while (len--) { + if (!UTF8_IS_CONTINUATION(*s)) { + *rlen = s-ptr; + return 0; + } + if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) + overflowed = 1; + uv = UTF8_ACCUMULATE(uv, *s); + s++; + } + + *rlen = s-ptr; + + if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) { + *rlen = 1; + return 0; + } + + return uv; +} + static U8* process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, bool encode, bool strict, bool stop_at_partial) @@ -336,7 +426,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, } else { fallback_cb = &PL_sv_undef; - check = SvIV(check_sv); + check = SvIV_nomg(check_sv); } SvPOK_only(dst); @@ -351,39 +441,30 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, continue; } + ulen = 1; if (UTF8_IS_START(*s)) { U8 skip = UTF8SKIP(s); if ((s + skip) > e) { if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) { const U8 *p = s + 1; for (; p < e; p++) { - if (!UTF8_IS_CONTINUATION(*p)) + if (!UTF8_IS_CONTINUATION(*p)) { + ulen = p-s; goto malformed_byte; + } } break; } + ulen = e-s; goto malformed_byte; } - uv = utf8n_to_uvuni(s, e - s, &ulen, - UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : - UTF8_ALLOW_NONSTRICT) - ); -#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ - if (strict && uv > PERL_UNICODE_MAX) - ulen = (STRLEN) -1; -#endif - if (ulen == (STRLEN) -1) { - if (strict) { - uv = utf8n_to_uvuni(s, e - s, &ulen, - UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); - if (ulen == (STRLEN) -1) - goto malformed_byte; - goto malformed; - } + uv = convert_utf8_multi_seq(s, skip, &ulen); + if (uv == 0) goto malformed_byte; - } + else if (strict && !UNICODE_IS_STRICT(uv)) + goto malformed; /* Whole char is good */ @@ -396,7 +477,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, /* If we get here there is something wrong with alleged UTF-8 */ malformed_byte: uv = (UV)*s; - ulen = 1; + if (ulen == 0) + ulen = 1; malformed: if (check & ENCODE_DIE_ON_ERR){ @@ -456,10 +538,6 @@ MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ PROTOTYPES: DISABLE -#ifndef SvIsCOW -# define SvIsCOW(sv) (SvREADONLY(sv) && SvFAKE(sv)) -#endif - void Method_decode_xs(obj,src,check_sv = &PL_sv_no) SV * obj @@ -472,23 +550,26 @@ PREINIT: SV *dst; bool renewed = 0; int check; + bool modify; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + modify = (check && !(check & ENCODE_LEAVE_SRC)); CODE: { - dSP; ENTER; SAVETMPS; - if (src == &PL_sv_undef || SvROK(src)) src = sv_2mortal(newSV(0)); - check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); - if (!(check & ENCODE_LEAVE_SRC) && SvIsCOW(src)) { - /* - * disassociate from any other scalars before doing - * in-place modifications - */ - sv_force_normal(src); - } - s = (U8 *) SvPV(src, slen); - e = (U8 *) SvEND(src); + dSP; + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + e = s+slen; + /* * PerlIO check -- we assume the object is of PerlIO if renewed */ + ENTER; SAVETMPS; PUSHMARK(sp); XPUSHs(obj); PUTBACK; @@ -503,28 +584,17 @@ CODE: FREETMPS; LEAVE; /* end PerlIO check */ - if (SvUTF8(src)) { - s = utf8_to_bytes(s,&slen); - if (s) { - SvCUR_set(src,slen); - SvUTF8_off(src); - e = s+slen; - } - else { - croak("Cannot decode string with wide characters"); - } - } - dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */ s = process_utf8(aTHX_ dst, s, e, check_sv, 0, strict_utf8(aTHX_ obj), renewed); /* Clear out translated part of source unless asked not to */ - if (check && !(check & ENCODE_LEAVE_SRC)){ + if (modify) { slen = e-s; if (slen) { sv_setpvn(src, (char*)s, slen); } SvCUR_set(src, slen); + SvSETMAGIC(src); } SvUTF8_on(dst); if (SvTAINTED(src)) SvTAINTED_on(dst); /* propagate taintedness */ @@ -543,12 +613,18 @@ PREINIT: U8 *e; SV *dst; int check; + bool modify; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + modify = (check && !(check & ENCODE_LEAVE_SRC)); CODE: { - check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); - if (src == &PL_sv_undef || SvROK(src)) src = sv_2mortal(newSV(0)); - s = (U8 *) SvPV(src, slen); - e = (U8 *) SvEND(src); + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + e = s+slen; dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { /* Already encoded */ @@ -584,12 +660,13 @@ CODE: } /* Clear out translated part of source unless asked not to */ - if (check && !(check & ENCODE_LEAVE_SRC)){ + if (modify) { slen = e-s; if (slen) { sv_setpvn(src, (char*)s, slen); } SvCUR_set(src, slen); + SvSETMAGIC(src); } SvPOK_only(dst); SvUTF8_off(dst); @@ -638,24 +715,35 @@ SV * src SV * off SV * term SV * check_sv -CODE: -{ +PREINIT: int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - STRLEN offset = (STRLEN)SvIV(off); + SV *fallback_cb; + bool modify; + encode_t *enc; + STRLEN offset; int code = 0; - if (SvUTF8(src)) { - sv_utf8_downgrade(src, FALSE); - } - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, - &offset, term, &code, fallback_cb)); + U8 *s; + STRLEN slen; + SV *tmp; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + offset = (STRLEN)SvIV(off); +CODE: +{ + if (!SvOK(src)) + XSRETURN_NO; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + tmp = encode_method(aTHX_ enc, enc->t_utf8, src, s, slen, check, + &offset, term, &code, fallback_cb); + sv_catsv(dst, tmp); + SvREFCNT_dec(tmp); SvIV_set(off, (IV)offset); if (code == ENCODE_FOUND_TERM) { ST(0) = &PL_sv_yes; @@ -665,79 +753,70 @@ CODE: XSRETURN(1); } -void +SV * Method_decode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src SV * check_sv +PREINIT: + int check; + SV *fallback_cb; + bool modify; + encode_t *enc; + U8 *s; + STRLEN slen; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); CODE: { - int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - if (SvREADONLY(src) || SvSMAGICAL(src) || SvGMAGICAL(src) || !SvPOK(src)) { - SV *tmp; - tmp = sv_newmortal(); - sv_copypv(tmp, src); - src = tmp; - } - if (SvUTF8(src)) { - sv_utf8_downgrade(src, FALSE); - } - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + RETVAL = encode_method(aTHX_ enc, enc->t_utf8, src, s, slen, check, NULL, Nullsv, NULL, fallback_cb); - SvUTF8_on(ST(0)); - XSRETURN(1); + SvUTF8_on(RETVAL); } +OUTPUT: + RETVAL - -#ifndef SvPV_force_nolen -# define SvPV_force_nolen(sv) SvPV_force_flags_nolen(sv, SV_GMAGIC) -#endif - -#ifndef SvPV_force_flags_nolen -# define SvPV_force_flags_nolen(sv, flags) \ - ((SvFLAGS(sv) & (SVf_POK|SVf_THINKFIRST)) == SVf_POK \ - ? SvPVX(sv) : sv_pvn_force_flags(sv, &PL_na, flags)) -#endif - -void +SV * Method_encode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src SV * check_sv +PREINIT: + int check; + SV *fallback_cb; + bool modify; + encode_t *enc; + U8 *s; + STRLEN slen; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); CODE: { - int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - if (SvREADONLY(src) || SvSMAGICAL(src) || SvGMAGICAL(src) || !SvPOK(src)) { - /* - SV *tmp; - tmp = sv_newmortal(); - sv_copypv(tmp, src); - src = tmp; - */ - src = sv_mortalcopy(src); - SvPV_force_nolen(src); - } - sv_utf8_upgrade(src); - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (!SvUTF8(src)) + utf8_safe_upgrade(aTHX_ &src, &s, &slen, modify); + RETVAL = encode_method(aTHX_ enc, enc->f_utf8, src, s, slen, check, NULL, Nullsv, NULL, fallback_cb); - XSRETURN(1); } +OUTPUT: + RETVAL void Method_needs_lines(obj) @@ -753,6 +832,8 @@ CODE: void Method_perlio_ok(obj) SV * obj +PREINIT: + SV *sv; CODE: { /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ @@ -762,7 +843,8 @@ CODE: eval_pv("require PerlIO::encoding", 0); SPAGAIN; - if (SvTRUE(get_sv("@", 0))) { + sv = get_sv("@", 0); + if (SvTRUE(sv)) { ST(0) = &PL_sv_no; }else{ ST(0) = &PL_sv_yes; @@ -773,6 +855,8 @@ CODE: void Method_mime_name(obj) SV * obj +PREINIT: + SV *sv; CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); @@ -780,7 +864,8 @@ CODE: eval_pv("require Encode::MIME::Name", 0); SPAGAIN; - if (SvTRUE(get_sv("@", 0))) { + sv = get_sv("@", 0); + if (SvTRUE(sv)) { ST(0) = &PL_sv_undef; }else{ ENTER; @@ -903,17 +988,16 @@ bool is_utf8(sv, check = 0) SV * sv int check +PREINIT: + char *str; + STRLEN len; CODE: { - if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ + SvGETMAGIC(sv); /* SvGETMAGIC() can modify SvOK flag */ + str = SvOK(sv) ? SvPV_nomg(sv, len) : NULL; /* SvPV() can modify SvUTF8 flag */ RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) + if (RETVAL && check && (!str || !is_utf8_string((U8 *)str, len))) RETVAL = FALSE; - if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ } OUTPUT: RETVAL @@ -923,13 +1007,14 @@ _utf8_on(sv) SV * sv CODE: { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - if (SvIsCOW(sv)) sv_force_normal(sv); - SvUTF8_on(sv); + SvGETMAGIC(sv); + if (!SvTAINTED(sv) && SvPOKp(sv)) { + if (SvTHINKFIRST(sv)) sv_force_normal(sv); + RETVAL = newSViv(SvUTF8(sv)); + SvUTF8_on(sv); + SvSETMAGIC(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: @@ -940,124 +1025,38 @@ _utf8_off(sv) SV * sv CODE: { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - if (SvIsCOW(sv)) sv_force_normal(sv); - SvUTF8_off(sv); + SvGETMAGIC(sv); + if (!SvTAINTED(sv) && SvPOKp(sv)) { + if (SvTHINKFIRST(sv)) sv_force_normal(sv); + RETVAL = newSViv(SvUTF8(sv)); + SvUTF8_off(sv); + SvSETMAGIC(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: RETVAL -int -DIE_ON_ERR() -CODE: - RETVAL = ENCODE_DIE_ON_ERR; -OUTPUT: - RETVAL - -int -WARN_ON_ERR() -CODE: - RETVAL = ENCODE_WARN_ON_ERR; -OUTPUT: - RETVAL - -int -LEAVE_SRC() -CODE: - RETVAL = ENCODE_LEAVE_SRC; -OUTPUT: - RETVAL - -int -RETURN_ON_ERR() -CODE: - RETVAL = ENCODE_RETURN_ON_ERR; -OUTPUT: - RETVAL - -int -PERLQQ() -CODE: - RETVAL = ENCODE_PERLQQ; -OUTPUT: - RETVAL - -int -HTMLCREF() -CODE: - RETVAL = ENCODE_HTMLCREF; -OUTPUT: - RETVAL - -int -XMLCREF() -CODE: - RETVAL = ENCODE_XMLCREF; -OUTPUT: - RETVAL - -int -STOP_AT_PARTIAL() -CODE: - RETVAL = ENCODE_STOP_AT_PARTIAL; -OUTPUT: - RETVAL - -int -FB_DEFAULT() -CODE: - RETVAL = ENCODE_FB_DEFAULT; -OUTPUT: - RETVAL - -int -FB_CROAK() -CODE: - RETVAL = ENCODE_FB_CROAK; -OUTPUT: - RETVAL - -int -FB_QUIET() -CODE: - RETVAL = ENCODE_FB_QUIET; -OUTPUT: - RETVAL - -int -FB_WARN() -CODE: - RETVAL = ENCODE_FB_WARN; -OUTPUT: - RETVAL - -int -FB_PERLQQ() -CODE: - RETVAL = ENCODE_FB_PERLQQ; -OUTPUT: - RETVAL - -int -FB_HTMLCREF() -CODE: - RETVAL = ENCODE_FB_HTMLCREF; -OUTPUT: - RETVAL - -int -FB_XMLCREF() -CODE: - RETVAL = ENCODE_FB_XMLCREF; -OUTPUT: - RETVAL - BOOT: { + HV *stash = gv_stashpvn("Encode", strlen("Encode"), GV_ADD); + newCONSTSUB(stash, "DIE_ON_ERR", newSViv(ENCODE_DIE_ON_ERR)); + newCONSTSUB(stash, "WARN_ON_ERR", newSViv(ENCODE_WARN_ON_ERR)); + newCONSTSUB(stash, "RETURN_ON_ERR", newSViv(ENCODE_RETURN_ON_ERR)); + newCONSTSUB(stash, "LEAVE_SRC", newSViv(ENCODE_LEAVE_SRC)); + newCONSTSUB(stash, "PERLQQ", newSViv(ENCODE_PERLQQ)); + newCONSTSUB(stash, "HTMLCREF", newSViv(ENCODE_HTMLCREF)); + newCONSTSUB(stash, "XMLCREF", newSViv(ENCODE_XMLCREF)); + newCONSTSUB(stash, "STOP_AT_PARTIAL", newSViv(ENCODE_STOP_AT_PARTIAL)); + newCONSTSUB(stash, "FB_DEFAULT", newSViv(ENCODE_FB_DEFAULT)); + newCONSTSUB(stash, "FB_CROAK", newSViv(ENCODE_FB_CROAK)); + newCONSTSUB(stash, "FB_QUIET", newSViv(ENCODE_FB_QUIET)); + newCONSTSUB(stash, "FB_WARN", newSViv(ENCODE_FB_WARN)); + newCONSTSUB(stash, "FB_PERLQQ", newSViv(ENCODE_FB_PERLQQ)); + newCONSTSUB(stash, "FB_HTMLCREF", newSViv(ENCODE_FB_HTMLCREF)); + newCONSTSUB(stash, "FB_XMLCREF", newSViv(ENCODE_FB_XMLCREF)); +} +{ #include "def_t.exh" } diff --git a/cpan/Encode/Encode/_T.e2x b/cpan/Encode/Encode/_T.e2x index 6cf5f293d5..7b9a67e43d 100644 --- a/cpan/Encode/Encode/_T.e2x +++ b/cpan/Encode/Encode/_T.e2x @@ -2,6 +2,8 @@ use strict; # Adjust the number here! use Test::More tests => 2; -use_ok('Encode'); -use_ok('Encode::$_Name_'); +BEGIN { + use_ok('Encode'); + use_ok('Encode::$_Name_'); +} # Add more test here! diff --git a/cpan/Encode/Makefile.PL b/cpan/Encode/Makefile.PL index c87153bbb3..8203105247 100644 --- a/cpan/Encode/Makefile.PL +++ b/cpan/Encode/Makefile.PL @@ -1,16 +1,26 @@ # -# $Id: Makefile.PL,v 2.17 2016/08/04 03:15:58 dankogai Exp $ +# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp dankogai $ # use 5.007003; use strict; use warnings; use ExtUtils::MakeMaker; use File::Spec; +use Config; # Just for sure :) my %ARGV = map { my @r = split /=/,$_; defined $r[1] or $r[1]=1; @r } @ARGV; $ARGV{DEBUG} and warn "$_ => $ARGV{$_}\n" for sort keys %ARGV; $ENV{PERL_CORE} ||= $ARGV{PERL_CORE} if $ARGV{PERL_CORE}; +# similar strictness as in core +my $ccflags = $Config{ccflags}; +if (!$ENV{PERL_CORE}) { + if ($Config{gccversion}) { + $ccflags .= ' -Werror=declaration-after-statement'; + $ccflags .= ' -Wpointer-sign' unless $Config{d_cplusplus}; + $ccflags .= ' -fpermissive' if $Config{d_cplusplus}; + } +} my %tables = ( @@ -45,6 +55,7 @@ WriteMakefile( SUFFIX => 'gz', DIST_DEFAULT => 'all tardist', }, + CCFLAGS => $ccflags, INC => '-I' . File::Spec->catfile( '.', 'Encode' ), LICENSE => 'perl', PREREQ_PM => { diff --git a/cpan/Encode/Unicode/Makefile.PL b/cpan/Encode/Unicode/Makefile.PL index ce48b7aace..b28d16bb96 100644 --- a/cpan/Encode/Unicode/Makefile.PL +++ b/cpan/Encode/Unicode/Makefile.PL @@ -3,7 +3,7 @@ use strict; use ExtUtils::MakeMaker; WriteMakefile( - INC => "-I../Encode", + INC => "-I../Encode", NAME => 'Encode::Unicode', VERSION_FROM => "Unicode.pm", MAN3PODS => {}, diff --git a/cpan/Encode/Unicode/Unicode.pm b/cpan/Encode/Unicode/Unicode.pm index 7dec3e3815..fc1d3d1382 100644 --- a/cpan/Encode/Unicode/Unicode.pm +++ b/cpan/Encode/Unicode/Unicode.pm @@ -4,7 +4,7 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.15 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.15_01 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); diff --git a/cpan/Encode/Unicode/Unicode.xs b/cpan/Encode/Unicode/Unicode.xs index 3bad2adae0..117e14d83f 100644 --- a/cpan/Encode/Unicode/Unicode.xs +++ b/cpan/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.14 2016/01/22 06:33:07 dankogai Exp $ + $Id: Unicode.xs,v 2.15 2016/11/29 23:29:23 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -125,8 +125,6 @@ PROTOTYPES: DISABLE #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) -#define attr_true(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ - SvTRUE(*hv_fetch((HV *)SvRV(obj),k,l,0)) : FALSE) void decode_xs(obj, str, check = 0) @@ -135,26 +133,54 @@ SV * str IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - int size = SvIV(attr("size", 4)); + SV *sve = attr("endian", 6); + U8 endian = *((U8 *)SvPV_nolen(sve)); + SV *svs = attr("size", 4); + int size = SvIV(svs); int ucs2 = -1; /* only needed in the event of surrogate pairs */ SV *result = newSVpvn("",0); STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ STRLEN ulen; STRLEN resultbuflen; U8 *resultbuf; - U8 *s = (U8 *)SvPVbyte(str,ulen); - U8 *e = (U8 *)SvEND(str); + U8 *s; + U8 *e; + bool modify = (check && !(check & ENCODE_LEAVE_SRC)); + bool temp_result; + + SvGETMAGIC(str); + if (!SvOK(str)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen); + if (SvUTF8(str)) { + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); + SvUTF8_on(tmp); + if (SvTAINTED(str)) + SvTAINTED_on(tmp); + str = tmp; + s = (U8 *)SvPVX(str); + } + if (ulen) { + if (!utf8_to_bytes(s, &ulen)) + croak("Wide character"); + SvCUR_set(str, ulen); + } + SvUTF8_off(str); + } + e = s+ulen; + /* Optimise for the common case of being called from PerlIOEncode_fill() with a standard length buffer. In this case the result SV's buffer is only used temporarily, so we can afford to allocate the maximum needed and not care about unused space. */ - const bool temp_result = (ulen == PERLIO_BUFSIZ); + temp_result = (ulen == PERLIO_BUFSIZ); ST(0) = sv_2mortal(result); SvUTF8_on(result); if (!endian && s+size <= e) { + SV *sv; UV bom; endian = (size == 4) ? 'N' : 'n'; bom = enc_unpack(aTHX_ &s,e,size,endian); @@ -183,8 +209,9 @@ CODE: } #if 1 /* Update endian for next sequence */ - if (attr_true("renewed", 7)) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + sv = attr("renewed", 7); + if (SvTRUE(sv)) { + (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); } #endif } @@ -202,11 +229,12 @@ CODE: U8 *d; if (issurrogate(ord)) { if (ucs2 == -1) { - ucs2 = attr_true("ucs2", 4); + SV *sv = attr("ucs2", 4); + ucs2 = SvTRUE(sv); } if (ucs2 || size == 4) { if (check) { - croak("%"SVf":no surrogates allowed %"UVxf, + croak("%" SVf ":no surrogates allowed %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -216,7 +244,7 @@ CODE: UV lo; if (!isHiSurrogate(ord)) { if (check) { - croak("%"SVf":Malformed HI surrogate %"UVxf, + croak("%" SVf ":Malformed HI surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -231,7 +259,7 @@ CODE: break; } else { - croak("%"SVf":Malformed HI surrogate %"UVxf, + croak("%" SVf ":Malformed HI surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -244,7 +272,7 @@ CODE: lo = enc_unpack(aTHX_ &s,e,size,endian); if (!isLoSurrogate(lo)) { if (check) { - croak("%"SVf":Malformed LO surrogate %"UVxf, + croak("%" SVf ":Malformed LO surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -262,7 +290,7 @@ CODE: if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { if (check) { - croak("%"SVf":Unicode character %"UVxf" is illegal", + croak("%" SVf ":Unicode character %" UVxf " is illegal", *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } else { @@ -295,7 +323,7 @@ CODE: if (s < e) { /* unlikely to happen because it's fixed-length -- dankogai */ if (check & ENCODE_WARN_ON_ERR) { - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character", *hv_fetch((HV *)SvRV(obj),"Name",4,0)); } } @@ -308,6 +336,7 @@ CODE: SvCUR_set(str,0); } *SvEND(str) = '\0'; + SvSETMAGIC(str); } if (!temp_result) shrink_buffer(result); @@ -322,19 +351,40 @@ SV * utf8 IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - const int size = SvIV(attr("size", 4)); + SV *sve = attr("endian", 6); + U8 endian = *((U8 *)SvPV_nolen(sve)); + SV *svs = attr("size", 4); + const int size = SvIV(svs); int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ const STRLEN usize = (size > 0 ? size : 1); SV *result = newSVpvn("", 0); STRLEN ulen; - U8 *s = (U8 *) SvPVutf8(utf8, ulen); - const U8 *e = (U8 *) SvEND(utf8); + U8 *s; + U8 *e; + bool modify = (check && !(check & ENCODE_LEAVE_SRC)); + bool temp_result; + + SvGETMAGIC(utf8); + if (!SvOK(utf8)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen); + if (!SvUTF8(utf8)) { + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); + if (SvTAINTED(utf8)) + SvTAINTED_on(tmp); + utf8 = tmp; + } + sv_utf8_upgrade_nomg(utf8); + s = (U8 *)SvPV_nomg(utf8, ulen); + } + e = s+ulen; + /* Optimise for the common case of being called from PerlIOEncode_flush() with a standard length buffer. In this case the result SV's buffer is only used temporarily, so we can afford to allocate the maximum needed and not care about unused space. */ - const bool temp_result = (ulen == PERLIO_BUFSIZ); + temp_result = (ulen == PERLIO_BUFSIZ); ST(0) = sv_2mortal(result); @@ -344,12 +394,14 @@ CODE: SvGROW(result, ((ulen+1) * usize)); if (!endian) { + SV *sv; endian = (size == 4) ? 'N' : 'n'; enc_pack(aTHX_ result,size,endian,BOM_BE); #if 1 /* Update endian for next sequence */ - if (attr_true("renewed", 7)) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + sv = attr("renewed", 7); + if (SvTRUE(sv)) { + (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); } #endif } @@ -364,11 +416,12 @@ CODE: if (size != 4 && invalid_ucs2(ord)) { if (!issurrogate(ord)) { if (ucs2 == -1) { - ucs2 = attr_true("ucs2", 4); + SV *sv = attr("ucs2", 4); + ucs2 = SvTRUE(sv); } if (ucs2 || ord > 0x10FFFF) { if (check) { - croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", + croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high", *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); } enc_pack(aTHX_ result,size,endian,FBCHAR); @@ -394,7 +447,7 @@ CODE: But this is critical when you choose to LEAVE_SRC in which case we die */ if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { - Perl_croak(aTHX_ "%"SVf":partial character is not allowed " + Perl_croak(aTHX_ "%" SVf ":partial character is not allowed " "when CHECK = 0x%" UVuf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); } @@ -408,12 +461,11 @@ CODE: SvCUR_set(utf8,0); } *SvEND(utf8) = '\0'; + SvSETMAGIC(utf8); } if (!temp_result) shrink_buffer(result); if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ - SvSETMAGIC(utf8); - XSRETURN(1); } diff --git a/cpan/Encode/bin/enc2xs b/cpan/Encode/bin/enc2xs index f2a228f68b..bd39639ae8 100644 --- a/cpan/Encode/bin/enc2xs +++ b/cpan/Encode/bin/enc2xs @@ -11,7 +11,7 @@ use warnings; use Getopt::Std; use Config; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 2.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -123,7 +123,10 @@ my %encode_types = (U => \&encode_U, ); # Win32 does not expand globs on command line -eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); +if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) { + eval "\@ARGV = map(glob(\$_),\@ARGV)"; + @ARGV = @orig_ARGV unless @ARGV; +} my %opt; # I think these are: @@ -134,6 +137,8 @@ my %opt; # -o <output> to specify the output file name (else it's the first arg) # -f <inlist> to give a file with a list of input files (else use the args) # -n <name> to name the encoding (else use the basename of the input file. +#Getopt::Long::Configure("bundling"); +#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v)); getopts('CM:SQqOo:f:n:v',\%opt); $opt{M} and make_makefile_pl($opt{M}, @ARGV); @@ -196,9 +201,9 @@ sub compiler_info { # This really should go first, else the die here causes empty (non-erroneous) # output files to be written. my @encfiles; -if (exists $opt{'f'}) { +if (exists $opt{f}) { # -F is followed by name of file containing list of filenames - my $flist = $opt{'f'}; + my $flist = $opt{f}; open(FLIST,$flist) || die "Cannot open $flist:$!"; chomp(@encfiles = <FLIST>); close(FLIST); @@ -206,9 +211,15 @@ if (exists $opt{'f'}) { @encfiles = @ARGV; } -my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); +my $cname = $opt{o} ? $opt{o} : shift(@ARGV); +unless ($cname) { #debuging a win32 nmake error-only. works via cmdline + print "\nARGV:"; + print "$_ " for @ARGV; + print "\nopt:"; + print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt; +} chmod(0666,$cname) if -f $cname && !-w $cname; -open(C,">$cname") || die "Cannot open $cname:$!"; +open(C,">", $cname) || die "Cannot open $cname:$!"; my $dname = $cname; my $hname = $cname; @@ -220,10 +231,10 @@ if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARS $doC = 1; $dname =~ s/(\.[^\.]*)?$/.exh/; chmod(0666,$dname) if -f $cname && !-w $dname; - open(D,">$dname") || die "Cannot open $dname:$!"; + open(D,">", $dname) || die "Cannot open $dname:$!"; $hname =~ s/(\.[^\.]*)?$/.h/; chmod(0666,$hname) if -f $cname && !-w $hname; - open(H,">$hname") || die "Cannot open $hname:$!"; + open(H,">", $hname) || die "Cannot open $hname:$!"; foreach my $fh (\*C,\*D,\*H) { @@ -469,7 +480,9 @@ sub compile_ucm $erep = $attr{'subchar'}; $erep =~ s/^\s+//; $erep =~ s/\s+$//; } - print "Reading $name ($cs)\n"; + print "Reading $name ($cs)\n" + unless defined $ENV{MAKEFLAGS} + and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/; my $nfb = 0; my $hfb = 0; while (<$fh>) @@ -755,9 +768,17 @@ sub addstrings if ($a->{'Forward'}) { my ($cpp, $static, $sized) = compiler_info(1); - my $var = $static ? 'static const' : 'extern'; my $count = $sized ? scalar(@{$a->{'Entries'}}) : ''; - print $fh "$var encpage_t $name\[$count];\n"; + if ($static) { + # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline + print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 + print $fh "extern encpage_t $name\[$count];\n"; + print $fh "#else\n"; + print $fh "static const encpage_t $name\[$count];\n"; + print $fh "#endif\n"; + } else { + print $fh "extern encpage_t $name\[$count];\n"; + } } $a->{'DoneStrings'} = 1; foreach my $b (@{$a->{'Entries'}}) @@ -848,9 +869,16 @@ sub outtable outtable($fh,$t,$bigname) unless $t->{'Done'}; } my ($cpp, $static) = compiler_info(0); - my $var = $static ? 'static const ' : ''; - print $fh "\n${var}encpage_t $name\[", - scalar(@{$a->{'Entries'}}), "] = {\n"; + my $count = scalar(@{$a->{'Entries'}}); + if ($static) { + print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 + print $fh "encpage_t $name\[$count] = {\n"; + print $fh "#else\n"; + print $fh "static const encpage_t $name\[$count] = {\n"; + print $fh "#endif\n"; + } else { + print $fh "\nencpage_t $name\[$count] = {\n"; + } foreach my $b (@{$a->{'Entries'}}) { my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; @@ -1104,7 +1132,7 @@ sub _print_expand{ if ((my $d = dirname($dst)) ne '.'){ -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; } - open my $out, ">$dst" or die "$!"; + open my $out, ">", $dst or die "$!"; my $asis = 0; while (<$in>){ if (/^#### END_OF_HEADER/){ diff --git a/cpan/Encode/encoding.pm b/cpan/Encode/encoding.pm index 754b3acb03..dc342683ee 100644 --- a/cpan/Encode/encoding.pm +++ b/cpan/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.18 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: encoding.pm,v 2.19 2016/11/01 13:30:38 dankogai Exp $ package encoding; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.18 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.19 $ =~ /(\d+)/g; use Encode; use strict; diff --git a/cpan/Encode/lib/Encode/Alias.pm b/cpan/Encode/lib/Encode/Alias.pm index 04ad4967c9..0a252560f5 100644 --- a/cpan/Encode/lib/Encode/Alias.pm +++ b/cpan/Encode/lib/Encode/Alias.pm @@ -2,7 +2,7 @@ package Encode::Alias; use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.20 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.21 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; use Exporter 'import'; @@ -79,8 +79,10 @@ sub find_alias { sub define_alias { while (@_) { - my ( $alias, $name ) = splice( @_, 0, 2 ); - unshift( @Alias, $alias => $name ); # newer one has precedence + my $alias = shift; + my $name = shift; + unshift( @Alias, $alias => $name ) # newer one has precedence + if defined $alias; if ( ref($alias) ) { # clear %Alias cache to allow overrides @@ -96,10 +98,14 @@ sub define_alias { } } } - else { + elsif (defined $alias) { DEBUG and warn "delete \$Alias\{$alias\}"; delete $Alias{$alias}; } + elsif (DEBUG) { + require Carp; + Carp::croak("undef \$alias"); + } } } diff --git a/cpan/Encode/lib/Encode/CN/HZ.pm b/cpan/Encode/lib/Encode/CN/HZ.pm index f035d821f5..4510b0b400 100644 --- a/cpan/Encode/lib/Encode/CN/HZ.pm +++ b/cpan/Encode/lib/Encode/CN/HZ.pm @@ -5,7 +5,7 @@ use warnings; use utf8 (); use vars qw($VERSION); -$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +$VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -49,7 +49,8 @@ sub decode ($$;$) { else { # GB mode; the byte ranges are as in RFC 1843. no warnings 'uninitialized'; if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { - $ret .= $GB->decode( $1, $chk ); + my $prefix = $1; + $ret .= $GB->decode( $prefix, $chk ); } elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' $in_ascii = 1; diff --git a/cpan/Encode/lib/Encode/MIME/Header.pm b/cpan/Encode/lib/Encode/MIME/Header.pm index d74d453b8b..ad14dba374 100644 --- a/cpan/Encode/lib/Encode/MIME/Header.pm +++ b/cpan/Encode/lib/Encode/MIME/Header.pm @@ -1,22 +1,25 @@ package Encode::MIME::Header; use strict; use warnings; -no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.23 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; -use Encode qw(find_encoding encode_utf8 decode_utf8); -use MIME::Base64; -use Carp; +our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Carp (); +use Encode (); +use MIME::Base64 (); my %seed = ( - decode_b => '1', # decodes 'B' encoding ? - decode_q => '1', # decodes 'Q' encoding ? - encode => 'B', # encode with 'B' or 'Q' ? - bpl => 75, # bytes per line + decode_b => 1, # decodes 'B' encoding ? + decode_q => 1, # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + charset => 'UTF-8', # encode charset + bpl => 75, # bytes per line ); -$Encode::Encoding{'MIME-Header'} = - bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; +$Encode::Encoding{'MIME-Header'} = bless { + %seed, + Name => 'MIME-Header', +} => __PACKAGE__; $Encode::Encoding{'MIME-B'} = bless { %seed, @@ -37,107 +40,186 @@ sub needs_lines { 1 } sub perlio_ok { 0 } # RFC 2047 and RFC 2231 grammar -my $re_charset = qr/[-0-9A-Za-z_]+/; -my $re_language = qr/[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*/; +my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/; +my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/; my $re_encoding = qr/[QqBb]/; -my $re_encoded_text = qr/[^\?\s]*/; +my $re_encoded_text = qr/[^\?]*/; my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/; -my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/; +my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/; +my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/; + +# in strict mode check also for valid base64 characters and also for valid quoted printable codes +my $re_encoding_strict_b = qr/[Bb]/; +my $re_encoding_strict_q = qr/[Qq]/; +my $re_encoded_text_strict_b = qr/[0-9A-Za-z\+\/]*={0,2}/; +my $re_encoded_text_strict_q = qr/(?:[^\?\s=]|=[0-9A-Fa-f]{2})*/; +my $re_encoded_word_strict = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/; +my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/; + +my $re_newline = qr/(?:\r\n|[\r\n])/; + +# in strict mode encoded words must be always separated by spaces or tabs (or folded newline) +# except in comments when separator between words and comment round brackets can be omitted +my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/; +my $re_word_sep_strict = qr/(?:$re_newline?[ \t])+/; +my $re_word_end_strict = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/; + +my $re_match = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/; +my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/; + +my $re_capture = qr/$re_capture_encoded_word(?:\s*)?/; +my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/; our $STRICT_DECODE = 0; sub decode($$;$) { - use utf8; - my ( $obj, $str, $chk ) = @_; + my ($obj, $str, $chk) = @_; - # multi-line header to single line - $str =~ s/(?:\r\n|[\r\n])([ \t])/$1/gos; + my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match; + my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture; - # decode each line separately - my @input = split /(\r\n|\r|\n)/o, $str; + my $stop = 0; my $output = substr($str, 0, 0); # to propagate taintedness - while ( @input ) { + # decode each line separately, match whole continuous folded line at one call + 1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{ - my $line = shift @input; - my $sep = shift @input; + my $line = $1; + my $sep = defined $2 ? $2 : ''; - # in strict mode encoded words must be always separated by spaces or tabs - # except in comments when separator between words and comment round brackets can be omitted - my $re_word_begin = $STRICT_DECODE ? qr/(?:[ \t\n]|\A)\(?/ : qr//; - my $re_word_sep = $STRICT_DECODE ? qr/[ \t]+/ : qr/\s*/; - my $re_word_end = $STRICT_DECODE ? qr/\)?(?:[ \t\n]|\z)/ : qr//; + $stop = 1 unless length($line) or length($sep); - # concat consecutive encoded mime words with same charset, language and encoding + # NOTE: this code partially could break $chk support + # in non strict mode concat consecutive encoded mime words with same charset, language and encoding # fixes breaking inside multi-byte characters - 1 while $line =~ s/($re_word_begin)$re_capture_encoded_word$re_word_sep=\?\2\3\?\4\?($re_encoded_text)\?=(?=$re_word_end)/$1=\?$2$3\?$4\?$5$6\?=/; - - $line =~ s{($re_word_begin)((?:$re_encoded_word$re_word_sep)*$re_encoded_word)(?=$re_word_end)}{ - my $begin = $1; - my $words = $2; - $words =~ s{$re_capture_encoded_word$re_word_sep?}{ - if (uc($3) eq 'B') { - $obj->{decode_b} or croak qq(MIME "B" unsupported); - decode_b($1, $4, $chk); - } elsif (uc($3) eq 'Q') { - $obj->{decode_q} or croak qq(MIME "Q" unsupported); - decode_q($1, $4, $chk); + 1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so; + + # process sequence of encoded MIME words at once + 1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{ + + my $begin = $1 . $2; + my $words = $3; + + $begin =~ tr/\r\n//d; + $output .= $begin; + + # decode one MIME word + 1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{ + + $output .= $1; + my $orig = $2; + my $charset = $3; + my ($mime_enc, $text) = split /\?/, $5; + + $text =~ tr/\r\n//d; + + my $enc = Encode::find_mime_encoding($charset); + + # in non strict mode allow also perl encoding aliases + if ( not defined $enc and not $STRICT_DECODE ) { + # make sure that decoded string will be always strict UTF-8 + $charset = 'UTF-8' if lc($charset) eq 'utf8'; + $enc = Encode::find_encoding($charset); + } + + if ( not defined $enc ) { + Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk & Encode::DIE_ON_ERR; + Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk & Encode::WARN_ON_ERR; + $stop = 1 if not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace + $stop ? $orig : ''; } else { - croak qq(MIME "$3" encoding is nonexistent!); + if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) { + my $decoded = _decode_b($enc, $text, $chk); + $stop = 1 if not defined $decoded and not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= (defined $decoded ? $decoded : $text) unless $stop; + $stop ? $orig : ''; + } elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) { + my $decoded = _decode_q($enc, $text, $chk); + $stop = 1 if not defined $decoded and not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= (defined $decoded ? $decoded : $text) unless $stop; + $stop ? $orig : ''; + } else { + Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk & Encode::DIE_ON_ERR; + Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk & Encode::WARN_ON_ERR; + $stop = 1 if not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace + $stop ? $orig : ''; + } } - }eg; - $begin . $words; - }eg; - $output .= $line; - $output .= $sep if defined $sep; + }se; - } + if ( not $stop ) { + $output .= $words; + $words = ''; + } + + $words; + + }se; + + if ( not $stop ) { + $line =~ tr/\r\n//d; + $output .= $line . $sep; + $line = ''; + $sep = ''; + } + + $line . $sep; - $_[1] = '' if $chk; # empty the input string in the stack so perlio is ok + }se; + + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return $output; } -sub decode_b { - my ( $enc, $b, $chk ) = @_; - my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); - # MIME::Base64::decode_base64 ignores everything after a '=' padding character - # split string after each sequence of padding characters and decode each substring - my $db64 = join('', map { decode_base64($_) } split /(?<==)(?=[^=])/, $b); - return $d->name eq 'utf8' - ? Encode::decode_utf8($db64) - : $d->decode( $db64, $chk || Encode::FB_PERLQQ ); +sub _decode_b { + my ($enc, $text, $chk) = @_; + # MIME::Base64::decode ignores everything after a '=' padding character + # in non strict mode split string after each sequence of padding characters and decode each substring + my $octets = $STRICT_DECODE ? + MIME::Base64::decode($text) : + join('', map { MIME::Base64::decode($_) } split /(?<==)(?=[^=])/, $text); + return _decode_octets($enc, $octets, $chk); +} + +sub _decode_q { + my ($enc, $text, $chk) = @_; + $text =~ s/_/ /go; + $text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/ego; + return _decode_octets($enc, $text, $chk); } -sub decode_q { - my ( $enc, $q, $chk ) = @_; - my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); - $q =~ s/_/ /go; - $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; - return $d->name eq 'utf8' - ? Encode::decode_utf8($q) - : $d->decode( $q, $chk || Encode::FB_PERLQQ ); +sub _decode_octets { + my ($enc, $octets, $chk) = @_; + $chk &= ~Encode::LEAVE_SRC if not ref $chk and $chk; + local $Carp::CarpLevel = $Carp::CarpLevel + 1; # propagate Carp messages back to caller + my $output = $enc->decode($octets, $chk); + return undef if not ref $chk and $chk and $octets ne ''; + return $output; } sub encode($$;$) { - my ( $obj, $str, $chk ) = @_; - $_[1] = '' if $chk; # empty the input string in the stack so perlio is ok - return $obj->_fold_line($obj->_encode_line($str)); + my ($obj, $str, $chk) = @_; + my $output = $obj->_fold_line($obj->_encode_string($str, $chk)); + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); + return $output . substr($str, 0, 0); # to propagate taintedness } sub _fold_line { - my ( $obj, $line ) = @_; + my ($obj, $line) = @_; my $bpl = $obj->{bpl}; - my $output = substr($line, 0, 0); # to propagate taintedness + my $output = ''; - while ( length $line ) { + while ( length($line) ) { if ( $line =~ s/^(.{0,$bpl})(\s|\z)// ) { $output .= $1; - $output .= "\r\n" . $2 if length $line; + $output .= "\r\n" . $2 if length($line); } elsif ( $line =~ s/(\s)(.*)$// ) { $output .= $line; $line = $2; - $output .= "\r\n" . $1 if length $line; + $output .= "\r\n" . $1 if length($line); } else { $output .= $line; last; @@ -147,56 +229,75 @@ sub _fold_line { return $output; } -use constant HEAD => '=?UTF-8?'; -use constant TAIL => '?='; -use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, B_len => \&_encode_b_len, Q_len => \&_encode_q_len }; - -sub _encode_line { - my ( $o, $str ) = @_; - my $enc = $o->{encode}; - my $enc_len = $enc . '_len'; - my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); - +sub _encode_string { + my ($obj, $str, $chk) = @_; + my $wordlen = $obj->{bpl} > 76 ? 76 : $obj->{bpl}; + my $enc = Encode::find_mime_encoding($obj->{charset}); + my $enc_chk = (not ref $chk and $chk) ? ($chk | Encode::LEAVE_SRC) : $chk; my @result = (); - my $chunk = ''; - while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { - if ( SINGLE->{$enc_len}($chunk . $chr) > $llen ) { - push @result, SINGLE->{$enc}($chunk); - $chunk = ''; + my $octets = ''; + while ( length( my $chr = substr($str, 0, 1, '') ) ) { + my $seq; + { + local $Carp::CarpLevel = $Carp::CarpLevel + 1; # propagate Carp messages back to caller + $seq = $enc->encode($chr, $enc_chk); } - $chunk .= $chr; + if ( not length($seq) ) { + substr($str, 0, 0, $chr); + last; + } + if ( $obj->_encoded_word_len($octets . $seq) > $wordlen ) { + push @result, $obj->_encode_word($octets); + $octets = ''; + } + $octets .= $seq; } - length($chunk) and push @result, SINGLE->{$enc}($chunk); + length($octets) and push @result, $obj->_encode_word($octets); + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return join(' ', @result); } +sub _encode_word { + my ($obj, $octets) = @_; + my $charset = $obj->{charset}; + my $encode = $obj->{encode}; + my $text = $encode eq 'B' ? _encode_b($octets) : _encode_q($octets); + return "=?$charset?$encode?$text?="; +} + +sub _encoded_word_len { + my ($obj, $octets) = @_; + my $charset = $obj->{charset}; + my $encode = $obj->{encode}; + my $text_len = $encode eq 'B' ? _encoded_b_len($octets) : _encoded_q_len($octets); + return length("=?$charset?$encode??=") + $text_len; +} + sub _encode_b { - HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; + my ($octets) = @_; + return MIME::Base64::encode($octets, ''); } -sub _encode_b_len { - my ( $chunk ) = @_; - use bytes (); - return bytes::length($chunk) * 4 / 3; +sub _encoded_b_len { + my ($octets) = @_; + return ( length($octets) + 2 ) / 3 * 4; } -my $valid_q_chars = '0-9A-Za-z !*+\-/'; +my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/; sub _encode_q { - my ( $chunk ) = @_; - $chunk = encode_utf8($chunk); - $chunk =~ s{([^$valid_q_chars])}{ - join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + my ($octets) = @_; + $octets =~ s{($re_invalid_q_char)}{ + join('', map { sprintf('=%02X', $_) } unpack('C*', $1)) }egox; - $chunk =~ s/ /_/go; - return HEAD . 'Q?' . $chunk . TAIL; + $octets =~ s/ /_/go; + return $octets; } -sub _encode_q_len { - my ( $chunk ) = @_; - use bytes (); - my $valid_count =()= $chunk =~ /[$valid_q_chars]/sgo; - return ( bytes::length($chunk) - $valid_count ) * 3 + $valid_count; +sub _encoded_q_len { + my ($octets) = @_; + my $invalid_count = () = $octets =~ /$re_invalid_q_char/sgo; + return ( $invalid_count * 3 ) + ( length($octets) - $invalid_count ); } 1; @@ -204,75 +305,119 @@ __END__ =head1 NAME -Encode::MIME::Header -- MIME 'B' and 'Q' encoding for unstructured header +Encode::MIME::Header -- MIME encoding for an unstructured email header =head1 SYNOPSIS - use Encode qw/encode decode/; - $utf8 = decode('MIME-Header', $header); - $header = encode('MIME-Header', $utf8); - -=head1 ABSTRACT - -This module implements RFC 2047 MIME encoding for unstructured header. -It cannot be used for structured headers like From or To. There are 3 -variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The -difference is described below + use Encode qw(encode decode); - decode() encode() - ---------------------------------------------- - MIME-Header Both B and Q =?UTF-8?B?....?= - MIME-B B only; Q croaks =?UTF-8?B?....?= - MIME-Q Q only; B croaks =?UTF-8?Q?....?= + my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}"); + # $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?=" -=head1 DESCRIPTION + my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}"); + # $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?=" -When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> -is extracted and decoded for I<X> encoding (B for Base64, Q for -Quoted-Printable). Then the decoded chunk is fed to -decode(I<encoding>). So long as I<encoding> is supported by Encode, -any source encoding is fine. + my $str = decode("MIME-Header", + "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " . + "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=" + ); + # $str is "If you can read this you understand the example." -When you encode, it just encodes UTF-8 string with I<X> encoding then -quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to -encode are left as is and long lines are folded within 76 bytes per -line. + use Encode qw(decode :fallbacks); + use Encode::MIME::Header; + local $Encode::MIME::Header::STRICT_DECODE = 1; + my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK); + # use strict decoding and croak on errors -=head1 BUGS - -Before version 2.83 this module had broken both decoder and encoder. -Encoder inserted additional spaces, incorrectly encoded input data -and produced invalid MIME strings. Decoder lot of times discarded -white space characters, incorrectly interpreted data or decoded -Base64 string as Quoted-Printable. +=head1 ABSTRACT -As of version 2.83 encoder should be fully compliant of RFC 2047. -Due to bugs in previous versions of encoder, decoder is by default in -less strict compatible mode. It should be able to decode strings -encoded by pre 2.83 version of this module. But this default mode is -not correct according to RFC 2047. +This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME +encoding for an unstructured field body of the email header. It can also be +used for L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token. However, +it cannot be used directly for the whole header with the field name or for the +structured header fields like From, To, Cc, Message-Id, etc... There are 3 +encoding names supported by this module: C<MIME-Header>, C<MIME-B> and +C<MIME-Q>. -In default mode decoder try to decode every substring which looks like -MIME encoded data. So it means that MIME data does not need to be -separated by white space. To enforce correct strict mode, set package -variable $Encode::MIME::Header::STRICT_DECODE to 1, e.g. by localizing: +=head1 DESCRIPTION -C<require Encode::MIME::Header; local $Encode::MIME::Header::STRICT_DECODE = 1;> +Decode method takes an unstructured field body of the email header (or +L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and +decodes each MIME encoded-word from input string to a sequence of bytes +according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and +L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Subsequently, each sequence +of bytes with the corresponding MIME charset is decoded with +L<the Encode module|Encode> and finally, one output string is returned. Text +parts of the input string which do not contain MIME encoded-word stay +unmodified in the output string. Folded newlines between two consecutive MIME +encoded-words are discarded, others are preserved in the output string. +C<MIME-B> can decode Base64 variant, C<MIME-Q> can decode Quoted-Printable +variant and C<MIME-Header> can decode both of them. If L<Encode module|Encode> +does not support particular MIME charset or chosen variant then an action based +on L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the +MIME encoded-word is not decoded). + +Encode method takes a scalar string as its input and uses +L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8 +bytes. Then a sequence of UTF-8 bytes is encoded into MIME encoded-words +(C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a +Quoted-Printable variant) where each MIME encoded-word is limited to 75 +characters. MIME encoded-words are separated by C<CRLF SPACE> and joined to +one output string. Output string is suitable for unstructured field body of +the email header. + +Both encode and decode methods propagate +L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the +MIME charset. -It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? -and =?ISO-8859-1?= but that makes the implementation too complicated. -These days major mail agents all support =?UTF-8? so I think it is -just good enough. +=head1 BUGS -Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by -Makamaka. Thre are still too many MUAs especially cellular phone -handsets which does not grok UTF-8. +Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder +and encoder. The MIME encoder infamously inserted additional spaces or +discarded white spaces between consecutive MIME encoded-words, which led to +invalid MIME headers produced by this module. The MIME decoder had a tendency +to discard white spaces, incorrectly interpret data or attempt to decode Base64 +MIME encoded-words as Quoted-Printable. These problems were fixed in version +2.22. It is highly recommended not to use any version prior 2.22! + +Versions prior to 2.24 (part of Encode 2.87) ignored +L<CHECK flags|Encode/Handling Malformed Data>. The MIME encoder used +L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode +strings which could lead to invalid UTF-8 sequences. MIME decoder used also +L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and additionally +called the decode method with a C<Encode::FB_PERLQQ> flag (thus user-specified +L<CHECK flags|Encode/Handling Malformed Data> were ignored). Moreover, it +automatically croaked when a MIME encoded-word contained unknown encoding. +Since version 2.24, this module uses +L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder. And +L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated. + +Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully +compliant to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and +L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Due to the aforementioned +bugs in previous versions of the MIME encoder, there is a I<less strict> +compatible mode for the MIME decoder which is used by default. It should be +able to decode MIME encoded-words encoded by pre 2.22 versions of this module. +However, note that this is not correct according to +L<RFC 2047|https://tools.ietf.org/html/rfc2047>. **** PATCH TRUNCATED AT 2000 LINES -- 1315 NOT SHOWN **** -- Perl5 Master Repository
