In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/c31ca2013f287840fcddf498ead9602666569966?hp=c16e7f98327a78a23d0eba94da62bf70782165ae>
- Log ----------------------------------------------------------------- commit c31ca2013f287840fcddf498ead9602666569966 Author: Steve Hay <steve.m....@googlemail.com> Date: Mon Feb 12 20:37:36 2018 +0000 Upgrade Encode from version 2.94 to 2.96 ----------------------------------------------------------------------- Summary of changes: Porting/Maintainers.pl | 2 +- cpan/Encode/Encode.pm | 4 +- cpan/Encode/Encode.xs | 192 +++++++++++++++++++++++++++++------------ cpan/Encode/Encode/encode.h | 7 +- cpan/Encode/Unicode/Unicode.pm | 2 +- cpan/Encode/Unicode/Unicode.xs | 6 +- cpan/Encode/encengine.c | 84 +++++++++--------- cpan/Encode/encoding.pm | 4 +- 8 files changed, 194 insertions(+), 107 deletions(-) diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index 29b0472ed5..6a33bb24f0 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -394,7 +394,7 @@ use File::Glob qw(:case); }, 'Encode' => { - 'DISTRIBUTION' => 'DANKOGAI/Encode-2.94.tar.gz', + 'DISTRIBUTION' => 'DANKOGAI/Encode-2.96.tar.gz', 'FILES' => q[cpan/Encode], }, diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm index 249ac6b138..c1de56100f 100644 --- a/cpan/Encode/Encode.pm +++ b/cpan/Encode/Encode.pm @@ -1,5 +1,5 @@ # -# $Id: Encode.pm,v 2.94 2018/01/09 05:53:00 dankogai Exp dankogai $ +# $Id: Encode.pm,v 2.96 2018/02/11 05:32:30 dankogai Exp $ # package Encode; use strict; @@ -7,7 +7,7 @@ use warnings; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; our $VERSION; BEGIN { - $VERSION = sprintf "%d.%02d", q$Revision: 2.94 $ =~ /(\d+)/g; + $VERSION = sprintf "%d.%02d", q$Revision: 2.96 $ =~ /(\d+)/g; require XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); } diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs index 6c077bec3a..bc4a77d6d2 100644 --- a/cpan/Encode/Encode.xs +++ b/cpan/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.41 2017/06/10 17:23:50 dankogai Exp $ + $Id: Encode.xs,v 2.42 2018/02/08 00:26:15 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -35,6 +35,13 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) #define SvIV_nomg SvIV #endif +#ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE +# define UTF8_DISALLOW_ILLEGAL_INTERCHANGE 0 +# define UTF8_ALLOW_NON_STRICT (UTF8_ALLOW_FE_FF|UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) +#else +# define UTF8_ALLOW_NON_STRICT 0 +#endif + static void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -177,65 +184,66 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * if (offset) { s += *offset; if (slen > *offset){ /* safeguard against slen overflow */ - slen -= *offset; + slen -= *offset; }else{ - slen = 0; + slen = 0; } tlen = slen; } if (slen == 0){ - SvCUR_set(dst, 0); - SvPOK_only(dst); - goto ENCODE_END; + SvCUR_set(dst, 0); + SvPOK_only(dst); + goto ENCODE_END; } while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, - trm, trmlen)) ) + trm, trmlen)) ) { - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || - code == ENCODE_FOUND_TERM) { - break; - } - switch (code) { - case ENCODE_NOSPACE: - { - STRLEN more = 0; /* make sure you initialize! */ - STRLEN sleft; - sdone += slen; - ddone += dlen; - sleft = tlen - sdone; + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { + break; + } + switch (code) { + case ENCODE_NOSPACE: + { + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; + sdone += slen; + ddone += dlen; + sleft = tlen - sdone; #if ENCODE_XS_PROFILE >= 2 - Perl_warn(aTHX_ - "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", - more, sdone, sleft, SvLEN(dst)); + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); #endif - if (sdone != 0) { /* has src ever been processed ? */ + if (sdone != 0) { /* has src ever been processed ? */ #if ENCODE_XS_USEFP == 2 - more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - - SvLEN(dst); + more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone + - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); #else - /* safe until SvLEN(dst) == MAX_INT/16 */ - more = (16*SvLEN(dst)+1)/sdone/16 * sleft; + /* safe until SvLEN(dst) == MAX_INT/16 */ + more = (16*SvLEN(dst)+1)/sdone/16 * sleft; #endif + } + more += UTF8_MAXLEN; /* insurance policy */ + d = (U8 *) SvGROW(dst, SvLEN(dst) + more); + /* dst need to grow need MORE bytes! */ + if (ddone >= SvLEN(dst)) { + Perl_croak(aTHX_ "Destination couldn't be grown."); + } + dlen = SvLEN(dst)-ddone-1; + d += ddone; + s += slen; + slen = tlen-sdone; + continue; } - more += UTF8_MAXLEN; /* insurance policy */ - d = (U8 *) SvGROW(dst, SvLEN(dst) + more); - /* dst need to grow need MORE bytes! */ - if (ddone >= SvLEN(dst)) { - Perl_croak(aTHX_ "Destination couldn't be grown."); - } - dlen = SvLEN(dst)-ddone-1; - d += ddone; - s += slen; - slen = tlen-sdone; - continue; - } + case ENCODE_NOREP: /* encoding */ if (dir == enc->f_utf8) { @@ -319,18 +327,18 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * } /* settle variables when fallback */ d = (U8 *)SvEND(dst); - dlen = SvLEN(dst) - ddone - 1; + dlen = SvLEN(dst) - ddone - 1; s = (U8*)SvPVX(src) + sdone; slen = tlen - sdone; break; - default: - Perl_croak(aTHX_ "Unexpected code %d converting %s %s", - code, (dir == enc->f_utf8) ? "to" : "from", - enc->name[0]); - return &PL_sv_undef; - } - } + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from", + enc->name[0]); + return &PL_sv_undef; + } + } /* End of looping through the string */ ENCODE_SET_SRC: if (check && !(check & ENCODE_LEAVE_SRC)){ sdone = SvCUR(src) - (slen+sdone); @@ -354,7 +362,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * } #endif - if (offset) + if (offset) *offset += sdone + slen; ENCODE_END: @@ -378,6 +386,13 @@ strict_utf8(pTHX_ SV* sv) return SvTRUE(*svp); } +/* Modern perls have the capability to do this more efficiently and portably */ +#ifdef is_utf8_string_loc_flags +# define CAN_USE_BASE_PERL +#endif + +#ifndef CAN_USE_BASE_PERL + /* * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126 */ @@ -433,10 +448,27 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) return uv; } +#endif /* CAN_USE_BASE_PERL */ + static U8* process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, bool encode, bool strict, bool stop_at_partial) { + /* Copies the purportedly UTF-8 encoded string starting at 's' and ending + * at 'e' - 1 to 'dst', checking as it goes along that the string actually + * is valid UTF-8. There are two levels of strictness checking. If + * 'strict' is FALSE, the string is checked for being well-formed UTF-8, as + * extended by Perl. Additionally, if 'strict' is TRUE, above-Unicode code + * points, surrogates, and non-character code points are checked for. When + * invalid input is encountered, some action is taken, exactly what depends + * on the flags in 'check_sv'. 'encode' gives if this is from an encode + * operation (if TRUE), or a decode one. This function returns the + * position in 's' of the start of the next character beyond where it got + * to. If there were no problems, that will be 'e'. If 'stop_at_partial' + * is TRUE, if the final character before 'e' is incomplete, but valid as + * far as is available, no action will be taken on that partial character, + * and the return value will point to its first byte */ + UV uv; STRLEN ulen; SV *fallback_cb; @@ -445,6 +477,9 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, STRLEN dlen; char esc[UTF8_MAXLEN * 6 + 1]; STRLEN i; + const U32 flags = (strict) + ? UTF8_DISALLOW_ILLEGAL_INTERCHANGE + : UTF8_ALLOW_NON_STRICT; if (SvROK(check_sv)) { /* croak("UTF-8 decoder doesn't support callback CHECK"); */ @@ -462,7 +497,44 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, dlen = (s && e && s < e) ? e-s+1 : 1; d = (U8 *) SvGROW(dst, dlen); + stop_at_partial = stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL); + while (s < e) { + +#ifdef CAN_USE_BASE_PERL /* Use the much faster, portable implementation if + available */ + + /* If there were no errors, this will be 'e'; otherwise it will point + * to the first byte of the erroneous input */ + const U8* e_or_where_failed; + bool valid = is_utf8_string_loc_flags(s, e - s, &e_or_where_failed, flags); + STRLEN len = e_or_where_failed - s; + + /* Copy as far as was successful */ + Move(s, d, len, U8); + d += len; + s = (U8 *) e_or_where_failed; + + /* Are done if it was valid, or we are accepting partial characters and + * the only error is that the final bytes form a partial character */ + if ( LIKELY(valid) + || ( stop_at_partial + && is_utf8_valid_partial_char_flags(s, e, flags))) + { + break; + } + + /* Here, was not valid. If is 'strict', and is legal extended UTF-8, + * we know it is a code point whose value we can calculate, just not + * one accepted under strict. Otherwise, it is malformed in some way. + * In either case, the system function can calculate either the code + * point, or the best substitution for it */ + uv = utf8n_to_uvchr(s, e - s, &ulen, UTF8_ALLOW_ANY); + +#else /* Use code for earlier perls */ + + PERL_UNUSED_VAR(flags); + if (UTF8_IS_INVARIANT(*s)) { *d++ = *s++; continue; @@ -483,7 +555,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, else ulen = 1; - if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == (STRLEN)(e-s)) + if (stop_at_partial && ulen == (STRLEN)(e-s)) break; goto malformed_byte; @@ -512,6 +584,16 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, ulen = 1; malformed: + +#endif /* The two versions for processing come back together here, for the + * error handling code. + * + * Here, we are looping through the input and found an error. + * 'uv' is the code point in error if calculable, or the REPLACEMENT + * CHARACTER if not. + * 'ulen' is how many bytes of input this iteration of the loop + * consumes */ + if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ))) for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]); if (check & ENCODE_DIE_ON_ERR){ @@ -617,7 +699,7 @@ PPCODE: utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); e = s+slen; - /* + /* * PerlIO check -- we assume the object is of PerlIO if renewed */ ENTER; SAVETMPS; @@ -627,7 +709,7 @@ PPCODE: if (call_method("renewed",G_SCALAR) == 1) { SPAGAIN; renewed = (bool)POPi; - PUTBACK; + PUTBACK; #if 0 fprintf(stderr, "renewed == %d\n", renewed); #endif diff --git a/cpan/Encode/Encode/encode.h b/cpan/Encode/Encode/encode.h index 5fbcf76ad3..df5554f1cb 100644 --- a/cpan/Encode/Encode/encode.h +++ b/cpan/Encode/Encode/encode.h @@ -88,7 +88,12 @@ extern void Encode_DefineEncoding(encode_t *enc); #define ENCODE_FALLBACK 4 #define ENCODE_FOUND_TERM 5 -#define FBCHAR_UTF8 "\xEF\xBF\xBD" +/* Use the perl core value if available; it is portable to EBCDIC */ +#ifdef REPLACEMENT_CHARACTER_UTF8 +# define FBCHAR_UTF8 REPLACEMENT_CHARACTER_UTF8 +#else +# define FBCHAR_UTF8 "\xEF\xBF\xBD" +#endif #define ENCODE_DIE_ON_ERR 0x0001 /* croaks immediately */ #define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */ diff --git a/cpan/Encode/Unicode/Unicode.pm b/cpan/Encode/Unicode/Unicode.pm index c56745d7b1..2a8b477784 100644 --- a/cpan/Encode/Unicode/Unicode.pm +++ b/cpan/Encode/Unicode/Unicode.pm @@ -3,7 +3,7 @@ package Encode::Unicode; use strict; use warnings; -our $VERSION = do { my @r = ( q$Revision: 2.16 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.17 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); diff --git a/cpan/Encode/Unicode/Unicode.xs b/cpan/Encode/Unicode/Unicode.xs index b3b1d2fea8..b459786d16 100644 --- a/cpan/Encode/Unicode/Unicode.xs +++ b/cpan/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.16 2017/06/10 17:23:50 dankogai Exp $ + $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -315,7 +315,7 @@ CODE: resultbuflen = SvLEN(result); } - d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, + d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord, UNICODE_WARN_ILLEGAL_INTERCHANGE); SvCUR_set(result, d - (U8 *)SvPVX(result)); } @@ -407,7 +407,7 @@ CODE: } while (s < e && s+UTF8SKIP(s) <= e) { STRLEN len; - UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE + UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE |UTF8_WARN_SURROGATE |UTF8_DISALLOW_FE_FF |UTF8_WARN_FE_FF diff --git a/cpan/Encode/encengine.c b/cpan/Encode/encengine.c index bddf556b35..67613a89e3 100644 --- a/cpan/Encode/encengine.c +++ b/cpan/Encode/encengine.c @@ -102,56 +102,56 @@ do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, U8 *dend = d + dlen, *dlast = d; int code = 0; while (s < send) { - const encpage_t *e = enc; - U8 byte = *s; - while (byte > e->max) - e++; - if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { - const U8 *cend = s + (e->slen & 0x7f); - if (cend <= send) { - STRLEN n; - if ((n = e->dlen)) { - const U8 *out = e->seq + n * (byte - e->min); - U8 *oend = d + n; - if (dst) { - if (oend <= dend) { - while (d < oend) - *d++ = *out++; + const encpage_t *e = enc; + U8 byte = *s; + while (byte > e->max) + e++; + if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { + const U8 *cend = s + (e->slen & 0x7f); + if (cend <= send) { + STRLEN n; + if ((n = e->dlen)) { + const U8 *out = e->seq + n * (byte - e->min); + U8 *oend = d + n; + if (dst) { + if (oend <= dend) { + while (d < oend) + *d++ = *out++; + } + else { + /* Out of space */ + code = ENCODE_NOSPACE; + break; + } + } + else + d = oend; + } + enc = e->next; + s++; + if (s == cend) { + if (approx && (e->slen & 0x80)) + code = ENCODE_FALLBACK; + last = s; + if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { + code = ENCODE_FOUND_TERM; + break; + } + dlast = d; + } } else { - /* Out of space */ - code = ENCODE_NOSPACE; + /* partial source character */ + code = ENCODE_PARTIAL; break; } - } - else - d = oend; - } - enc = e->next; - s++; - if (s == cend) { - if (approx && (e->slen & 0x80)) - code = ENCODE_FALLBACK; - last = s; - if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { - code = ENCODE_FOUND_TERM; - break; - } - dlast = d; - } } else { - /* partial source character */ - code = ENCODE_PARTIAL; - break; + /* Cannot represent */ + code = ENCODE_NOREP; + break; } } - else { - /* Cannot represent */ - code = ENCODE_NOREP; - break; - } - } *slen = last - src; *dout = d - dst; return code; diff --git a/cpan/Encode/encoding.pm b/cpan/Encode/encoding.pm index 1e82070afd..c3f324d29f 100644 --- a/cpan/Encode/encoding.pm +++ b/cpan/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.21 2017/10/06 22:21:53 dankogai Exp dankogai $ +# $Id: encoding.pm,v 2.22 2018/02/11 05:32:03 dankogai Exp $ package encoding; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.21 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.22 $ =~ /(\d+)/g; use Encode; use strict; -- Perl5 Master Repository