In perl.git, the branch smoke-me/khw-encode has been updated <http://perl5.git.perl.org/perl.git/commitdiff/9c03449800417dd02cc1af613951a1002490a52a?hp=655eab69b102d09f7e6643cbb78877097b23d994>
- Log ----------------------------------------------------------------- commit 9c03449800417dd02cc1af613951a1002490a52a Author: Karl Williamson <k...@cpan.org> Date: Tue Aug 30 21:26:17 2016 -0600 squash ----------------------------------------------------------------------- Summary of changes: cpan/Encode/Encode.xs | 81 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs index c4d061c..8c029c1 100644 --- a/cpan/Encode/Encode.xs +++ b/cpan/Encode/Encode.xs @@ -315,7 +315,7 @@ strict_utf8(pTHX_ SV* sv) static bool is_strict_utf8_string_loc(const U8* const s, STRLEN const len, const U8 **ep) { - /* Returns a boolean giving whether or not the input string from 's' to + /* Returns a boolean giving whether or not the input string from 's' to * ('s' + 'len' - 1) is well-formed UTF-8 that is entirely Unicode code * points that aren't surrogates nor non-character code points. *ep is set * to point to 1 byte beyond the end of the final valid input character */ @@ -342,6 +342,79 @@ is_strict_utf8_string_loc(const U8* const s, STRLEN const len, const U8 **ep) return TRUE; } +static bool +is_strict_utf8_valid_partial_char(const U8* const s, const U8 * const e) +{ + /* Returns a boolean giving whether or not the input string from 's' to + * ('e' - 1) is the initial, but incomplete, sequence of well-formed UTF-8 + * for some Unicode code point, as far as it goes. + * + * It also checks that if the sequence were to be followed by any valid + * UTF-8 to form a complete character, that that character could not be a + * surrogate nor non-Unicode code point. Only the first one or two bytes + * of the UTF-8 representation is needed to make this determination. To + * determine if the UTF-8 could be a non-character, a complete sequence is + * required, so that is not done here. Hence if the sequence, when + * completed, could be something other than a surrogate, non-character code + * point, or above-Unicode code point, TRUE is returned. + */ + + if (! is_utf8_valid_partial_char(s, e)) { + return FALSE; + } + + /* The code below is derived from this table. Keep in mind that legal + * continuation bytes range between \x80..\xBF for UTF-8, and \xA0..\xBF + * for I8. + * UTF-8 UTF-EBCDIC I8 + * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate + * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate + * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode + */ + +#ifndef EBCDIC + + if (UNLIKELY(*s > 0xF4)) { + return FALSE; /* Above UTF-8 */ + } + + if (e - s > 1) { + if (UNLIKELY(*s == 0xF4 && *(s + 1) >= 0x90)) { + return FALSE; /* Above UTF-8 */ + } + + if (UNLIKELY(*s == 0xED && *(s + 1) >= 0xA0)) { + return FALSE; /* Surrogate */ + } + } + +#else + + { + const U8 s0 = NATIVE_UTF8_TO_I8(*s); + + if (UNLIKELY(s0 > 0xF9)) { + return FALSE; /* Above UTF-8 */ + } + + if (e - s > 1) { + if (UNLIKELY(s0 == 0xF9 && NATIVE_UTF8_TO_I8(*(s + 1)) >= 0xA2)) { + return FALSE; /* Above UTF-8 */ + } + + if (UNLIKELY(s0 == 0xF1 && ( NATIVE_UTF8_TO_I8(*(s + 1)) = 0xB6 + || NATIVE_UTF8_TO_I8(*(s + 1)) = 0xB7))) + { + return FALSE; /* Surrogate */ + } + } + } + +#endif + + return TRUE; +} + static U8* process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, bool encode, bool strict, bool stop_at_partial) @@ -392,7 +465,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, bool valid = (strict) ? is_strict_utf8_string_loc(s, e - s, &e_or_where_failed) - : is_utf8_string_loc (s, e - s, &e_or_where_failed); + : is_utf8_string_loc(s, e - s, &e_or_where_failed); STRLEN len = e_or_where_failed - s; Move(s, d, len, U8); @@ -401,7 +474,9 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, if ( LIKELY(valid) || ( (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) - && is_utf8_valid_partial_char(s, e))) + && ((strict) + ? is_strict_utf8_valid_partial_char(s, e) + : is_utf8_valid_partial_char(s, e)))) { break; } -- Perl5 Master Repository