In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/14b9bbbc5b192cd6b85fd4ab2aff99bb6471fb0c?hp=8b78e93627afc5201875e37a37a7b27874b874c5>
- Log ----------------------------------------------------------------- commit 14b9bbbc5b192cd6b85fd4ab2aff99bb6471fb0c Author: Karl Williamson <[email protected]> Date: Sat Feb 11 13:07:33 2017 -0700 pp_pack.c: Remove no longer relevant comment M pp_pack.c commit df3377142c0886d6a189c225c7ceb29f6c3da6f2 Author: Karl Williamson <[email protected]> Date: Sat Feb 11 13:02:46 2017 -0700 pp_pack.c: Remove needless branch This function only sets *retlen to 0 if the input length is 0. In all but one case, the function was not called with that input. In that one case, I changed to avoid calling the function with that input. Hence we can remove checking *retlen for 0. M pp_pack.c commit 05fefba96af5488952719c6f57af518afba9170a Author: Karl Williamson <[email protected]> Date: Sat Feb 11 13:00:27 2017 -0700 pp_pack.c: Remove obsolete code This code effectively reduced to if (foo) 0 else 0 because a #define was changed to 0 some releases ago. Just replace by 0 M pp_pack.c commit 15b010f03b2e5fb7406abea47a023c54c42402a2 Author: Karl Williamson <[email protected]> Date: Sat Feb 11 12:58:16 2017 -0700 utf8.c: Move comment a few lines up in the file Move it to where it makes more sense. M utf8.c commit 0eb3d6a038a2f51874b4eb6b6268e2f305559b1d Author: Karl Williamson <[email protected]> Date: Sat Feb 11 12:57:33 2017 -0700 utf8.h: Clarify comment M utf8.h commit 7084498489de93f9ab8ba8d9c39bba8fcb2c9a6a Author: Karl Williamson <[email protected]> Date: Sat Feb 11 20:59:49 2017 -0700 utf8.h: White-space, parens only Add parens to clarify grouping, white-space for legibility M utf8.h commit d34818305fc99020f8962e6d266a233f2f2bbe10 Author: Karl Williamson <[email protected]> Date: Sat Feb 11 20:57:36 2017 -0700 utf8.h: Add branch prediction use bytes; is unlikely to be the case. 
M utf8.h ----------------------------------------------------------------------- Summary of changes: pp_pack.c | 38 ++++++++++++++++++-------------------- utf8.c | 18 +++++++++--------- utf8.h | 22 +++++++++++----------- 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/pp_pack.c b/pp_pack.c index 737e019a74..86d138bb05 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -251,12 +251,15 @@ STATIC U8 utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype) { STRLEN retlen; - UV val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, + UV val; + + if (*s >= end) { + goto croak; + } + val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - /* We try to process malformed UTF-8 as much as possible (preferably with - warnings), but these two mean we make no progress in the string and - might enter an infinite loop */ - if (retlen == (STRLEN) -1 || retlen == 0) + if (retlen == (STRLEN) -1) + croak: Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", (int) TYPE_NO_MODIFIERS(datumtype)); if (val >= 0x100) { @@ -290,7 +293,7 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, int buf_ for (;buf_len > 0; buf_len--) { if (from >= end) return FALSE; val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); - if (retlen == (STRLEN) -1 || retlen == 0) { + if (retlen == (STRLEN) -1) { from += UTF8SKIP(from); bad |= 1; } else from += retlen; @@ -396,7 +399,7 @@ STMT_START { \ STRLEN retlen; \ if (str >= end) break; \ val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ - if (retlen == (STRLEN) -1 || retlen == 0) { \ + if (retlen == (STRLEN) -1) { \ *cur = '\0'; \ Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ } \ @@ -1225,7 +1228,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c STRLEN retlen; aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, ckWARN(WARN_UTF8) ? 
0 : UTF8_ALLOW_ANY); - if (retlen == (STRLEN) -1 || retlen == 0) + if (retlen == (STRLEN) -1) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; } @@ -1248,7 +1251,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c STRLEN retlen; const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); - if (retlen == (STRLEN) -1 || retlen == 0) + if (retlen == (STRLEN) -1) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; if (!checksum) @@ -1310,7 +1313,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c strend - s, &retlen, UTF8_ALLOW_DEFAULT)); - if (retlen == (STRLEN) -1 || retlen == 0) + if (retlen == (STRLEN) -1) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; } @@ -2594,10 +2597,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) GROWING(0, cat, start, cur, len+UTF8_MAXLEN); end = start+SvLEN(cat)-UTF8_MAXLEN; } - cur = (char *) uvchr_to_utf8_flags((U8 *) cur, - auv, - warn_utf8 ? - 0 : UNICODE_ALLOW_ANY); + cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0); } else { if (auv >= 0x100) { if (!SvUTF8(cat)) { @@ -2648,9 +2648,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) auv = SvUV_no_inf(fromstr, datumtype); if (utf8) { U8 buffer[UTF8_MAXLEN], *endb; - endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), - warn_utf8 ? - 0 : UNICODE_ALLOW_ANY); + endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0); if (cur+(endb-buffer)*UTF8_EXPAND >= end) { *cur = '\0'; SvCUR_set(cat, cur - start); @@ -2666,9 +2664,9 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) GROWING(0, cat, start, cur, len+UTF8_MAXLEN); end = start+SvLEN(cat)-UTF8_MAXLEN; } - cur = (char *) uvchr_to_utf8_flags((U8 *) cur, UNI_TO_NATIVE(auv), - warn_utf8 ? 
- 0 : UNICODE_ALLOW_ANY); + cur = (char *) uvchr_to_utf8_flags((U8 *) cur, + UNI_TO_NATIVE(auv), + 0); } } break; diff --git a/utf8.c b/utf8.c index 9ce72daba0..bec68a5883 100644 --- a/utf8.c +++ b/utf8.c @@ -1179,14 +1179,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Save how many bytes were actually in the character */ curlen = s - s0; - /* A convenience macro that matches either of the too-short conditions. */ -# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { - uv_so_far = uv; - uv = UNICODE_REPLACEMENT; - } - /* Note that there are two types of too-short malformation. One is when * there is actual wrong data before the normal termination of the * sequence. The other is that the sequence wasn't complete before the end @@ -1194,7 +1186,15 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * This means that we were passed data for a partial character, but it is * valid as far as we saw. The other is definitely invalid. This * distinction could be important to a caller, so the two types are kept - * separate. */ + * separate. + * + * A convenience macro that matches either of the too-short conditions. */ +# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) + + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + uv_so_far = uv; + uv = UNICODE_REPLACEMENT; + } /* Check for overflow */ if (UNLIKELY(does_utf8_overflow(s0, send))) { diff --git a/utf8.h b/utf8.h index 0fbe4b79d0..affa2d67f5 100644 --- a/utf8.h +++ b/utf8.h @@ -707,7 +707,7 @@ with a ptr argument. /* A Unicode character can fold to up to 3 characters */ #define UTF8_MAX_FOLD_CHAR_EXPAND 3 -#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES) +#define IN_BYTES UNLIKELY(CopHINTS_get(PL_curcop) & HINT_BYTES) /* @@ -726,12 +726,12 @@ case any call to string overloading updates the internal UTF-8 encoding flag. 
* Is so within 'feature unicode_strings' or 'locale :not_characters', and not * within 'use bytes'. UTF-8 locales are not tested for here, but perhaps * could be */ -#define IN_UNI_8_BIT \ - (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT)) \ - || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \ - /* -1 below is for :not_characters */ \ - && _is_in_locale_category(FALSE, -1))) \ - && ! IN_BYTES) +#define IN_UNI_8_BIT \ + (( ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT)) \ + || ( CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \ + /* -1 below is for :not_characters */ \ + && _is_in_locale_category(FALSE, -1))) \ + && (! IN_BYTES)) #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */ @@ -802,10 +802,10 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_WARN_ILLEGAL_INTERCHANGE \ (UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR) -/* This is used typically for code that is willing to accept inputs of - * illformed UTF-8 sequences, for whatever reason. However, all such sequences - * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are - * also present. */ +/* This is typically used for code that processes UTF-8 input and doesn't want + * to have to deal with any malformations that might be present. All such will + * be safely replaced by the REPLACEMENT CHARACTER, unless other flags + * overriding this are also present. */ #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ |UTF8_ALLOW_NON_CONTINUATION \ |UTF8_ALLOW_SHORT \ -- Perl5 Master Repository
