In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/3c5aa26268facf2f11222e9c32dbafc7f4963105?hp=2a794dcfb0cb865c8a1f250b7019bc99bfeb5bb2>
- Log ----------------------------------------------------------------- commit 3c5aa26268facf2f11222e9c32dbafc7f4963105 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 19:26:37 2017 -0600 utf8.c: White-space, comment only This adjusts the indentation to reflect changes in the previous commit. M utf8.c commit 9fe0d3c275dfa1f5ca6d543f1dbbc34b8f7a16b1 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 19:21:41 2017 -0600 utf8_to_bytes(): Avoid work if possible This converts to use the new function is_utf8_invariant_string_loc() to find the first variant in the input. If none are found, the function is a no-op. If the intial part of the input is all invariants, they are now skipped during conversion, resulting in less work for such input. The new function could also be optimized to speed up searching. M utf8.c commit 28de57be1a3d5f5f97657c55e127e33e4082ba4d Author: Karl Williamson <[email protected]> Date: Tue Jun 6 02:01:10 2017 -0600 utf8.c: Change UTF8 to UVCHR There is no practical difference between UTF8_IS_INVARIANT and UVCHR_IS_INVARIANT, except that the latter is supposed to be used on characters. Fix to conform. M utf8.c commit dc772057bcaee68675253169d2cc0fd7f3dc61ba Author: Karl Williamson <[email protected]> Date: Wed Jun 7 21:51:03 2017 -0600 sv.c: Refactor slightly to avoid a goto The introduction of the inline function in the previous commit makes it clear that the code can be refactored to be more structured. M sv.c commit 7f764adcdf933d7a7b77989ac4bd7179ca460cf9 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 18:51:28 2017 -0600 sv.c: Convert to use is_utf8_invariant_string_loc This inline function was added in the previous commit. And the function has the potential to be sped up by using word-at-a-time operations. M sv.c commit 76d1063ed5ad16ae2d45a83f70269908d502c1a6 Author: Karl Williamson <[email protected]> Date: Wed Jun 7 21:41:12 2017 -0600 sv.c: Clarify some comments M sv.c commit 0cbf58655b076c3a89ba0364e0c7c75972ac5fb3 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 18:33:05 2017 -0600 Add XS-callable function is_utf8_invariant_string_loc() This is like is_utf8_invariant_string(), but takes an additional parameter, a pointer into which it stores the location of the first variant if any are found. M embed.fnc M embed.h M inline.h M pod/perldelta.pod M proto.h commit 61343a04469440fa429b50b3763567fa52e0cbea Author: Karl Williamson <[email protected]> Date: Mon Jun 5 12:56:28 2017 -0600 bytes_to_utf8(): Remove obsolete comment It said the logic was duplicated elsewhere, but now the essence of the logic is in an inlined function used in both places. M utf8.c commit 38af28cf45abcc0148c23c2ade102da20c92dc66 Author: Karl Williamson <[email protected]> Date: Wed Jun 7 21:17:56 2017 -0600 utf8.c: White_space only The previous commit created a block around the lines changed by this current commit. M utf8.c commit 170a1c22f1ad3af58230b4637da03f807a9efa56 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 12:26:06 2017 -0600 bytes_from_utf8(): Use memcpy if all invariant This function does two passes over the input. In the first it decides if the string can be downgraded, and computes the size needed for the downgraded string. In the 2nd pass, it does the conversion. Adding a single 'if' to the function can bypass the 2nd pass completely if only invariants are found. The 2nd pass is replaced by a memcpy(). M utf8.c commit 2a67563b073377398a953d7df7d276295521bfa6 Author: Karl Williamson <[email protected]> Date: Mon Jun 5 12:24:39 2017 -0600 utf8.c: A byte count should be Size_t, not I32 (or STRLEN. I thought we were converting to Size_t, so that's what I chose here.) M utf8.c ----------------------------------------------------------------------- Summary of changes: embed.fnc | 6 +++- embed.h | 2 +- inline.h | 39 +++++++++++++++++++--- pod/perldelta.pod | 9 ++++++ proto.h | 7 ++-- sv.c | 78 ++++++++++++++++++++++---------------------- utf8.c | 97 +++++++++++++++++++++++++++++++++---------------------- 7 files changed, 150 insertions(+), 88 deletions(-) diff --git a/embed.fnc b/embed.fnc index c25941437a..42450451c7 100644 --- a/embed.fnc +++ b/embed.fnc @@ -766,7 +766,11 @@ ADMpR |bool |is_uni_lower_lc|UV c ADMpR |bool |is_uni_print_lc|UV c ADMpR |bool |is_uni_punct_lc|UV c ADMpPR |bool |is_uni_xdigit_lc|UV c -AnidR |bool |is_utf8_invariant_string|NN const U8* const s|STRLEN const len +AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \ + |STRLEN const len +AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \ + |STRLEN const len \ + |NULLOK const U8 ** ep AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len AnpdD |STRLEN |is_utf8_char |NN const U8 *s diff --git a/embed.h b/embed.h index 79614b121d..aeec6370d9 100644 --- a/embed.h +++ b/embed.h @@ -294,7 +294,7 @@ #define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a) #define is_utf8_idcont(a) Perl_is_utf8_idcont(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) -#define is_utf8_invariant_string S_is_utf8_invariant_string +#define is_utf8_invariant_string_loc S_is_utf8_invariant_string_loc #define is_utf8_lower(a) Perl_is_utf8_lower(aTHX_ a) #define is_utf8_mark(a) Perl_is_utf8_mark(aTHX_ a) #define is_utf8_perl_space(a) Perl_is_utf8_perl_space(aTHX_ a) diff --git a/inline.h b/inline.h index 12633a36fa..d840d3d33b 100644 --- a/inline.h +++ b/inline.h @@ -353,19 +353,44 @@ and C<L</is_c9strict_utf8_string_loclen>>. =cut + +*/ + +#define is_utf8_invariant_string(s, len) \ + is_utf8_invariant_string_loc(s, len, NULL) + +/* +=for apidoc is_utf8_invariant_string_loc + +Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of +the first UTF-8 variant character in the C<ep> pointer; if all characters are +UTF-8 invariant, this function does not change the contents of C<*ep>. + +=cut + +XXX On ASCII machines this could be sped up by doing word-at-a-time operations + */ PERL_STATIC_INLINE bool -S_is_utf8_invariant_string(const U8* const s, const STRLEN len) +S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; - PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING; + PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; + + while (x < send) { + if (UTF8_IS_INVARIANT(*x)) { + x++; + continue; + } + + if (ep) { + *ep = x; + } - for (; x < send; ++x) { - if (!UTF8_IS_INVARIANT(*x)) - return FALSE; + return FALSE; } return TRUE; @@ -388,6 +413,7 @@ code points are considered valid. See also C<L</is_utf8_invariant_string>>, +C<L</is_utf8_invariant_string_loc>>, C<L</is_utf8_string_loc>>, C<L</is_utf8_string_loclen>>, C<L</is_utf8_fixed_width_buf_flags>>, @@ -435,6 +461,7 @@ non-character code points. See also C<L</is_utf8_invariant_string>>, +C<L</is_utf8_invariant_string_loc>>, C<L</is_utf8_string>>, C<L</is_utf8_string_flags>>, C<L</is_utf8_string_loc>>, @@ -491,6 +518,7 @@ L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. See also C<L</is_utf8_invariant_string>>, +C<L</is_utf8_invariant_string_loc>>, C<L</is_utf8_string>>, C<L</is_utf8_string_flags>>, C<L</is_utf8_string_loc>>, @@ -553,6 +581,7 @@ C<L</utf8n_to_uvchr>>, with the same meanings. See also C<L</is_utf8_invariant_string>>, +C<L</is_utf8_invariant_string_loc>>, C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, C<L</is_utf8_string_loc_flags>>, diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 46b0fe0719..29d47bee11 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -409,6 +409,15 @@ The C<PL_statbuf> interpreter variable has been removed. The deprecated function C<to_utf8_case()>, accessible from XS code, has been removed. +=item * + +A new function +L<C<is_utf8_invariant_string_loc()>|perlapi/is_utf8_invariant_string_loc> +has been added that is like +L<C<is_utf8_invariant_string()>|perlapi/is_utf8_invariant_string> +but takes an extra pointer parameter into which is stored the location +of the first variant character, if any are found. + =back =head1 Selected Bug Fixes diff --git a/proto.h b/proto.h index c6f80360d1..3b6922d503 100644 --- a/proto.h +++ b/proto.h @@ -1617,10 +1617,13 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \ assert(p) +/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len) + __attribute__warn_unused_result__; */ + #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_utf8_invariant_string(const U8* const s, STRLEN const len) +PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep) __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING \ +#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \ assert(s) #endif diff --git a/sv.c b/sv.c index de27251f03..4576f9c2e0 100644 --- a/sv.c +++ b/sv.c @@ -3488,37 +3488,35 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extr U8 * s = (U8 *) SvPVX_const(sv); U8 * e = (U8 *) SvEND(sv); U8 *t = s; - STRLEN two_byte_count = 0; + STRLEN two_byte_count; - if (flags & SV_FORCE_UTF8_UPGRADE) goto must_be_utf8; - - /* See if really will need to convert to utf8. We mustn't rely on our - * incoming SV being well formed and having a trailing '\0', as certain - * code in pp_formline can send us partially built SVs. */ - - while (t < e) { - const U8 ch = *t++; - if (NATIVE_BYTE_IS_INVARIANT(ch)) continue; - - t--; /* t already incremented; re-point to first variant */ - two_byte_count = 1; - goto must_be_utf8; - } + if (flags & SV_FORCE_UTF8_UPGRADE) { + two_byte_count = 0; + } + else { + if (is_utf8_invariant_string_loc(s, SvCUR(sv), (const U8 **) &t)) { - /* utf8 conversion not needed because all are invariants. Mark as - * UTF-8 even if no variant - saves scanning loop */ - SvUTF8_on(sv); - if (extra) SvGROW(sv, SvCUR(sv) + extra); - return SvCUR(sv); + /* utf8 conversion not needed because all are invariants. Mark + * as UTF-8 even if no variant - saves scanning loop */ + SvUTF8_on(sv); + if (extra) SvGROW(sv, SvCUR(sv) + extra); + return SvCUR(sv); + } - must_be_utf8: + /* Here, there is at least one variant, and t points to the first + * one */ + two_byte_count = 1; + } - /* Here, the string should be converted to utf8, either because of an - * input flag (two_byte_count = 0), or because a character that - * requires 2 bytes was found (two_byte_count = 1). t points either to - * the beginning of the string (if we didn't examine anything), or to - * the first variant. In either case, everything from s to t - 1 will - * occupy only 1 byte each on output. + /* Note that the incoming SV may not have a trailing '\0', as certain + * code in pp_formline can send us partially built SVs. + * + * Here, the string should be converted to utf8, either because of an + * input flag (which causes two_byte_count to be set to 0), or because + * a character that requires 2 bytes was found (two_byte_count = 1). t + * points either to the beginning of the string (if we didn't examine + * anything), or to the first variant. In either case, everything from + * s to t - 1 will occupy only 1 byte each on output. * * There are two main ways to convert. One is to create a new string * and go through the input starting from the beginning, appending each @@ -3529,7 +3527,7 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extr * from s to t - 1 is invariant, the destination can be initialized * with these using a fast memory copy * - * The other way is to figure out exactly how big the string should be + * The other way is to figure out exactly how big the string should be, * by parsing the entire input. Then you don't have to make it big * enough to handle the worst possible case, and more importantly, if * the string you already have is large enough, you don't have to @@ -3551,18 +3549,18 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extr * value. We go backwards through the string, converting until we * get to the position we are at now, and then stop. If this * position is far enough along in the string, this method is - * faster than the other method. If the memory copy were the same - * speed as the byte-by-byte loop, that position would be about - * half-way, as at the half-way mark, parsing to the end and back - * is one complete string's parse, the same amount as starting - * over and going all the way through. Actually, it would be - * somewhat less than half-way, as it's faster to just count bytes - * than to also copy, and we don't have the overhead of allocating - * a new string, changing the scalar to use it, and freeing the - * existing one. But if the memory copy is fast, the break-even - * point is somewhere after half way. The counting loop could be - * sped up by vectorization, etc, to move the break-even point - * further towards the beginning. + * faster than the first method above. If the memory copy were + * the same speed as the byte-by-byte loop, that position would be + * about half-way, as at the half-way mark, parsing to the end and + * back is one complete string's parse, the same amount as + * starting over and going all the way through. Actually, it + * would be somewhat less than half-way, as it's faster to just + * count bytes than to also copy, and we don't have the overhead + * of allocating a new string, changing the scalar to use it, and + * freeing the existing one. But if the memory copy is fast, the + * break-even point is somewhere after half way. The counting + * loop could be sped up by vectorization, etc, to move the + * break-even point further towards the beginning. * 2) if the string doesn't have enough space to handle the converted * value. A new string will have to be allocated, and one might * as well, given that, start from the beginning doing the first diff --git a/utf8.c b/utf8.c index 39df019769..7e8a5db272 100644 --- a/utf8.c +++ b/utf8.c @@ -1920,38 +1920,51 @@ If you need a copy of the string, see L</bytes_from_utf8>. U8 * Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) { - U8 * const save = s; - U8 * const send = s + *len; - U8 *d; + U8 * first_variant; PERL_ARGS_ASSERT_UTF8_TO_BYTES; PERL_UNUSED_CONTEXT; - /* ensure valid UTF-8 and chars < 256 before updating string */ - while (s < send) { - if (! UTF8_IS_INVARIANT(*s)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - *len = ((STRLEN) -1); - return 0; + /* This is a no-op if no variants at all in the input */ + if (is_utf8_invariant_string_loc(s, *len, (const U8 **) &first_variant)) { + return s; + } + + { + U8 * const save = s; + U8 * const send = s + *len; + U8 * d; + + /* Nothing before the first variant needs to be changed, so start the real + * work there */ + s = first_variant; + while (s < send) { + if (! UTF8_IS_INVARIANT(*s)) { + if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { + *len = ((STRLEN) -1); + return 0; + } + s++; } s++; } - s++; - } - d = s = save; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; - } - *d++ = c; + /* Is downgradable, so do it */ + d = s = first_variant; + while (s < send) { + U8 c = *s++; + if (! UVCHR_IS_INVARIANT(c)) { + /* Then it is two-byte encoded */ + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); + s++; + } + *d++ = c; + } + *d = '\0'; + *len = d - save; + + return save; } - *d = '\0'; - *len = d - save; - return save; } /* @@ -1981,7 +1994,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) U8 *d; const U8 *start = s; const U8 *send; - I32 count = 0; + Size_t count = 0; PERL_ARGS_ASSERT_BYTES_FROM_UTF8; PERL_UNUSED_CONTEXT; @@ -2003,19 +2016,28 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) *is_utf8 = FALSE; Newx(d, (*len) - count + 1, U8); - s = start; start = d; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; - } - *d++ = c; + + if (LIKELY(count)) { + s = start; start = d; + while (s < send) { + U8 c = *s++; + if (! UTF8_IS_INVARIANT(c)) { + /* Then it is two-byte encoded */ + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); + s++; + } + *d++ = c; + } + *d = '\0'; + *len = d - start; + + return (U8 *)start; + } + else { + Copy(start, d, *len, U8); + *(d + *len) = '\0'; + return (U8 *)d; } - *d = '\0'; - *len = d - start; - return (U8 *)start; } /* @@ -2035,9 +2057,6 @@ see L</sv_recode_to_utf8>(). =cut */ -/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will - likewise need duplication. */ - U8* Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) { -- Perl5 Master Repository
