[perl.git] branch blead, updated. v5.27.0-275-g3c5aa26268

Karl Williamson Thu, 08 Jun 2017 10:05:32 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/3c5aa26268facf2f11222e9c32dbafc7f4963105?hp=2a794dcfb0cb865c8a1f250b7019bc99bfeb5bb2>


- Log -----------------------------------------------------------------
commit 3c5aa26268facf2f11222e9c32dbafc7f4963105
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 19:26:37 2017 -0600

    utf8.c: White-space, comment only
    
    This adjusts the indentation to reflect changes in the previous commit.

M       utf8.c

commit 9fe0d3c275dfa1f5ca6d543f1dbbc34b8f7a16b1
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 19:21:41 2017 -0600

    utf8_to_bytes(): Avoid work if possible
    
    This converts to use the new function is_utf8_invariant_string_loc() to
    find the first variant in the input.  If none are found, the function is
    a no-op.  If the intial part of the input is all invariants, they are
    now skipped during conversion, resulting in less work for such input.
    
    The new function could also be optimized to speed up searching.

M       utf8.c

commit 28de57be1a3d5f5f97657c55e127e33e4082ba4d
Author: Karl Williamson <[email protected]>
Date:   Tue Jun 6 02:01:10 2017 -0600

    utf8.c: Change UTF8 to UVCHR
    
    There is no practical difference between UTF8_IS_INVARIANT and
    UVCHR_IS_INVARIANT, except that the latter is supposed to be used on
    characters.  Fix to conform.

M       utf8.c

commit dc772057bcaee68675253169d2cc0fd7f3dc61ba
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 7 21:51:03 2017 -0600

    sv.c: Refactor slightly to avoid a goto
    
    The introduction of the inline function in the previous commit makes it
    clear that the code can be refactored to be more structured.

M       sv.c

commit 7f764adcdf933d7a7b77989ac4bd7179ca460cf9
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 18:51:28 2017 -0600

    sv.c: Convert to use is_utf8_invariant_string_loc
    
    This inline function was added in the previous commit.
    
    And the function has the potential to be sped up by using word-at-a-time
    operations.

M       sv.c

commit 76d1063ed5ad16ae2d45a83f70269908d502c1a6
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 7 21:41:12 2017 -0600

    sv.c: Clarify some comments

M       sv.c

commit 0cbf58655b076c3a89ba0364e0c7c75972ac5fb3
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 18:33:05 2017 -0600

    Add XS-callable function is_utf8_invariant_string_loc()
    
    This is like is_utf8_invariant_string(), but takes an additional
    parameter, a pointer into which it stores the location of the first
    variant if any are found.

M       embed.fnc
M       embed.h
M       inline.h
M       pod/perldelta.pod
M       proto.h

commit 61343a04469440fa429b50b3763567fa52e0cbea
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 12:56:28 2017 -0600

    bytes_to_utf8(): Remove obsolete comment
    
    It said the logic was duplicated elsewhere, but now the essence of the
    logic is in an inlined function used in both places.

M       utf8.c

commit 38af28cf45abcc0148c23c2ade102da20c92dc66
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 7 21:17:56 2017 -0600

    utf8.c: White_space only
    
    The previous commit created a block around the lines changed by this
    current commit.

M       utf8.c

commit 170a1c22f1ad3af58230b4637da03f807a9efa56
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 12:26:06 2017 -0600

    bytes_from_utf8(): Use memcpy if all invariant
    
    This function does two passes over the input.  In the first it decides
    if the string can be downgraded, and computes the size needed for the
    downgraded string.  In the 2nd pass, it does the conversion.
    
    Adding a single 'if' to the function can bypass the 2nd pass completely
    if only invariants are found.  The 2nd pass is replaced by a memcpy().

M       utf8.c

commit 2a67563b073377398a953d7df7d276295521bfa6
Author: Karl Williamson <[email protected]>
Date:   Mon Jun 5 12:24:39 2017 -0600

    utf8.c: A byte count should be Size_t, not I32
    
    (or STRLEN.  I thought we were converting to Size_t, so that's what I
    chose here.)

M       utf8.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc         |  6 +++-
 embed.h           |  2 +-
 inline.h          | 39 +++++++++++++++++++---
 pod/perldelta.pod |  9 ++++++
 proto.h           |  7 ++--
 sv.c              | 78 ++++++++++++++++++++++----------------------
 utf8.c            | 97 +++++++++++++++++++++++++++++++++----------------------
 7 files changed, 150 insertions(+), 88 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index c25941437a..42450451c7 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -766,7 +766,11 @@ ADMpR      |bool   |is_uni_lower_lc|UV c
 ADMpR  |bool   |is_uni_print_lc|UV c
 ADMpR  |bool   |is_uni_punct_lc|UV c
 ADMpPR |bool   |is_uni_xdigit_lc|UV c
-AnidR  |bool   |is_utf8_invariant_string|NN const U8* const s|STRLEN const len
+AndmoR |bool   |is_utf8_invariant_string|NN const U8* const s              \
+               |STRLEN const len
+AnidR  |bool   |is_utf8_invariant_string_loc|NN const U8* const s          \
+               |STRLEN const len                                           \
+               |NULLOK const U8 ** ep
 AmnpdRP        |bool   |is_ascii_string|NN const U8* const s|const STRLEN len
 AmnpdRP        |bool   |is_invariant_string|NN const U8* const s|const STRLEN 
len
 AnpdD  |STRLEN |is_utf8_char   |NN const U8 *s
diff --git a/embed.h b/embed.h
index 79614b121d..aeec6370d9 100644
--- a/embed.h
+++ b/embed.h
@@ -294,7 +294,7 @@
 #define is_utf8_graph(a)       Perl_is_utf8_graph(aTHX_ a)
 #define is_utf8_idcont(a)      Perl_is_utf8_idcont(aTHX_ a)
 #define is_utf8_idfirst(a)     Perl_is_utf8_idfirst(aTHX_ a)
-#define is_utf8_invariant_string       S_is_utf8_invariant_string
+#define is_utf8_invariant_string_loc   S_is_utf8_invariant_string_loc
 #define is_utf8_lower(a)       Perl_is_utf8_lower(aTHX_ a)
 #define is_utf8_mark(a)                Perl_is_utf8_mark(aTHX_ a)
 #define is_utf8_perl_space(a)  Perl_is_utf8_perl_space(aTHX_ a)
diff --git a/inline.h b/inline.h
index 12633a36fa..d840d3d33b 100644
--- a/inline.h
+++ b/inline.h
@@ -353,19 +353,44 @@ and
 C<L</is_c9strict_utf8_string_loclen>>.
 
 =cut
+
+*/
+
+#define is_utf8_invariant_string(s, len)                                    \
+                                is_utf8_invariant_string_loc(s, len, NULL)
+
+/*
+=for apidoc is_utf8_invariant_string_loc
+
+Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
+the first UTF-8 variant character in the C<ep> pointer; if all characters are
+UTF-8 invariant, this function does not change the contents of C<*ep>.
+
+=cut
+
+XXX On ASCII machines this could be sped up by doing word-at-a-time operations
+
 */
 
 PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string(const U8* const s, const STRLEN len)
+S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 
** ep)
 {
     const U8* const send = s + (len ? len : strlen((const char *)s));
     const U8* x = s;
 
-    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING;
+    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
+
+    while (x < send) {
+       if (UTF8_IS_INVARIANT(*x)) {
+            x++;
+            continue;
+        }
+
+        if (ep) {
+            *ep = x;
+        }
 
-    for (; x < send; ++x) {
-       if (!UTF8_IS_INVARIANT(*x))
-           return FALSE;
+        return FALSE;
     }
 
     return TRUE;
@@ -388,6 +413,7 @@ code points are considered valid.
 
 See also
 C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
 C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loclen>>,
 C<L</is_utf8_fixed_width_buf_flags>>,
@@ -435,6 +461,7 @@ non-character code points.
 
 See also
 C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
 C<L</is_utf8_string>>,
 C<L</is_utf8_string_flags>>,
 C<L</is_utf8_string_loc>>,
@@ -491,6 +518,7 @@ L<Corrigendum 
#9|http://www.unicode.org/versions/corrigendum9.html>.
 
 See also
 C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
 C<L</is_utf8_string>>,
 C<L</is_utf8_string_flags>>,
 C<L</is_utf8_string_loc>>,
@@ -553,6 +581,7 @@ C<L</utf8n_to_uvchr>>, with the same meanings.
 
 See also
 C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
 C<L</is_utf8_string>>,
 C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loc_flags>>,
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 46b0fe0719..29d47bee11 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -409,6 +409,15 @@ The C<PL_statbuf> interpreter variable has been removed.
 The deprecated function C<to_utf8_case()>, accessible from XS code, has
 been removed.
 
+=item *
+
+A new function
+L<C<is_utf8_invariant_string_loc()>|perlapi/is_utf8_invariant_string_loc>
+has been added that is like
+L<C<is_utf8_invariant_string()>|perlapi/is_utf8_invariant_string>
+but takes an extra pointer parameter into which is stored the location
+of the first variant character, if any are found.
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/proto.h b/proto.h
index c6f80360d1..3b6922d503 100644
--- a/proto.h
+++ b/proto.h
@@ -1617,10 +1617,13 @@ PERL_CALLCONV bool      Perl_is_utf8_idfirst(pTHX_ 
const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST       \
        assert(p)
 
+/* PERL_CALLCONV bool  is_utf8_invariant_string(const U8* const s, STRLEN 
const len)
+                       __attribute__warn_unused_result__; */
+
 #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool        S_is_utf8_invariant_string(const U8* const s, 
STRLEN const len)
+PERL_STATIC_INLINE bool        S_is_utf8_invariant_string_loc(const U8* const 
s, STRLEN const len, const U8 ** ep)
                        __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING      \
+#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC  \
        assert(s)
 #endif
 
diff --git a/sv.c b/sv.c
index de27251f03..4576f9c2e0 100644
--- a/sv.c
+++ b/sv.c
@@ -3488,37 +3488,35 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, 
const I32 flags, STRLEN extr
        U8 * s = (U8 *) SvPVX_const(sv);
        U8 * e = (U8 *) SvEND(sv);
        U8 *t = s;
-       STRLEN two_byte_count = 0;
+       STRLEN two_byte_count;
        
-       if (flags & SV_FORCE_UTF8_UPGRADE) goto must_be_utf8;
-
-       /* See if really will need to convert to utf8.  We mustn't rely on our
-        * incoming SV being well formed and having a trailing '\0', as certain
-        * code in pp_formline can send us partially built SVs. */
-
-       while (t < e) {
-           const U8 ch = *t++;
-           if (NATIVE_BYTE_IS_INVARIANT(ch)) continue;
-
-           t--;    /* t already incremented; re-point to first variant */
-           two_byte_count = 1;
-           goto must_be_utf8;
-       }
+       if (flags & SV_FORCE_UTF8_UPGRADE) {
+            two_byte_count = 0;
+        }
+        else {
+            if (is_utf8_invariant_string_loc(s, SvCUR(sv), (const U8 **) &t)) {
 
-       /* utf8 conversion not needed because all are invariants.  Mark as
-        * UTF-8 even if no variant - saves scanning loop */
-       SvUTF8_on(sv);
-       if (extra) SvGROW(sv, SvCUR(sv) + extra);
-       return SvCUR(sv);
+                /* utf8 conversion not needed because all are invariants.  Mark
+                 * as UTF-8 even if no variant - saves scanning loop */
+                SvUTF8_on(sv);
+                if (extra) SvGROW(sv, SvCUR(sv) + extra);
+                return SvCUR(sv);
+            }
 
-      must_be_utf8:
+            /* Here, there is at least one variant, and t points to the first
+             * one */
+            two_byte_count = 1;
+        }
 
-       /* Here, the string should be converted to utf8, either because of an
-        * input flag (two_byte_count = 0), or because a character that
-        * requires 2 bytes was found (two_byte_count = 1).  t points either to
-        * the beginning of the string (if we didn't examine anything), or to
-        * the first variant.  In either case, everything from s to t - 1 will
-        * occupy only 1 byte each on output.
+        /* Note that the incoming SV may not have a trailing '\0', as certain
+         * code in pp_formline can send us partially built SVs.
+         *
+        * Here, the string should be converted to utf8, either because of an
+         * input flag (which causes two_byte_count to be set to 0), or because
+         * a character that requires 2 bytes was found (two_byte_count = 1).  t
+         * points either to the beginning of the string (if we didn't examine
+         * anything), or to the first variant.  In either case, everything from
+         * s to t - 1 will occupy only 1 byte each on output.
         *
         * There are two main ways to convert.  One is to create a new string
         * and go through the input starting from the beginning, appending each
@@ -3529,7 +3527,7 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const 
I32 flags, STRLEN extr
         * from s to t - 1 is invariant, the destination can be initialized
         * with these using a fast memory copy
         *
-        * The other way is to figure out exactly how big the string should be
+         * The other way is to figure out exactly how big the string should be,
         * by parsing the entire input.  Then you don't have to make it big
         * enough to handle the worst possible case, and more importantly, if
         * the string you already have is large enough, you don't have to
@@ -3551,18 +3549,18 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, 
const I32 flags, STRLEN extr
         *      value.  We go backwards through the string, converting until we
         *      get to the position we are at now, and then stop.  If this
         *      position is far enough along in the string, this method is
-        *      faster than the other method.  If the memory copy were the same
-        *      speed as the byte-by-byte loop, that position would be about
-        *      half-way, as at the half-way mark, parsing to the end and back
-        *      is one complete string's parse, the same amount as starting
-        *      over and going all the way through.  Actually, it would be
-        *      somewhat less than half-way, as it's faster to just count bytes
-        *      than to also copy, and we don't have the overhead of allocating
-        *      a new string, changing the scalar to use it, and freeing the
-        *      existing one.  But if the memory copy is fast, the break-even
-        *      point is somewhere after half way.  The counting loop could be
-        *      sped up by vectorization, etc, to move the break-even point
-        *      further towards the beginning.
+         *     faster than the first method above.  If the memory copy were
+         *     the same speed as the byte-by-byte loop, that position would be
+         *     about half-way, as at the half-way mark, parsing to the end and
+         *     back is one complete string's parse, the same amount as
+         *     starting over and going all the way through.  Actually, it
+         *     would be somewhat less than half-way, as it's faster to just
+         *     count bytes than to also copy, and we don't have the overhead
+         *     of allocating a new string, changing the scalar to use it, and
+         *     freeing the existing one.  But if the memory copy is fast, the
+         *     break-even point is somewhere after half way.  The counting
+         *     loop could be sped up by vectorization, etc, to move the
+         *     break-even point further towards the beginning.
         *  2)  if the string doesn't have enough space to handle the converted
         *      value.  A new string will have to be allocated, and one might
         *      as well, given that, start from the beginning doing the first
diff --git a/utf8.c b/utf8.c
index 39df019769..7e8a5db272 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1920,38 +1920,51 @@ If you need a copy of the string, see 
L</bytes_from_utf8>.
 U8 *
 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 {
-    U8 * const save = s;
-    U8 * const send = s + *len;
-    U8 *d;
+    U8 * first_variant;
 
     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
     PERL_UNUSED_CONTEXT;
 
-    /* ensure valid UTF-8 and chars < 256 before updating string */
-    while (s < send) {
-        if (! UTF8_IS_INVARIANT(*s)) {
-            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
-                *len = ((STRLEN) -1);
-                return 0;
+    /* This is a no-op if no variants at all in the input */
+    if (is_utf8_invariant_string_loc(s, *len, (const U8 **) &first_variant)) {
+        return s;
+    }
+
+    {
+        U8 * const save = s;
+        U8 * const send = s + *len;
+        U8 * d;
+
+        /* Nothing before the first variant needs to be changed, so start the 
real
+         * work there */
+        s = first_variant;
+        while (s < send) {
+            if (! UTF8_IS_INVARIANT(*s)) {
+                if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
+                    *len = ((STRLEN) -1);
+                    return 0;
+                }
+                s++;
             }
             s++;
         }
-        s++;
-    }
 
-    d = s = save;
-    while (s < send) {
-       U8 c = *s++;
-       if (! UTF8_IS_INVARIANT(c)) {
-           /* Then it is two-byte encoded */
-           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
-            s++;
-       }
-       *d++ = c;
+        /* Is downgradable, so do it */
+        d = s = first_variant;
+        while (s < send) {
+            U8 c = *s++;
+            if (! UVCHR_IS_INVARIANT(c)) {
+                /* Then it is two-byte encoded */
+                c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
+                s++;
+            }
+            *d++ = c;
+        }
+        *d = '\0';
+        *len = d - save;
+
+        return save;
     }
-    *d = '\0';
-    *len = d - save;
-    return save;
 }
 
 /*
@@ -1981,7 +1994,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool 
*is_utf8)
     U8 *d;
     const U8 *start = s;
     const U8 *send;
-    I32 count = 0;
+    Size_t count = 0;
 
     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
     PERL_UNUSED_CONTEXT;
@@ -2003,19 +2016,28 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, 
bool *is_utf8)
     *is_utf8 = FALSE;
 
     Newx(d, (*len) - count + 1, U8);
-    s = start; start = d;
-    while (s < send) {
-       U8 c = *s++;
-       if (! UTF8_IS_INVARIANT(c)) {
-           /* Then it is two-byte encoded */
-           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
-            s++;
-       }
-       *d++ = c;
+
+    if (LIKELY(count)) {
+        s = start; start = d;
+        while (s < send) {
+            U8 c = *s++;
+            if (! UTF8_IS_INVARIANT(c)) {
+                /* Then it is two-byte encoded */
+                c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
+                s++;
+            }
+            *d++ = c;
+        }
+        *d = '\0';
+        *len = d - start;
+
+        return (U8 *)start;
+    }
+    else {
+        Copy(start, d, *len, U8);
+        *(d + *len) = '\0';
+        return (U8 *)d;
     }
-    *d = '\0';
-    *len = d - start;
-    return (U8 *)start;
 }
 
 /*
@@ -2035,9 +2057,6 @@ see L</sv_recode_to_utf8>().
 =cut
 */
 
-/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
-   likewise need duplication. */
-
 U8*
 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 {

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.27.0-275-g3c5aa26268

Reply via email to