In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/63975e6ff2b59e9c3fcc4d085294363d332d2015?hp=b2e7ed74dcabdba63e3e8e2ff1980e1cd109b869>
- Log ----------------------------------------------------------------- commit 63975e6ff2b59e9c3fcc4d085294363d332d2015 Author: Karl Williamson <[email protected]> Date: Mon Nov 27 14:44:23 2017 -0700 APItest: Add ability to test API fcn utf8_length() commit 877e58e3ce0167926ed872f5c4d5fa89b2366aa8 Author: Karl Williamson <[email protected]> Date: Mon Nov 27 14:43:13 2017 -0700 APItest: Initialize parameter This silences a compiler warning commit a0d7f935a296987510e7187b2af3bfc0593464ac Author: Karl Williamson <[email protected]> Date: Sun Nov 26 17:19:49 2017 -0700 inline.h: White-space only Mostly this indents some code that the previous commit created blocks around. commit 33756530b5c7b031069d47839f8132f4574d2f50 Author: Karl Williamson <[email protected]> Date: Sun Nov 26 17:06:44 2017 -0700 Use is_utf8_invariant_string() more Now that this function was changed to do word-at-a time searching in commit e17544a60909ed9555c0dad7cd24afc40eb736e7, we can more quickly find the first variant byte in a string, if any. Given that a lot of usage of Perl is on ASCII data, it makes sense to try this first before any byte-at-a-time processing. Since Perl can be used on things that are mostly non-ASCII, we give up at the first such one, and process the rest of the string byte-by-byte. Otherwise we could have a pipeline of finding the next variant quickly, but this would only be faster if variants were rare, which I don't feel we can be confident about, after finding at least one. ----------------------------------------------------------------------- Summary of changes: embed.fnc | 18 ++-- ext/XS-APItest/APItest.xs | 9 +- inline.h | 222 ++++++++++++++++++++++++++++++---------------- proto.h | 18 ++-- 4 files changed, 174 insertions(+), 93 deletions(-) diff --git a/embed.fnc b/embed.fnc index 6f10fa8c78..d174290aae 100644 --- a/embed.fnc +++ b/embed.fnc @@ -783,7 +783,7 @@ AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \ |STRLEN len \ |NULLOK const U8 ** ep AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len -AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len +AmnpdRP |bool |is_invariant_string|NN const U8* const s|STRLEN len #if defined(PERL_CORE) || defined (PERL_EXT) EXnidR |bool |is_utf8_non_invariant_string|NN const U8* const s \ |STRLEN len @@ -796,14 +796,14 @@ AnidR |bool |is_utf8_string_flags \ AnmdpR |bool |is_strict_utf8_string|NN const U8 *s|STRLEN len AnmdpR |bool |is_c9strict_utf8_string|NN const U8 *s|STRLEN len Anpdmb |bool |is_utf8_string_loc \ - |NN const U8 *s|const STRLEN len|NN const U8 **ep + |NN const U8 *s|STRLEN len|NN const U8 **ep Andm |bool |is_utf8_string_loc_flags \ |NN const U8 *s|STRLEN len|NN const U8 **ep \ |const U32 flags Andm |bool |is_strict_utf8_string_loc \ - |NN const U8 *s|const STRLEN len|NN const U8 **ep + |NN const U8 *s|STRLEN len|NN const U8 **ep Andm |bool |is_c9strict_utf8_string_loc \ - |NN const U8 *s|const STRLEN len|NN const U8 **ep + |NN const U8 *s|STRLEN len|NN const U8 **ep Anipd |bool |is_utf8_string_loclen \ |NN const U8 *s|STRLEN len|NULLOK const U8 **ep \ |NULLOK STRLEN *el @@ -811,18 +811,18 @@ Anid |bool |is_utf8_string_loclen_flags \ |NN const U8 *s|STRLEN len|NULLOK const U8 **ep \ |NULLOK STRLEN *el|const U32 flags Anid |bool |is_strict_utf8_string_loclen \ - |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \ + |NN const U8 *s|STRLEN len|NULLOK const U8 **ep \ |NULLOK STRLEN *el Anid |bool |is_c9strict_utf8_string_loclen \ - |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \ + |NN const U8 *s|STRLEN len|NULLOK const U8 **ep \ |NULLOK STRLEN *el Amnd |bool |is_utf8_fixed_width_buf_flags \ - |NN const U8 * const s|const STRLEN len|const U32 flags + |NN const U8 * const s|STRLEN len|const U32 flags Amnd |bool |is_utf8_fixed_width_buf_loc_flags \ - |NN const U8 * const s|const STRLEN len \ + |NN const U8 * const s|STRLEN len \ |NULLOK const U8 **ep|const U32 flags Anid |bool |is_utf8_fixed_width_buf_loclen_flags \ - |NN const U8 * const s|const STRLEN len \ + |NN const U8 * const s|STRLEN len \ |NULLOK const U8 **ep|NULLOK STRLEN *el|const U32 flags AmndP |bool |is_utf8_valid_partial_char \ |NN const U8 * const s|NN const U8 * const e diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index ea793ba39e..5ceb7fe939 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -6008,7 +6008,7 @@ AV * test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len) PREINIT: AV *av; - const U8 * ep; + const U8 * ep = NULL; CODE: av = newAV(); av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep))); @@ -6017,6 +6017,13 @@ test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len) OUTPUT: RETVAL +STRLEN +test_utf8_length(unsigned char *s, STRLEN offset, STRLEN len) +CODE: + RETVAL = utf8_length(s + offset, s + len); +OUTPUT: + RETVAL + AV * test_is_utf8_string_loc(char *s, STRLEN len) PREINIT: diff --git a/inline.h b/inline.h index 309d74f435..dfad907cd3 100644 --- a/inline.h +++ b/inline.h @@ -652,8 +652,7 @@ C<L</is_c9strict_utf8_string_loclen>>. PERL_STATIC_INLINE bool S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) { - const U8* send; - const U8* x = s; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE @@ -679,13 +678,17 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) return is_c9strict_utf8_string(s, len); } - send = s + len; - while (x < send) { - STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - return FALSE; + if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { + const U8* const send = s + len; + const U8* x = first_variant; + + while (x < send) { + STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + return FALSE; + } + x += cur_len; } - x += cur_len; } return TRUE; @@ -721,31 +724,50 @@ See also C<L</is_utf8_string_loc>>. */ PERL_STATIC_INLINE bool -Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -779,31 +801,50 @@ See also C<L</is_strict_utf8_string_loc>>. */ PERL_STATIC_INLINE bool -S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -837,31 +878,50 @@ See also C<L</is_c9strict_utf8_string_loc>>. */ PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -902,16 +962,14 @@ See also C<L</is_utf8_string_loc_flags>>. PERL_STATIC_INLINE bool S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) { - const U8* send; - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_DISALLOW_PERL_EXTENDED))); if (len == 0) { - len = strlen((const char *)s); + len = strlen((const char *) s); } if (flags == 0) { @@ -930,24 +988,40 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el return is_c9strict_utf8_string_loclen(s, len, ep, el); } - send = s + len; - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - break; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; } - x += cur_len; - outlen++; + + return TRUE; } - if (el) - *el = outlen; + { + const U8* send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } - if (ep) { - *ep = x; - } + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } - return (x == send); + return (x == send); + } } /* @@ -1245,7 +1319,7 @@ complete, valid characters found in the C<el> pointer. PERL_STATIC_INLINE bool S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, - const STRLEN len, + STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) diff --git a/proto.h b/proto.h index d1fcc6279c..2b57ee5a07 100644 --- a/proto.h +++ b/proto.h @@ -1382,13 +1382,13 @@ PERL_CALLCONV bool Perl_isIDFIRST_lazy(pTHX_ const char* p) /* PERL_CALLCONV bool Perl_is_c9strict_utf8_string(const U8 *s, STRLEN len) __attribute__warn_unused_result__; */ -/* PERL_CALLCONV bool is_c9strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */ +/* PERL_CALLCONV bool is_c9strict_utf8_string_loc(const U8 *s, STRLEN len, const U8 **ep); */ #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el); +PERL_STATIC_INLINE bool S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el); #define PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN \ assert(s) #endif -/* PERL_CALLCONV bool Perl_is_invariant_string(const U8* const s, const STRLEN len) +/* PERL_CALLCONV bool Perl_is_invariant_string(const U8* const s, STRLEN len) __attribute__warn_unused_result__ __attribute__pure__; */ @@ -1405,9 +1405,9 @@ PERL_STATIC_INLINE bool S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, cons /* PERL_CALLCONV bool Perl_is_strict_utf8_string(const U8 *s, STRLEN len) __attribute__warn_unused_result__; */ -/* PERL_CALLCONV bool is_strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */ +/* PERL_CALLCONV bool is_strict_utf8_string_loc(const U8 *s, STRLEN len, const U8 **ep); */ #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el); +PERL_STATIC_INLINE bool S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el); #define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN \ assert(s) #endif @@ -1593,10 +1593,10 @@ PERL_CALLCONV bool Perl_is_utf8_digit(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_DIGIT \ assert(p) -/* PERL_CALLCONV bool is_utf8_fixed_width_buf_flags(const U8 * const s, const STRLEN len, const U32 flags); */ -/* PERL_CALLCONV bool is_utf8_fixed_width_buf_loc_flags(const U8 * const s, const STRLEN len, const U8 **ep, const U32 flags); */ +/* PERL_CALLCONV bool is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 flags); */ +/* PERL_CALLCONV bool is_utf8_fixed_width_buf_loc_flags(const U8 * const s, STRLEN len, const U8 **ep, const U32 flags); */ #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags); +PERL_STATIC_INLINE bool S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags); #define PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS \ assert(s) #endif @@ -1687,7 +1687,7 @@ PERL_STATIC_INLINE bool S_is_utf8_string_flags(const U8 *s, STRLEN len, const U3 #endif #ifndef NO_MATHOMS -PERL_CALLCONV bool Perl_is_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); +PERL_CALLCONV bool Perl_is_utf8_string_loc(const U8 *s, STRLEN len, const U8 **ep); #define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOC \ assert(s); assert(ep) #endif -- Perl5 Master Repository
