In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/b50535da2f4aaf97d13e96cda0069755fb6bbf76?hp=74102a88afc9d8f1973892ce66cf9a83e79d043a>
- Log ----------------------------------------------------------------- commit b50535da2f4aaf97d13e96cda0069755fb6bbf76 Author: Karl Williamson <[email protected]> Date: Tue Jun 13 22:09:25 2017 -0600 Allow bitwise & ^ | to accept trailing UTF-8 Commit 08b6664b858b8fd4b5c0c27542763337b6d78e46 breaks things like $foo = "" & "\x{100}"; We have deprecated using above-FF code points in bitwise operations, and made them illegal in 5.27. However, the case where the illegal code points don't play a part in the operation never raised deprecation warnings. The example above is one such, because the \x{100} comes after the operation stops since the other operand has length 0. We can't make something illegal without warning people about it for 2 releases. Rather than revert that commit, and reinstate a bunch of slow code that is far more general than now needed, this commit adds some extra code to deal with these situations, but the basic operations still take place in tight loops, which 08b6664b858b8fd4b5c0c27542763337b6d78e46 caused to happen. In the case of "&", the illegal code points get truncated away. In the case of ^ and |, they get catenated as-is. This preserves earlier behavior. It has not been decided if these should at least warn, or the usage should be deprecated. A commit can easily be done to change to whatever the final decision is, but this commit doesn't raise any warnings, hence preserves existing behavior. The breaking commit looks like it might create some havoc with CPAN, and fixing it now will save the CPAN testers effort, as they won't have to deal with a bunch of broken distributions. M doop.c M t/op/bop.t commit 7299a0452a733f22046692b3c19ac8e15db8db0a Author: Karl Williamson <[email protected]> Date: Tue Jun 13 21:50:40 2017 -0600 utf8.c: White-space only Outdent some code that had a surrounding block removed in the previous commit. M utf8.c commit 976c1b0821cf2c6b33779dfb8e251deaee29f7bd Author: Karl Williamson <[email protected]> Date: Tue Jun 13 21:37:22 2017 -0600 Add new function utf8_from_bytes_loc() This is currently undocumented externally, so we can change the API if needed. This is like utf8_from_bytes(), but in the case of not being able to convert the whole string, it converts the initial substring that is convertible, and tells you where it had to stop. M embed.fnc M embed.h M proto.h M utf8.c M utf8.h commit 23b37b1241a38c682dd4cd59cd6355024048868d Author: Karl Williamson <[email protected]> Date: Tue Jun 13 21:15:50 2017 -0600 Clarify pod for bytes to/from utf8() M utf8.c commit 59f80a355333d79f772276049e38ca6e91f4a437 Author: Karl Williamson <[email protected]> Date: Fri Jun 9 14:21:01 2017 -0600 embed.fnc: Add some comments M embed.fnc commit 41ae60897350f0b7028765b91dead2d244b1176f Author: Karl Williamson <[email protected]> Date: Fri Jun 9 11:44:58 2017 -0600 bytes_from_utf8(): parameter must not be NULL The function assumes that the parameter is not NULL. Declare that to embed.fnc. Also change the name to indicate that it is a pointer. M embed.fnc M proto.h M utf8.c ----------------------------------------------------------------------- Summary of changes: doop.c | 89 +++++++++++++++++++++++++++++----- embed.fnc | 12 ++++- embed.h | 2 +- proto.h | 7 ++- t/op/bop.t | 9 +++- utf8.c | 161 ++++++++++++++++++++++++++++++++++++++++++------------------- utf8.h | 2 + 7 files changed, 215 insertions(+), 67 deletions(-) diff --git a/doop.c b/doop.c index bb679a8987..47d7fce2ab 100644 --- a/doop.c +++ b/doop.c @@ -1030,6 +1030,14 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) const char *rsave; STRLEN needlen = 0; bool result_needs_to_be_utf8 = FALSE; + bool left_utf8 = FALSE; + bool right_utf8 = FALSE; + U8 * left_non_downgraded = NULL; + U8 * right_non_downgraded = NULL; + Size_t left_non_downgraded_len = 0; + Size_t right_non_downgraded_len = 0; + char * non_downgraded = NULL; + Size_t non_downgraded_len = 0; PERL_ARGS_ASSERT_DO_VOP; @@ -1049,32 +1057,70 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) /* Create downgraded temporaries of any UTF-8 encoded operands */ if (DO_UTF8(left)) { - bool utf8 = TRUE; + const U8 * save_lc = (U8 *) lc; + left_utf8 = TRUE; result_needs_to_be_utf8 = TRUE; - lc = (char *) bytes_from_utf8((const U8 *) lc, &leftlen, &utf8); - if (utf8) { - Perl_croak(aTHX_ fatal_above_ff_msg, PL_op_desc[optype]); - } - SAVEFREEPV(lc); + left_non_downgraded_len = leftlen; + lc = (char *) bytes_from_utf8_loc((const U8 *) lc, &leftlen, + &left_utf8, + (const U8 **) &left_non_downgraded); + /* Calculate the number of trailing unconvertible bytes. This quantity + * is the original length minus the length of the converted portion. */ + left_non_downgraded_len -= left_non_downgraded - save_lc; + SAVEFREEPV(lc); } if (DO_UTF8(right)) { - bool utf8 = TRUE; + const U8 * save_rc = (U8 *) rc; + right_utf8 = TRUE; result_needs_to_be_utf8 = TRUE; - rc = (char *) bytes_from_utf8((const U8 *) rc, &rightlen, &utf8); - if (utf8) { + right_non_downgraded_len = rightlen; + rc = (char *) bytes_from_utf8_loc((const U8 *) rc, &rightlen, + &right_utf8, + (const U8 **) &right_non_downgraded); + right_non_downgraded_len -= right_non_downgraded - save_rc; + SAVEFREEPV(rc); + } + + /* We set 'len' to the length that the operation actually operates on. The + * dangling part of the longer operand doesn't actually participate in the + * operation. What happens is that we pretend that the shorter operand has + * been extended to the right by enough imaginary zeros to match the length + * of the longer one. But we know in advance the result of the operation + * on zeros without having to do it. In the case of '&', the result is + * zero, and the dangling portion is simply discarded. For '|' and '^', the + * result is the same as the other operand, so the dangling part is just + * appended to the final result, unchanged. We currently accept above-FF + * code points in the dangling portion, as that's how it has long worked, + * and code depends on it staying that way. But it is now fatal for + * above-FF to appear in the portion that does get operated on. Hence, any + * above-FF must come only in the longer operand, and only in its dangling + * portion. That means that at least one of the operands has to be + * entirely non-UTF-8, and the length of that operand has to be before the + * first above-FF in the other */ + if (left_utf8) { + if (right_utf8 || rightlen > leftlen) { Perl_croak(aTHX_ fatal_above_ff_msg, PL_op_desc[optype]); } - SAVEFREEPV(rc); + len = rightlen; + } + else if (right_utf8) { + if (leftlen > rightlen) { + Perl_croak(aTHX_ fatal_above_ff_msg, PL_op_desc[optype]); + } + len = leftlen; + } + else { /* Neither is UTF-8 */ + len = leftlen < rightlen ? leftlen : rightlen; } + lensave = len; lsave = lc; rsave = rc; - len = leftlen < rightlen ? leftlen : rightlen; - lensave = len; + SvCUR_set(sv, len); (void)SvPOK_only(sv); if (SvOK(sv) || SvTYPE(sv) > SVt_PVMG) { @@ -1167,11 +1213,28 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) } *SvEND(sv) = '\0'; + /* If there is trailing stuff that couldn't be converted from UTF-8, it + * is appended as-is for the ^ and | operators. This preserves + * backwards compatibility */ + if (right_non_downgraded) { + non_downgraded = (char *) right_non_downgraded; + non_downgraded_len = right_non_downgraded_len; + } + else if (left_non_downgraded) { + non_downgraded = (char *) left_non_downgraded; + non_downgraded_len = left_non_downgraded_len; + } + break; } if (result_needs_to_be_utf8) { - sv_utf8_upgrade_nomg(sv); + sv_utf8_upgrade_nomg(sv); + + /* Append any trailing UTF-8 as-is. */ + if (non_downgraded) { + sv_catpvn_nomg(sv, non_downgraded, non_downgraded_len); + } } SvTAINT(sv); diff --git a/embed.fnc b/embed.fnc index f5ca638515..c8576af84a 100644 --- a/embed.fnc +++ b/embed.fnc @@ -124,6 +124,12 @@ : : o Has no Perl_foo or S_foo compatibility macro: : +: This can be used when you define a macro with this entry's name that +: doesn't call the function specified by this entry. This is typically +: done for a function that effectively just wraps another one, and where +: the macro form calls the underlying function directly. For these, also +: specify the 'm' flag. Legacy-only functions should instead use 'b'. +: : embed.h: suppress "#define foo Perl_foo" : : P Pure function: @@ -1792,7 +1798,11 @@ AipdRn |U8* |utf8_hop_safe |NN const U8 *s|SSize_t off|NN const U8 *start|NN con ApMd |U8* |utf8_to_bytes |NN U8 *s|NN STRLEN *lenp Apd |int |bytes_cmp_utf8 |NN const U8 *b|STRLEN blen|NN const U8 *u \ |STRLEN ulen -ApMd |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *lenp|NULLOK bool *is_utf8 +AModp |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *lenp|NN bool *is_utf8p +AMnp |U8* |bytes_from_utf8_loc|NN const U8 *s \ + |NN STRLEN *lenp \ + |NN bool *is_utf8p \ + |NULLOK const U8 ** first_unconverted ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *lenp ApdD |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen ApdD |UV |utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen diff --git a/embed.h b/embed.h index aeec6370d9..7cc3bbe205 100644 --- a/embed.h +++ b/embed.h @@ -71,7 +71,7 @@ #define block_gimme() Perl_block_gimme(aTHX) #define block_start(a) Perl_block_start(aTHX_ a) #define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) -#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) +#define bytes_from_utf8_loc Perl_bytes_from_utf8_loc #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) #define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) #define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) diff --git a/proto.h b/proto.h index 06df4e611b..d9c8798498 100644 --- a/proto.h +++ b/proto.h @@ -256,9 +256,12 @@ PERL_CALLCONV void Perl_boot_core_mro(pTHX); PERL_CALLCONV int Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen); #define PERL_ARGS_ASSERT_BYTES_CMP_UTF8 \ assert(b); assert(u) -PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8); +PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p); #define PERL_ARGS_ASSERT_BYTES_FROM_UTF8 \ - assert(s); assert(lenp) + assert(s); assert(lenp); assert(is_utf8p) +PERL_CALLCONV U8* Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8 ** first_unconverted); +#define PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC \ + assert(s); assert(lenp); assert(is_utf8p) PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ assert(s); assert(lenp) diff --git a/t/op/bop.t b/t/op/bop.t index 541d671b69..c1be5883c8 100644 --- a/t/op/bop.t +++ b/t/op/bop.t @@ -18,7 +18,7 @@ BEGIN { # If you find tests are failing, please try adding names to tests to track # down where the failure is, and supply your new names as a patch. # (Just-in-time test naming) -plan tests => 471; +plan tests => 477; # numerics ok ((0xdead & 0xbeef) == 0x9ead); @@ -579,3 +579,10 @@ foreach my $op_info ([and => "&"], [or => "|"], [xor => "^"]) { "Use of code points above 0xFF as argument to 1's complement " . "(~) is not allowed"; } + +is("abc" & "abc\x{100}", "abc", '"abc" & "abc\x{100}" works'); +is("abc" | "abc\x{100}", "abc\x{100}", '"abc" | "abc\x{100}" works'); +is("abc" ^ "abc\x{100}", "\0\0\0\x{100}", '"abc" ^ "abc\x{100}" works'); +is("abc\x{100}" & "abc", "abc", '"abc\x{100}" & "abc" works'); +is("abc\x{100}" | "abc", "abc\x{100}", '"abc\x{100}" | "abc" works'); +is("abc\x{100}" ^ "abc", "\0\0\0\x{100}", '"abc\x{100}" | "abc" works'); diff --git a/utf8.c b/utf8.c index e8e143c49e..6b92023c4c 100644 --- a/utf8.c +++ b/utf8.c @@ -1913,8 +1913,8 @@ updates C<*lenp> to contain the new length. Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. Upon successful return, the number of variants in the string can be computed by -saving the value of C<*lenp> before the call, and subtracting the after-call -value of C<*lenp> from it. +having saved the value of C<*lenp> before the call, and subtracting the +after-call value of C<*lenp> from it. If you need a copy of the string, see L</bytes_from_utf8>. @@ -1975,77 +1975,140 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) =for apidoc bytes_from_utf8 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native -byte encoding. On input, the boolean C<*is_utf8> gives whether or not C<s> is +byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is actually encoded in UTF-8. Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of the input string. -Do nothing if C<*is_utf8> is 0, or if there are code points in the string -not expressible in native byte encoding. In these cases, C<*is_utf8> and +Do nothing if C<*is_utf8p> is 0, or if there are code points in the string +not expressible in native byte encoding. In these cases, C<*is_utf8p> and C<*lenp> are unchanged, and the return value is the original C<s>. -Otherwise, C<*is_utf8> is set to 0, and the return value is a pointer to a +Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a newly created string containing a downgraded copy of C<s>, and whose length is -returned in C<*lenp>, updated. +returned in C<*lenp>, updated. The new string is C<NUL>-terminated. Upon successful return, the number of variants in the string can be computed by -saving the value of C<*lenp> before the call, and subtracting the after-call -value of C<*lenp> from it. +having saved the value of C<*lenp> before the call, and subtracting the +after-call value of C<*lenp> from it. =cut + +There is a macro that avoids this function call, but this is retained for +anyone who calls it with the Perl_ prefix */ + +U8 * +Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) +{ + PERL_ARGS_ASSERT_BYTES_FROM_UTF8; + PERL_UNUSED_CONTEXT; + + return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); +} + +/* +No = here because currently externally undocumented +for apidoc bytes_from_utf8_loc + +Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where +to store the location of the first character in C<"s"> that cannot be +converted to non-UTF8. + +If that parameter is C<NULL>, this function behaves identically to +C<bytes_from_utf8>. + +Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to +C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>. + +Otherwise, the function returns a newly created C<NUL>-terminated string +containing the non-UTF8 equivalent of the convertible first portion of +C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>. +If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, +and C<*first_non_downgradable> is set to C<NULL>. + +Otherwise, C<*first_non_downgradable> set to point to the first byte of the +first character in the original string that wasn't converted. C<*is_utf8p> is +unchanged. Note that the new string may have length 0. + +Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and +C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and +converts as many characters in it as possible stopping at the first one it +finds one that can't be converted to non-UTF-8. C<*first_non_downgradable> is +set to point to that. The function returns the portion that could be converted +in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length, +not including the terminating C<NUL>. If the very first character in the +original could not be converted, C<*lenp> will be 0, and the new string will +contain just a single C<NUL>. If the entire input string was converted, +C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>. + +Upon successful return, the number of variants in the converted portion of the +string can be computed by having saved the value of C<*lenp> before the call, +and subtracting the after-call value of C<*lenp> from it. + +=cut + + */ U8 * -Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8) +Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) { U8 *d; - const U8 *start = s; - const U8 *send; - Size_t count = 0; + const U8 *original = s; + U8 *converted_start; + const U8 *send = s + *lenp; - PERL_ARGS_ASSERT_BYTES_FROM_UTF8; - PERL_UNUSED_CONTEXT; - if (!*is_utf8) - return (U8 *)start; - - /* ensure valid UTF-8 and chars < 256 before converting string */ - for (send = s + *lenp; s < send;) { - if (! UTF8_IS_INVARIANT(*s)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - return (U8 *)start; - } - count++; - s++; - } - s++; - } + PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; + + if (! *is_utf8p) { + if (first_unconverted) { + *first_unconverted = NULL; + } - *is_utf8 = FALSE; + return (U8 *) original; + } - Newx(d, (*lenp) - count + 1, U8); + Newx(d, (*lenp) + 1, U8); - if (LIKELY(count)) { - s = start; start = d; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; + converted_start = d; + while (s < send) { + U8 c = *s++; + if (! UTF8_IS_INVARIANT(c)) { + + /* Then it is multi-byte encoded. If the code point is above 0xFF, + * have to stop now */ + if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { + if (first_unconverted) { + *first_unconverted = s - 1; + goto finish_and_return; + } + else { + Safefree(converted_start); + return (U8 *) original; + } } - *d++ = c; - } - *d = '\0'; - *lenp = d - start; - return (U8 *)start; + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); + s++; + } + *d++ = c; } - else { - Copy(start, d, *lenp, U8); - *(d + *lenp) = '\0'; - return (U8 *)d; + + /* Here, converted the whole of the input */ + *is_utf8p = FALSE; + if (first_unconverted) { + *first_unconverted = NULL; } + + finish_and_return: + *d = '\0'; + *lenp = d - converted_start; + + /* Trim unused space */ + Renew(converted_start, *lenp + 1, U8); + + return converted_start; } /* @@ -2057,7 +2120,7 @@ Returns a pointer to the newly-created string, and sets C<*lenp> to reflect the new length in bytes. Upon successful return, the number of variants in the string can be computed by -saving the value of C<*lenp> before the call, and subtracting it from the +having saved the value of C<*lenp> before the call, and subtracting it from the after-call value of C<*lenp>. A C<NUL> character will be written after the end of the string. diff --git a/utf8.h b/utf8.h index 41db2f4b93..276fa2953f 100644 --- a/utf8.h +++ b/utf8.h @@ -1049,6 +1049,8 @@ is a valid UTF-8 character. : _is_utf8_char_helper(s, e, 0)) #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) +#define bytes_from_utf8(s, lenp, is_utf8p) \ + bytes_from_utf8_loc(s, lenp, is_utf8p, 0) /* -- Perl5 Master Repository
