In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/272d03f882615257eeac4d2796d0b94e8c4ad868?hp=a7ea90b1451006596c4574b1e65894f0bda1bafc>
- Log ----------------------------------------------------------------- commit 272d03f882615257eeac4d2796d0b94e8c4ad868 Author: Tony Cook <[email protected]> Date: Wed Nov 9 14:16:21 2016 +1100 skip some tests that aren't UTF-EBCIDIC compatible on non-ASCII M t/op/lex.t commit cbe6b21e6e54a6011fdc65434771479776a3818d Author: Tony Cook <[email protected]> Date: Wed Nov 9 14:09:23 2016 +1100 (perl #129000) use the new utf8_hop_back() when reporting unrecognized characters in UTF mode. M t/op/lex.t M toke.c commit 65df57a84b55413fcde1e64b86e3d740485536d3 Author: Tony Cook <[email protected]> Date: Mon Oct 31 14:28:34 2016 +1100 (perl #129000) create a safer utf8_hop() Unlike utf8_hop(), utf8_hop_safe() won't navigate before the beginning or after the end of the supplied buffer. The original version of this put all of the logic into utf8_hop_safe(), but in many cases a caller specifically needs to go forward or backward, and supplying the other limit made the function less usable, so I split the function into forward and backward cases. This split may also make inlining these functions more efficient or more likely. M embed.fnc M embed.h M ext/XS-APItest/APItest.xs M ext/XS-APItest/t/utf8.t M inline.h M proto.h ----------------------------------------------------------------------- Summary of changes: embed.fnc | 3 ++ embed.h | 3 ++ ext/XS-APItest/APItest.xs | 13 ++++++ ext/XS-APItest/t/utf8.t | 46 +++++++++++++++++++ inline.h | 111 ++++++++++++++++++++++++++++++++++++++++++++++ proto.h | 18 ++++++++ t/op/lex.t | 39 ++++++++++------ toke.c | 2 +- 8 files changed, 220 insertions(+), 15 deletions(-) diff --git a/embed.fnc b/embed.fnc index a83372f..9d40940 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1734,6 +1734,9 @@ Ap |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|I32 bytelen|NN I32 *newlen AdpPR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e AipdPR |IV |utf8_distance |NN const U8 *a|NN const U8 *b AipdPRn |U8* |utf8_hop |NN const U8 *s|SSize_t off +AipdPRn |U8* |utf8_hop_back|NN const U8 *s|SSize_t off|NN const U8 *start +AipdPRn |U8* |utf8_hop_forward|NN const U8 *s|SSize_t off|NN const U8 *end +AipdPRn |U8* |utf8_hop_safe |NN const U8 *s|SSize_t off|NN const U8 *start|NN const U8 *end ApMd |U8* |utf8_to_bytes |NN U8 *s|NN STRLEN *len Apd |int |bytes_cmp_utf8 |NN const U8 *b|STRLEN blen|NN const U8 *u \ |STRLEN ulen diff --git a/embed.h b/embed.h index b8ee773..d54ed6c 100644 --- a/embed.h +++ b/embed.h @@ -733,6 +733,9 @@ #define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d) #define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b) #define utf8_hop Perl_utf8_hop +#define utf8_hop_back Perl_utf8_hop_back +#define utf8_hop_forward Perl_utf8_hop_forward +#define utf8_hop_safe Perl_utf8_hop_safe #define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) #define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index bb7d865..bb22e6c 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -5562,6 +5562,19 @@ test_is_utf8_fixed_width_buf_loclen_flags(char *s, STRLEN len, U32 flags) OUTPUT: RETVAL +IV +test_utf8_hop_safe(SV *s_sv, STRLEN s_off, IV off) + PREINIT: + STRLEN len; + U8 *p; + U8 *r; + CODE: + p = (U8 *)SvPV(s_sv, len); + r = utf8_hop_safe(p + s_off, off, p, p + len); + RETVAL = r - p; + OUTPUT: + RETVAL + UV test_toLOWER(UV ord) CODE: diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 121c6ef..e366254 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -2401,4 +2401,50 @@ foreach my $test (@tests) { } } +SKIP: +{ + isASCII + or skip "These tests probably break on non-ASCII", 1; + my $simple = join "", "A" .. "J"; + my $utf_ch = "\x{7fffffff}"; + utf8::encode($utf_ch); + my $utf_ch_len = length $utf_ch; + note "utf_ch_len $utf_ch_len"; + my $utf = $utf_ch x 10; + my $bad_start = substr($utf, 1); + # $bad_end ends with a start byte and a single continuation + my $bad_end = substr($utf, 0, length($utf)-$utf_ch_len+2); + + # WARNING: all offsets are *byte* offsets + my @hop_tests = + ( + # string s off expected name + [ $simple, 0, 5, 5, "simple in range, forward" ], + [ $simple, 10, -5, 5, "simple in range, backward" ], + [ $simple, 5, 10, 10, "simple out of range, forward" ], + [ $simple, 5, -10, 0, "simple out of range, backward" ], + [ $utf, $utf_ch_len * 5, 5, length($utf), "utf in range, forward" ], + [ $utf, $utf_ch_len * 5, -5, 0, "utf in range, backward" ], + [ $utf, $utf_ch_len * 5, 4, $utf_ch_len * 9, "utf in range b, forward" ], + [ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ], + [ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ], + [ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ], + [ $bad_start, 0, 1, 1, "bad start, forward 1 from 0" ], + [ $bad_start, 0, $utf_ch_len-1, $utf_ch_len-1, "bad start, forward ch_len-1 from 0" ], + [ $bad_start, 0, $utf_ch_len, $utf_ch_len*2-1, "bad start, forward ch_len from 0" ], + [ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ], + [ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ], + [ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ], + [ $bad_start, length $bad_start, -10, 0, "bad start, back 10 from end" ], + [ $bad_end, 0, 10, length $bad_end, "bad end, forward 10 from 0" ], + [ $bad_end, length($bad_end)-1, 10, length $bad_end, "bad end, forward 1 from end-1" ], + ); + + for my $test (@hop_tests) { + my ($str, $s_off, $off, $want, $name) = @$test; + my $result = test_utf8_hop_safe($str, $s_off, $off); + is($result, $want, "utf8_hop_safe: $name"); + } +} + done_testing; diff --git a/inline.h b/inline.h index 66ba348..adcd85d 100644 --- a/inline.h +++ b/inline.h @@ -920,6 +920,117 @@ Perl_utf8_hop(const U8 *s, SSize_t off) } /* +=for apidoc utf8_hop_forward + +Return the UTF-8 pointer C<s> displaced by up to C<off> characters, +forward. + +C<off> must be non-negative. + +C<s> must be before or equal to C<end>. + +When moving forward it will not move beyond C<end>. + +Will not exceed this limit even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) +{ + PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(s <= end); + assert(off >= 0); + + while (off--) { + STRLEN skip = UTF8SKIP(s); + if ((STRLEN)(end - s) <= skip) + return (U8 *)end; + s += skip; + } + + return (U8 *)s; +} + +/* +=for apidoc utf8_hop_back + +Return the UTF-8 pointer C<s> displaced by up to C<off> characters, +backward. + +C<off> must be non-positive. + +C<s> must be after or equal to C<start>. + +When moving backward it will not move before C<start>. + +Will not exceed this limit even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) +{ + PERL_ARGS_ASSERT_UTF8_HOP_BACK; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(start <= s); + assert(off <= 0); + + while (off++ && s > start) { + s--; + while (UTF8_IS_CONTINUATION(*s) && s > start) + s--; + } + + return (U8 *)s; +} + +/* +=for apidoc utf8_hop_safe + +Return the UTF-8 pointer C<s> displaced by up to C<off> characters, +either forward or backward. + +When moving backward it will not move before C<start>. + +When moving forward it will not move beyond C<end>. + +Will not exceed those limits even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) +{ + PERL_ARGS_ASSERT_UTF8_HOP_SAFE; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(start <= s && s <= end); + + if (off >= 0) { + return utf8_hop_forward(s, off, end); + } + else { + return utf8_hop_back(s, off, start); + } +} + +/* =for apidoc is_utf8_valid_partial_char diff --git a/proto.h b/proto.h index 2e6dbf2..0b10c0a 100644 --- a/proto.h +++ b/proto.h @@ -3512,6 +3512,24 @@ PERL_STATIC_INLINE U8* Perl_utf8_hop(const U8 *s, SSize_t off) #define PERL_ARGS_ASSERT_UTF8_HOP \ assert(s) +PERL_STATIC_INLINE U8* Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) + __attribute__warn_unused_result__ + __attribute__pure__; +#define PERL_ARGS_ASSERT_UTF8_HOP_BACK \ + assert(s); assert(start) + +PERL_STATIC_INLINE U8* Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) + __attribute__warn_unused_result__ + __attribute__pure__; +#define PERL_ARGS_ASSERT_UTF8_HOP_FORWARD \ + assert(s); assert(end) + +PERL_STATIC_INLINE U8* Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) + __attribute__warn_unused_result__ + __attribute__pure__; +#define PERL_ARGS_ASSERT_UTF8_HOP_SAFE \ + assert(s); assert(start); assert(end) + PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ const U8* s, const U8 *e) __attribute__warn_unused_result__ __attribute__pure__; diff --git a/t/op/lex.t b/t/op/lex.t index f3cb510..df96ed7 100644 --- a/t/op/lex.t +++ b/t/op/lex.t @@ -7,7 +7,7 @@ use warnings; BEGIN { chdir 't' if -d 't'; require './test.pl'; } -plan(tests => 33); +plan(tests => 34); { no warnings 'deprecated'; @@ -248,16 +248,27 @@ fresh_perl_like( {}, '[perl #129336] - #!perl -i argument handling' ); -fresh_perl_is( - "BEGIN{\$^H=hex ~0}\xF3", - "Integer overflow in hexadecimal number at - line 1.\n" . - "Malformed UTF-8 character: \\xf3 (too short; got 1 byte, need 4) at - line 1.", - {}, - '[perl #128996] - use of PL_op after op is freed' -); -fresh_perl_like( - qq(BEGIN{\$0="";\$^H=-hex join""=>1}""\xFF), - qr/Malformed UTF-8 character: \\xff \(too short; got 1 byte, need 13\) at - line 1\./, - {}, - '[perl #128997] - buffer read overflow' -); +SKIP: +{ + ord("A") == 65 + or skip "These tests won't work on EBCIDIC", 3; + fresh_perl_is( + "BEGIN{\$^H=hex ~0}\xF3", + "Integer overflow in hexadecimal number at - line 1.\n" . + "Malformed UTF-8 character: \\xf3 (too short; got 1 byte, need 4) at - line 1.", + {}, + '[perl #128996] - use of PL_op after op is freed' + ); + fresh_perl_like( + qq(BEGIN{\$0="";\$^H=-hex join""=>1}""\xFF), + qr/Malformed UTF-8 character: \\xff \(too short; got 1 byte, need 13\) at - line 1\./, + {}, + '[perl #128997] - buffer read overflow' + ); + fresh_perl_like( + qq(BEGIN{\$^H=0x800000}\n 0m 0\xB5\xB500\xB5\0), + qr/Unrecognized character \\x\{0\}; marked by <-- HERE after 0m.*<-- HERE near column 12 at - line 2./, + {}, + '[perl #129000] read before buffer' + ); +} diff --git a/toke.c b/toke.c index 2495bc2..ac7b5f3 100644 --- a/toke.c +++ b/toke.c @@ -4915,7 +4915,7 @@ Perl_yylex(pTHX) } len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart); if (len > UNRECOGNIZED_PRECEDE_COUNT) { - d = UTF ? (char *) utf8_hop((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT) : s - UNRECOGNIZED_PRECEDE_COUNT; + d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - UNRECOGNIZED_PRECEDE_COUNT; } else { d = PL_linestart; } -- Perl5 Master Repository
