In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/272d03f882615257eeac4d2796d0b94e8c4ad868?hp=a7ea90b1451006596c4574b1e65894f0bda1bafc>

- Log -----------------------------------------------------------------
commit 272d03f882615257eeac4d2796d0b94e8c4ad868
Author: Tony Cook <[email protected]>
Date:   Wed Nov 9 14:16:21 2016 +1100

    skip some tests that aren't UTF-EBCIDIC compatible on non-ASCII

M       t/op/lex.t

commit cbe6b21e6e54a6011fdc65434771479776a3818d
Author: Tony Cook <[email protected]>
Date:   Wed Nov 9 14:09:23 2016 +1100

    (perl #129000) use the new utf8_hop_back()
    
    when reporting unrecognized characters in UTF mode.

M       t/op/lex.t
M       toke.c

commit 65df57a84b55413fcde1e64b86e3d740485536d3
Author: Tony Cook <[email protected]>
Date:   Mon Oct 31 14:28:34 2016 +1100

    (perl #129000) create a safer utf8_hop()
    
    Unlike utf8_hop(), utf8_hop_safe() won't navigate before the
    beginning or after the end of the supplied buffer.
    
    The original version of this put all of the logic into
    utf8_hop_safe(), but in many cases a caller specifically
    needs to go forward or backward, and supplying the other limit
    made the function less usable, so I split the function
    into forward and backward cases.
    
    This split may also make inlining these functions more efficient
    or more likely.

M       embed.fnc
M       embed.h
M       ext/XS-APItest/APItest.xs
M       ext/XS-APItest/t/utf8.t
M       inline.h
M       proto.h
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                 |   3 ++
 embed.h                   |   3 ++
 ext/XS-APItest/APItest.xs |  13 ++++++
 ext/XS-APItest/t/utf8.t   |  46 +++++++++++++++++++
 inline.h                  | 111 ++++++++++++++++++++++++++++++++++++++++++++++
 proto.h                   |  18 ++++++++
 t/op/lex.t                |  39 ++++++++++------
 toke.c                    |   2 +-
 8 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index a83372f..9d40940 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1734,6 +1734,9 @@ Ap        |U8*    |utf16_to_utf8_reversed|NN U8* p|NN U8 
*d|I32 bytelen|NN I32 *newlen
 AdpPR  |STRLEN |utf8_length    |NN const U8* s|NN const U8 *e
 AipdPR |IV     |utf8_distance  |NN const U8 *a|NN const U8 *b
 AipdPRn        |U8*    |utf8_hop       |NN const U8 *s|SSize_t off
+AipdPRn        |U8*    |utf8_hop_back|NN const U8 *s|SSize_t off|NN const U8 
*start
+AipdPRn        |U8*    |utf8_hop_forward|NN const U8 *s|SSize_t off|NN const 
U8 *end
+AipdPRn        |U8*    |utf8_hop_safe  |NN const U8 *s|SSize_t off|NN const U8 
*start|NN const U8 *end
 ApMd   |U8*    |utf8_to_bytes  |NN U8 *s|NN STRLEN *len
 Apd    |int    |bytes_cmp_utf8 |NN const U8 *b|STRLEN blen|NN const U8 *u \
                                |STRLEN ulen
diff --git a/embed.h b/embed.h
index b8ee773..d54ed6c 100644
--- a/embed.h
+++ b/embed.h
@@ -733,6 +733,9 @@
 #define utf16_to_utf8_reversed(a,b,c,d)        
Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
 #define utf8_distance(a,b)     Perl_utf8_distance(aTHX_ a,b)
 #define utf8_hop               Perl_utf8_hop
+#define utf8_hop_back          Perl_utf8_hop_back
+#define utf8_hop_forward       Perl_utf8_hop_forward
+#define utf8_hop_safe          Perl_utf8_hop_safe
 #define utf8_length(a,b)       Perl_utf8_length(aTHX_ a,b)
 #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
 #define utf8_to_uvchr(a,b)     Perl_utf8_to_uvchr(aTHX_ a,b)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index bb7d865..bb22e6c 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -5562,6 +5562,19 @@ test_is_utf8_fixed_width_buf_loclen_flags(char *s, 
STRLEN len, U32 flags)
     OUTPUT:
         RETVAL
 
+IV
+test_utf8_hop_safe(SV *s_sv, STRLEN s_off, IV off)
+    PREINIT:
+        STRLEN len;
+        U8 *p;
+        U8 *r;
+    CODE:
+        p = (U8 *)SvPV(s_sv, len);
+        r = utf8_hop_safe(p + s_off, off, p, p + len);
+        RETVAL = r - p;
+    OUTPUT:
+        RETVAL
+
 UV
 test_toLOWER(UV ord)
     CODE:
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 121c6ef..e366254 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -2401,4 +2401,50 @@ foreach my $test (@tests) {
     }
 }
 
+SKIP:
+{
+    isASCII
+      or skip "These tests probably break on non-ASCII", 1;
+    my $simple = join "", "A" .. "J";
+    my $utf_ch = "\x{7fffffff}";
+    utf8::encode($utf_ch);
+    my $utf_ch_len = length $utf_ch;
+    note "utf_ch_len $utf_ch_len";
+    my $utf = $utf_ch x 10;
+    my $bad_start = substr($utf, 1);
+    # $bad_end ends with a start byte and a single continuation
+    my $bad_end = substr($utf, 0, length($utf)-$utf_ch_len+2);
+
+    # WARNING: all offsets are *byte* offsets
+    my @hop_tests =
+      (
+       # string      s                off        expected         name
+       [ $simple,    0,               5,         5,               "simple in 
range, forward" ],
+       [ $simple,    10,              -5,        5,               "simple in 
range, backward" ],
+       [ $simple,    5,               10,        10,              "simple out 
of range, forward" ],
+       [ $simple,    5,               -10,       0,               "simple out 
of range, backward" ],
+       [ $utf,       $utf_ch_len * 5, 5,         length($utf),    "utf in 
range, forward" ],
+       [ $utf,       $utf_ch_len * 5, -5,        0,               "utf in 
range, backward" ],
+       [ $utf,       $utf_ch_len * 5, 4,         $utf_ch_len * 9, "utf in 
range b, forward" ],
+       [ $utf,       $utf_ch_len * 5, -4,        $utf_ch_len,     "utf in 
range b, backward" ],
+       [ $utf,       $utf_ch_len * 5, 6,         length($utf),    "utf out of 
range, forward" ],
+       [ $utf,       $utf_ch_len * 5, -6,        0,               "utf out of 
range, backward"  ],
+       [ $bad_start, 0,               1,         1,               "bad start, 
forward 1 from 0" ],
+       [ $bad_start, 0,               $utf_ch_len-1, $utf_ch_len-1, "bad 
start, forward ch_len-1 from 0" ],
+       [ $bad_start, 0,               $utf_ch_len, $utf_ch_len*2-1, "bad 
start, forward ch_len from 0" ],
+       [ $bad_start, $utf_ch_len-1,   -1,        0,                "bad start, 
back 1 from first start byte" ],
+       [ $bad_start, $utf_ch_len-2,   -1,        0,                "bad start, 
back 1 from before first start byte" ],
+       [ $bad_start, 0,               -1,        0,                "bad start, 
back 1 from 0" ],
+       [ $bad_start, length $bad_start, -10,     0,                "bad start, 
back 10 from end" ],
+       [ $bad_end,   0,               10,        length $bad_end, "bad end, 
forward 10 from 0" ],
+       [ $bad_end,   length($bad_end)-1, 10,     length $bad_end, "bad end, 
forward 1 from end-1" ],
+       );
+
+    for my $test (@hop_tests) {
+        my ($str, $s_off, $off, $want, $name) = @$test;
+        my $result = test_utf8_hop_safe($str, $s_off, $off);
+        is($result, $want, "utf8_hop_safe: $name");
+    }
+}
+
 done_testing;
diff --git a/inline.h b/inline.h
index 66ba348..adcd85d 100644
--- a/inline.h
+++ b/inline.h
@@ -920,6 +920,117 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
 }
 
 /*
+=for apidoc utf8_hop_forward
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+forward.
+
+C<off> must be non-negative.
+
+C<s> must be before or equal to C<end>.
+
+When moving forward it will not move beyond C<end>.
+
+Will not exceed this limit even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(s <= end);
+    assert(off >= 0);
+
+    while (off--) {
+        STRLEN skip = UTF8SKIP(s);
+        if ((STRLEN)(end - s) <= skip)
+            return (U8 *)end;
+        s += skip;
+    }
+
+    return (U8 *)s;
+}
+
+/*
+=for apidoc utf8_hop_back
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+backward.
+
+C<off> must be non-positive.
+
+C<s> must be after or equal to C<start>.
+
+When moving backward it will not move before C<start>.
+
+Will not exceed this limit even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_BACK;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(start <= s);
+    assert(off <= 0);
+
+    while (off++ && s > start) {
+        s--;
+        while (UTF8_IS_CONTINUATION(*s) && s > start)
+            s--;
+    }
+    
+    return (U8 *)s;
+}
+
+/*
+=for apidoc utf8_hop_safe
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+either forward or backward.
+
+When moving backward it will not move before C<start>.
+
+When moving forward it will not move beyond C<end>.
+
+Will not exceed those limits even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(start <= s && s <= end);
+
+    if (off >= 0) {
+        return utf8_hop_forward(s, off, end);
+    }
+    else {
+        return utf8_hop_back(s, off, start);
+    }
+}
+
+/*
 
 =for apidoc is_utf8_valid_partial_char
 
diff --git a/proto.h b/proto.h
index 2e6dbf2..0b10c0a 100644
--- a/proto.h
+++ b/proto.h
@@ -3512,6 +3512,24 @@ PERL_STATIC_INLINE U8*   Perl_utf8_hop(const U8 *s, 
SSize_t off)
 #define PERL_ARGS_ASSERT_UTF8_HOP      \
        assert(s)
 
+PERL_STATIC_INLINE U8* Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 
*start)
+                       __attribute__warn_unused_result__
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_UTF8_HOP_BACK \
+       assert(s); assert(start)
+
+PERL_STATIC_INLINE U8* Perl_utf8_hop_forward(const U8 *s, SSize_t off, const 
U8 *end)
+                       __attribute__warn_unused_result__
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_UTF8_HOP_FORWARD      \
+       assert(s); assert(end)
+
+PERL_STATIC_INLINE U8* Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 
*start, const U8 *end)
+                       __attribute__warn_unused_result__
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_UTF8_HOP_SAFE \
+       assert(s); assert(start); assert(end)
+
 PERL_CALLCONV STRLEN   Perl_utf8_length(pTHX_ const U8* s, const U8 *e)
                        __attribute__warn_unused_result__
                        __attribute__pure__;
diff --git a/t/op/lex.t b/t/op/lex.t
index f3cb510..df96ed7 100644
--- a/t/op/lex.t
+++ b/t/op/lex.t
@@ -7,7 +7,7 @@ use warnings;
 
 BEGIN { chdir 't' if -d 't'; require './test.pl'; }
 
-plan(tests => 33);
+plan(tests => 34);
 
 {
     no warnings 'deprecated';
@@ -248,16 +248,27 @@ fresh_perl_like(
     {},
     '[perl #129336] - #!perl -i argument handling'
 );
-fresh_perl_is(
-    "BEGIN{\$^H=hex ~0}\xF3",
-    "Integer overflow in hexadecimal number at - line 1.\n" .
-    "Malformed UTF-8 character: \\xf3 (too short; got 1 byte, need 4) at - 
line 1.",
-    {},
-    '[perl #128996] - use of PL_op after op is freed'
-);
-fresh_perl_like(
-    qq(BEGIN{\$0="";\$^H=-hex join""=>1}""\xFF),
-    qr/Malformed UTF-8 character: \\xff \(too short; got 1 byte, need 13\) at 
- line 1\./,
-    {},
-    '[perl #128997] - buffer read overflow'
-);
+SKIP:
+{
+    ord("A") == 65
+      or skip "These tests won't work on EBCIDIC", 3;
+    fresh_perl_is(
+        "BEGIN{\$^H=hex ~0}\xF3",
+        "Integer overflow in hexadecimal number at - line 1.\n" .
+        "Malformed UTF-8 character: \\xf3 (too short; got 1 byte, need 4) at - 
line 1.",
+        {},
+        '[perl #128996] - use of PL_op after op is freed'
+    );
+    fresh_perl_like(
+        qq(BEGIN{\$0="";\$^H=-hex join""=>1}""\xFF),
+        qr/Malformed UTF-8 character: \\xff \(too short; got 1 byte, need 13\) 
at - line 1\./,
+        {},
+        '[perl #128997] - buffer read overflow'
+    );
+    fresh_perl_like(
+        qq(BEGIN{\$^H=0x800000}\n   0m 0\xB5\xB500\xB5\0),
+        qr/Unrecognized character \\x\{0\}; marked by <-- HERE after    
0m.*<-- HERE near column 12 at - line 2./,
+        {},
+        '[perl #129000] read before buffer'
+    );
+}
diff --git a/toke.c b/toke.c
index 2495bc2..ac7b5f3 100644
--- a/toke.c
+++ b/toke.c
@@ -4915,7 +4915,7 @@ Perl_yylex(pTHX)
         }
         len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : 
(STRLEN) (s - PL_linestart);
         if (len > UNRECOGNIZED_PRECEDE_COUNT) {
-            d = UTF ? (char *) utf8_hop((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT) 
: s - UNRECOGNIZED_PRECEDE_COUNT;
+            d = UTF ? (char *) utf8_hop_back((U8 *) s, 
-UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - 
UNRECOGNIZED_PRECEDE_COUNT;
         } else {
             d = PL_linestart;
         }

--
Perl5 Master Repository

Reply via email to