In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/65ab9279784aa811d78b2903b57bc0e7947dec78?hp=e57ed4ecd4d7de38a79a316da8d657dad656f93f>

- Log -----------------------------------------------------------------
commit 65ab9279784aa811d78b2903b57bc0e7947dec78
Author: Tony Cook <t...@develop-help.com>
Date:   Tue Mar 16 23:46:48 2010 +1100

    handle perl extended utf8 start bytes
    
    perl uses UTF8_IS_START() to test if a byte is a valid start byte,
    this didn't take perl's extended UTF-8 range into account.
-----------------------------------------------------------------------

Summary of changes:
 t/op/chop.t |   21 ++++++++++++++++++++-
 utf8.h      |    4 +++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/t/op/chop.t b/t/op/chop.t
index 30f7bff..36f8cad 100644
--- a/t/op/chop.t
+++ b/t/op/chop.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 139;
+plan tests => 143;
 
 $_ = 'abc';
 $c = foo();
@@ -243,3 +243,22 @@ foreach my $start (@chars) {
     map chomp(+()), ('')x68;
     ok(1, "extend sp in pp_chomp");
 }
+
+{
+    # [perl #73246] chop doesn't support utf8
+    # the problem was UTF8_IS_START() didn't handle perl's extended UTF8
+    my $utf = "\x{80000001}\x{80000000}";
+    my $result = chop($utf);
+    is($utf, "\x{80000001}", "chopping high 'unicode'- remnant");
+    is($result, "\x{80000000}", "chopping high 'unicode' - result");
+
+    SKIP: {
+        use Config;
+        $Config{ivsize} >= 8
+         or skip("this build can't handle very large characters", 2);
+        my $utf = "\x{ffffffffffffffff}\x{fffffffffffffffe}";
+        my $result = chop $utf;
+        is($utf, "\x{ffffffffffffffff}", "chop even higher 'unicode' - remnant");
+        is($result, "\x{fffffffffffffffe}", "chop even higher 'unicode' - result");
+    }
+}
diff --git a/utf8.h b/utf8.h
index e58dded..b0cfedf 100644
--- a/utf8.h
+++ b/utf8.h
@@ -104,13 +104,15 @@ As you can see, the continuation bytes all begin with C<10>, and the
 leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
+Perl's extended UTF-8 means we can have start bytes up to FF.
+
 */
 
 
 #define UNI_IS_INVARIANT(c)            (((UV)c) <  0x80)
 /* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
  * below might ought to be C2 */
-#define UTF8_IS_START(c)               (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+#define UTF8_IS_START(c)               (((U8)c) >= 0xc0)
 #define UTF8_IS_CONTINUATION(c)                (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c)           (((U8)c) &  0x80)
 #define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)

--
Perl5 Master Repository

Reply via email to