Change 30294 by [EMAIL PROTECTED] on 2007/02/14 17:41:03

        Integrate:
        [ 29880]
        Subject: [PATCH] fix unicode split /\s+/
        From: demerphq <[EMAIL PROTECTED]>
        Date: Fri, 19 Jan 2007 02:14:06 +0100
        Message-ID: <[EMAIL PROTECTED]>
        
        [ 29887]
        Subject: Re: [PATCH] fix unicode split /\s+/
        From: SADAHIRO Tomoyuki <[EMAIL PROTECTED]>
        Message-Id: <[EMAIL PROTECTED]>
        Date: Sat, 20 Jan 2007 00:52:42 +0900
        
        [ 29975]
        Subject: split by " \0" (const string staring with a SPACE followed by 
NULL)
        From: SADAHIRO Tomoyuki <[EMAIL PROTECTED]>
        Date: Fri, 19 Jan 2007 22:21:48 +0900
        Message-Id: <[EMAIL PROTECTED]>

Affected files ...

... //depot/maint-5.8/perl/op.c#200 integrate
... //depot/maint-5.8/perl/pp.c#135 integrate
... //depot/maint-5.8/perl/t/op/split.t#12 integrate

Differences ...

==== //depot/maint-5.8/perl/op.c#200 (text) ====
Index: perl/op.c
--- perl/op.c#199~30292~        2007-02-14 09:29:10.000000000 -0800
+++ perl/op.c   2007-02-14 09:41:03.000000000 -0800
@@ -2854,7 +2854,7 @@
        STRLEN plen;
        SV * const pat = ((SVOP*)expr)->op_sv;
        const char *p = SvPV_const(pat, plen);
-       if ((o->op_flags & OPf_SPECIAL) && (*p == ' ' && p[1] == '\0')) {
+       if ((o->op_flags & OPf_SPECIAL) && (plen == 1 && *p == ' ')) {
            U32 was_readonly = SvREADONLY(pat);
 
            if (was_readonly) {

==== //depot/maint-5.8/perl/pp.c#135 (text) ====
Index: perl/pp.c
--- perl/pp.c#134~30196~        2007-02-10 11:13:38.000000000 -0800
+++ perl/pp.c   2007-02-14 09:41:03.000000000 -0800
@@ -4593,7 +4593,11 @@
     base = SP - PL_stack_base;
     orig = s;
     if (pm->op_pmflags & PMf_SKIPWHITE) {
-       if (pm->op_pmflags & PMf_LOCALE) {
+       if (do_utf8) {
+           while (*s == ' ' || is_utf8_space((U8*)s))
+               s += UTF8SKIP(s);
+       }
+       else if (pm->op_pmflags & PMf_LOCALE) {
            while (isSPACE_LC(*s))
                s++;
        }
@@ -4612,10 +4616,23 @@
     if (pm->op_pmflags & PMf_WHITE) {
        while (--limit) {
            m = s;
-           while (m < strend &&
-                  !((pm->op_pmflags & PMf_LOCALE)
-                    ? isSPACE_LC(*m) : isSPACE(*m)))
-               ++m;
+           /* this one uses 'm' and is a negative test */
+           if (do_utf8) {
+               while (m < strend && !( *m == ' ' || is_utf8_space((U8*)m) )) {
+                   const int t = UTF8SKIP(m);
+                   /* is_utf8_space returns FALSE for malform utf8 */
+                   if (strend - m < t)
+                       m = strend;
+                   else
+                       m += t;
+               }
+            } else if (pm->op_pmflags & PMf_LOCALE) {
+               while (m < strend && !isSPACE_LC(*m))
+                   ++m;
+            } else {
+                while (m < strend && !isSPACE(*m))
+                    ++m;
+            }  
            if (m >= strend)
                break;
 
@@ -4626,11 +4643,23 @@
                (void)SvUTF8_on(dstr);
            XPUSHs(dstr);
 
-           s = m + 1;
-           while (s < strend &&
-                  ((pm->op_pmflags & PMf_LOCALE)
-                   ? isSPACE_LC(*s) : isSPACE(*s)))
-               ++s;
+           /* skip the whitespace found last */
+           if (do_utf8)
+               s = m + UTF8SKIP(m);
+           else
+               s = m + 1;
+
+           /* this one uses 's' and is a positive test */
+           if (do_utf8) {
+               while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
+                   s +=  UTF8SKIP(s);
+            } else if (pm->op_pmflags & PMf_LOCALE) {
+               while (s < strend && isSPACE_LC(*s))
+                   ++s;
+            } else {
+                while (s < strend && isSPACE(*s))
+                    ++s;
+            }      
        }
     }
     else if (rx->precomp[0] == '^' && rx->precomp[1] == '\0') {

==== //depot/maint-5.8/perl/t/op/split.t#12 (xtext) ====
Index: perl/t/op/split.t
--- perl/t/op/split.t#11~23833~ 2005-01-20 03:26:12.000000000 -0800
+++ perl/t/op/split.t   2007-02-14 09:41:03.000000000 -0800
@@ -6,7 +6,7 @@
     require './test.pl';
 }
 
-plan tests => 55;
+plan tests => 135;
 
 $FS = ':';
 
@@ -297,4 +297,64 @@
     $x = \$a[2];
     is (ref $x, 'SCALAR', '#28938 - garbage after extend');
 }
+{
+    # check the special casing of split /\s/ and unicode
+    use charnames qw(:full);
+    # below test data is extracted from
+    # PropList-5.0.0.txt
+    # Date: 2006-06-07, 23:22:52 GMT [MD]
+    #
+    # Unicode Character Database
+    # Copyright (c) 1991-2006 Unicode, Inc.
+    # For terms of use, see http://www.unicode.org/terms_of_use.html
+    # For documentation, see UCD.html
+    my @spaces=(
+       ord("\t"),      # Cc       <control-0009>
+       ord("\n"),      # Cc       <control-000A>
+       # not PerlSpace # Cc       <control-000B>
+       ord("\f"),      # Cc       <control-000C>
+       ord("\r"),      # Cc       <control-000D>
+       ord(" "),       # Zs       SPACE
+       ord("\N{NEL}"), # Cc       <control-0085>
+       ord("\N{NO-BREAK SPACE}"),
+                       # Zs       NO-BREAK SPACE
+        0x1680,         # Zs       OGHAM SPACE MARK
+        0x180E,         # Zs       MONGOLIAN VOWEL SEPARATOR
+        0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
+        0x2028,         # Zl       LINE SEPARATOR
+        0x2029,         # Zp       PARAGRAPH SEPARATOR
+        0x202F,         # Zs       NARROW NO-BREAK SPACE
+        0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
+        0x3000          # Zs       IDEOGRAPHIC SPACE
+    );
+    #diag "Have @[EMAIL PROTECTED] to test\n";
+    foreach my $cp (@spaces) {
+       my $msg = sprintf "Space: U+%04x", $cp;
+        my $space = chr($cp);
+        my $str="A:$space:B\x{FFFD}";
+        chop $str;
+
+        my @res=split(/\s+/,$str);
+        ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
+
+        my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
+        chop $s2;
+
+        my @r2 = split(' ',$s2);
+        ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
+
+        my @r3 = split(/\s+/, $s2);
+        ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
+    }
+}
 
+{
+    my $src = "ABC \0 FOO \0  XYZ";
+    my @s = split(" \0 ", $src);
+    my @r = split(/ \0 /, $src);
+    is(scalar(@s), 3);
+    is($s[0], "ABC");
+    is($s[1], "FOO");
+    is($s[2]," XYZ");
+    is(join(':',@s), join(':',@r));
+}
End of Patch.

Reply via email to