Change 30294 by [EMAIL PROTECTED] on 2007/02/14 17:41:03
Integrate:
[ 29880]
Subject: [PATCH] fix unicode split /\s+/
From: demerphq <[EMAIL PROTECTED]>
Date: Fri, 19 Jan 2007 02:14:06 +0100
Message-ID: <[EMAIL PROTECTED]>
[ 29887]
Subject: Re: [PATCH] fix unicode split /\s+/
From: SADAHIRO Tomoyuki <[EMAIL PROTECTED]>
Message-Id: <[EMAIL PROTECTED]>
Date: Sat, 20 Jan 2007 00:52:42 +0900
[ 29975]
Subject: split by " \0" (const string staring with a SPACE followed by
NULL)
From: SADAHIRO Tomoyuki <[EMAIL PROTECTED]>
Date: Fri, 19 Jan 2007 22:21:48 +0900
Message-Id: <[EMAIL PROTECTED]>
Affected files ...
... //depot/maint-5.8/perl/op.c#200 integrate
... //depot/maint-5.8/perl/pp.c#135 integrate
... //depot/maint-5.8/perl/t/op/split.t#12 integrate
Differences ...
==== //depot/maint-5.8/perl/op.c#200 (text) ====
Index: perl/op.c
--- perl/op.c#199~30292~ 2007-02-14 09:29:10.000000000 -0800
+++ perl/op.c 2007-02-14 09:41:03.000000000 -0800
@@ -2854,7 +2854,7 @@
STRLEN plen;
SV * const pat = ((SVOP*)expr)->op_sv;
const char *p = SvPV_const(pat, plen);
- if ((o->op_flags & OPf_SPECIAL) && (*p == ' ' && p[1] == '\0')) {
+ if ((o->op_flags & OPf_SPECIAL) && (plen == 1 && *p == ' ')) {
U32 was_readonly = SvREADONLY(pat);
if (was_readonly) {
==== //depot/maint-5.8/perl/pp.c#135 (text) ====
Index: perl/pp.c
--- perl/pp.c#134~30196~ 2007-02-10 11:13:38.000000000 -0800
+++ perl/pp.c 2007-02-14 09:41:03.000000000 -0800
@@ -4593,7 +4593,11 @@
base = SP - PL_stack_base;
orig = s;
if (pm->op_pmflags & PMf_SKIPWHITE) {
- if (pm->op_pmflags & PMf_LOCALE) {
+ if (do_utf8) {
+ while (*s == ' ' || is_utf8_space((U8*)s))
+ s += UTF8SKIP(s);
+ }
+ else if (pm->op_pmflags & PMf_LOCALE) {
while (isSPACE_LC(*s))
s++;
}
@@ -4612,10 +4616,23 @@
if (pm->op_pmflags & PMf_WHITE) {
while (--limit) {
m = s;
- while (m < strend &&
- !((pm->op_pmflags & PMf_LOCALE)
- ? isSPACE_LC(*m) : isSPACE(*m)))
- ++m;
+ /* this one uses 'm' and is a negative test */
+ if (do_utf8) {
+ while (m < strend && !( *m == ' ' || is_utf8_space((U8*)m) )) {
+ const int t = UTF8SKIP(m);
+ /* is_utf8_space returns FALSE for malform utf8 */
+ if (strend - m < t)
+ m = strend;
+ else
+ m += t;
+ }
+ } else if (pm->op_pmflags & PMf_LOCALE) {
+ while (m < strend && !isSPACE_LC(*m))
+ ++m;
+ } else {
+ while (m < strend && !isSPACE(*m))
+ ++m;
+ }
if (m >= strend)
break;
@@ -4626,11 +4643,23 @@
(void)SvUTF8_on(dstr);
XPUSHs(dstr);
- s = m + 1;
- while (s < strend &&
- ((pm->op_pmflags & PMf_LOCALE)
- ? isSPACE_LC(*s) : isSPACE(*s)))
- ++s;
+ /* skip the whitespace found last */
+ if (do_utf8)
+ s = m + UTF8SKIP(m);
+ else
+ s = m + 1;
+
+ /* this one uses 's' and is a positive test */
+ if (do_utf8) {
+ while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
+ s += UTF8SKIP(s);
+ } else if (pm->op_pmflags & PMf_LOCALE) {
+ while (s < strend && isSPACE_LC(*s))
+ ++s;
+ } else {
+ while (s < strend && isSPACE(*s))
+ ++s;
+ }
}
}
else if (rx->precomp[0] == '^' && rx->precomp[1] == '\0') {
==== //depot/maint-5.8/perl/t/op/split.t#12 (xtext) ====
Index: perl/t/op/split.t
--- perl/t/op/split.t#11~23833~ 2005-01-20 03:26:12.000000000 -0800
+++ perl/t/op/split.t 2007-02-14 09:41:03.000000000 -0800
@@ -6,7 +6,7 @@
require './test.pl';
}
-plan tests => 55;
+plan tests => 135;
$FS = ':';
@@ -297,4 +297,64 @@
$x = \$a[2];
is (ref $x, 'SCALAR', '#28938 - garbage after extend');
}
+{
+ # check the special casing of split /\s/ and unicode
+ use charnames qw(:full);
+ # below test data is extracted from
+ # PropList-5.0.0.txt
+ # Date: 2006-06-07, 23:22:52 GMT [MD]
+ #
+ # Unicode Character Database
+ # Copyright (c) 1991-2006 Unicode, Inc.
+ # For terms of use, see http://www.unicode.org/terms_of_use.html
+ # For documentation, see UCD.html
+ my @spaces=(
+ ord("\t"), # Cc <control-0009>
+ ord("\n"), # Cc <control-000A>
+ # not PerlSpace # Cc <control-000B>
+ ord("\f"), # Cc <control-000C>
+ ord("\r"), # Cc <control-000D>
+ ord(" "), # Zs SPACE
+ ord("\N{NEL}"), # Cc <control-0085>
+ ord("\N{NO-BREAK SPACE}"),
+ # Zs NO-BREAK SPACE
+ 0x1680, # Zs OGHAM SPACE MARK
+ 0x180E, # Zs MONGOLIAN VOWEL SEPARATOR
+ 0x2000..0x200A, # Zs [11] EN QUAD..HAIR SPACE
+ 0x2028, # Zl LINE SEPARATOR
+ 0x2029, # Zp PARAGRAPH SEPARATOR
+ 0x202F, # Zs NARROW NO-BREAK SPACE
+ 0x205F, # Zs MEDIUM MATHEMATICAL SPACE
+ 0x3000 # Zs IDEOGRAPHIC SPACE
+ );
+ #diag "Have @[EMAIL PROTECTED] to test\n";
+ foreach my $cp (@spaces) {
+ my $msg = sprintf "Space: U+%04x", $cp;
+ my $space = chr($cp);
+ my $str="A:$space:B\x{FFFD}";
+ chop $str;
+
+ my @res=split(/\s+/,$str);
+ ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
+
+ my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
+ chop $s2;
+
+ my @r2 = split(' ',$s2);
+ ok(@r2 == 2 && join('-', @r2) eq ":A:-:B", "$msg - ' '");
+
+ my @r3 = split(/\s+/, $s2);
+ ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
+ }
+}
+{
+ my $src = "ABC \0 FOO \0 XYZ";
+ my @s = split(" \0 ", $src);
+ my @r = split(/ \0 /, $src);
+ is(scalar(@s), 3);
+ is($s[0], "ABC");
+ is($s[1], "FOO");
+ is($s[2]," XYZ");
+ is(join(':',@s), join(':',@r));
+}
End of Patch.