Change 29880 by [EMAIL PROTECTED] on 2007/01/19 09:33:00
Subject: [PATCH] fix unicode split /\s+/
From: demerphq <[EMAIL PROTECTED]>
Date: Fri, 19 Jan 2007 02:14:06 +0100
Message-ID: <[EMAIL PROTECTED]>
Affected files ...
... //depot/perl/pp.c#575 edit
... //depot/perl/t/op/split.t#36 edit
Differences ...
==== //depot/perl/pp.c#575 (text) ====
Index: perl/pp.c
--- perl/pp.c#574~29730~ 2007-01-09 02:04:37.000000000 -0800
+++ perl/pp.c 2007-01-19 01:33:00.000000000 -0800
@@ -4606,12 +4606,29 @@
if (!limit)
limit = maxiters + 2;
if (pm->op_pmflags & PMf_WHITE) {
+ if (do_utf8 && !PL_utf8_space) {
+ /* force PL_utf8_space to be loaded */
+ bool ok;
+ ENTER;
+ ok = is_utf8_space((const U8*)" ");
+ assert(ok);
+ LEAVE;
+ }
while (--limit) {
m = s;
- while (m < strend &&
- !((pm->op_pmflags & PMf_LOCALE)
- ? isSPACE_LC(*m) : isSPACE(*m)))
- ++m;
+ /* this one uses 'm' and is a negative test */
+ if (do_utf8) {
+ STRLEN uskip;
+ while (m < strend &&
+ !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) ))
+ m += UTF8SKIP(m);
+ } else if (pm->op_pmflags & PMf_LOCALE) {
+ while (m < strend && !isSPACE_LC(*m))
+ ++m;
+ } else {
+ while (m < strend && !isSPACE(*m))
+ ++m;
+ }
if (m >= strend)
break;
@@ -4623,10 +4640,18 @@
XPUSHs(dstr);
s = m + 1;
- while (s < strend &&
- ((pm->op_pmflags & PMf_LOCALE)
- ? isSPACE_LC(*s) : isSPACE(*s)))
- ++s;
+ /* this one uses 's' and is a positive test */
+ if (do_utf8) {
+ while (s < strend &&
+ ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) ))
+ s += UTF8SKIP(s);
+ } else if (pm->op_pmflags & PMf_LOCALE) {
+ while (s < strend && isSPACE_LC(*s))
+ ++s;
+ } else {
+ while (s < strend && isSPACE(*s))
+ ++s;
+ }
}
}
else if (rx->extflags & RXf_START_ONLY) {
==== //depot/perl/t/op/split.t#36 (xtext) ====
Index: perl/t/op/split.t
--- perl/t/op/split.t#35~23779~ 2005-01-10 09:07:03.000000000 -0800
+++ perl/t/op/split.t 2007-01-19 01:33:00.000000000 -0800
@@ -6,7 +6,7 @@
require './test.pl';
}
-plan tests => 55;
+plan tests => 80;
$FS = ':';
@@ -297,4 +297,38 @@
$x = \$a[2];
is (ref $x, 'SCALAR', '#28938 - garbage after extend');
}
-
+{
+ # check the special casing of split /\s/ and unicode
+ use charnames qw(:full);
+ # below test data is extracted from
+ # PropList-5.0.0.txt
+ # Date: 2006-06-07, 23:22:52 GMT [MD]
+ #
+ # Unicode Character Database
+ # Copyright (c) 1991-2006 Unicode, Inc.
+ # For terms of use, see http://www.unicode.org/terms_of_use.html
+ # For documentation, see UCD.html
+ my @spaces=(
+ 0x0009..0x000A, # Cc [5] <control-0009>..<control-000D>
+ 0x000C..0x000D, # EXCLUDING \v aka ctl-000B aka vert-tab
+ 0x0020, # Zs SPACE
+ 0x0085, # Cc <control-0085>
+ 0x00A0, # Zs NO-BREAK SPACE
+ 0x1680, # Zs OGHAM SPACE MARK
+ 0x180E, # Zs MONGOLIAN VOWEL SEPARATOR
+ 0x2000..0x200A, # Zs [11] EN QUAD..HAIR SPACE
+ 0x2028, # Zl LINE SEPARATOR
+ 0x2029, # Zp PARAGRAPH SEPARATOR
+ 0x202F, # Zs NARROW NO-BREAK SPACE
+ 0x205F, # Zs MEDIUM MATHEMATICAL SPACE
+ 0x3000 # Zs IDEOGRAPHIC SPACE
+ );
+ #diag "Have @{[0+@spaces]} to test\n";
+ foreach my $cp (@spaces) {
+ my $space = chr($cp);
+ my $str="A:$space:B\x{FFFF}";
+ chop $str;
+ my @res=split(/\s+/,$str);
+ is(0+@res,2) or do { diag sprintf "Char failed: 0x%x",$cp }
+ }
+}
End of Patch.