Author: pmichaud
Date: Thu Nov 3 12:26:14 2005
New Revision: 9758
Modified:
trunk/charset/gen_tables.pl
trunk/charset/tables.c
trunk/t/op/string_cclass.t
trunk/t/p6rules/metachars.t
Log:
* Fixed charset/gen_tables.pl,tables.c to treat \u00a0 and \u0085 as whitespace
* Updated t/p6rules/metachars.t with whitespace rules for above
* Added test for unicode find_not_cclass bug to t/op/string_cclass.t
Modified: trunk/charset/gen_tables.pl
==============================================================================
--- trunk/charset/gen_tables.pl (original)
+++ trunk/charset/gen_tables.pl Thu Nov 3 12:26:14 2005
@@ -51,7 +51,7 @@ sub classify {
$ret |= 0x0004 if $chr =~ /^[[:alpha:]]$/; # CCLASS_ALPHABETIC
$ret |= 0x0008 if $chr =~ /^[[:digit:]]$/; # CCLASS_NUMERIC
$ret |= 0x0010 if $chr =~ /^[[:xdigit:]]$/; # CCLASS_HEXADECIMAL
- $ret |= 0x0020 if $chr =~ /^[[:space:]]$/; # CCLASS_WHITESPACE
+ $ret |= 0x0020 if $chr =~ /^[[:space:]\x85\xa0]$/; # CCLASS_WHITESPACE
$ret |= 0x0040 if $chr =~ /^[[:print:]]$/; # CCLASS_PRINTING
$ret |= 0x0080 if $chr =~ /^[[:graph:]]$/; # CCLASS_GRAPHICAL
$ret |= 0x0100 if $chr =~ /^[[:blank:]]$/; # CCLASS_BLANK
Modified: trunk/charset/tables.c
==============================================================================
--- trunk/charset/tables.c (original)
+++ trunk/charset/tables.c Thu Nov 3 12:26:14 2005
@@ -31,11 +31,11 @@ const PARROT_CCLASS_FLAGS Parrot_ascii_t
0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 128-135 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0020, 0x0000, 0x0000, /* 128-135 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 136-143 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 144-151 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 152-159 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
+0x0020, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 168-175 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 176-183 */
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 184-191 */
@@ -65,11 +65,11 @@ const PARROT_CCLASS_FLAGS Parrot_iso_885
0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 128-135 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0220, 0x0200, 0x0200, /* 128-135 */
0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 136-143 */
0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 144-151 */
0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 152-159 */
-0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
+0x04e0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 168-175 */
0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x28c6, 0x04c0, 0x04c0, /* 176-183 */
0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 184-191 */
Modified: trunk/t/op/string_cclass.t
==============================================================================
--- trunk/t/op/string_cclass.t (original)
+++ trunk/t/op/string_cclass.t Thu Nov 3 12:26:14 2005
@@ -18,7 +18,7 @@ Tests find_cclass find_not_cclass, is_cc
use strict;
-use Parrot::Test tests => 10;
+use Parrot::Test tests => 11;
use Parrot::Config;
pir_output_is(<<'CODE', <<'OUT', "find_cclass, ascii");
@@ -396,4 +396,65 @@ CODE
9 3 6
OUT
+pir_output_is(<<'CODE', <<'OUT', "is_cclass, unicode first codepage");
+.include "cclass.pasm"
+.sub main :main
+ $S1 = unicode:"ab\nC_X34.\0 \t!"
+ test1( $S1 )
+.end
+.sub test1
+ .param string str
+ test2( str, .CCLASS_UPPERCASE)
+ test2( str, .CCLASS_LOWERCASE)
+ test2( str, .CCLASS_ALPHABETIC)
+ test2( str, .CCLASS_NUMERIC)
+ test2( str, .CCLASS_HEXADECIMAL)
+ test2( str, .CCLASS_WHITESPACE)
+ test2( str, .CCLASS_PRINTING)
+ test2( str, .CCLASS_GRAPHICAL)
+ test2( str, .CCLASS_BLANK)
+ test2( str, .CCLASS_CONTROL)
+ test2( str, .CCLASS_PUNCTUATION)
+ test2( str, .CCLASS_ALPHANUMERIC)
+ test2( str, .CCLASS_NEWLINE)
+ test2( str, .CCLASS_WORD)
+ $I0 = .CCLASS_NEWLINE|.CCLASS_WHITESPACE
+ test2( str, $I0)
+ $I0 = .CCLASS_WHITESPACE|.CCLASS_LOWERCASE
+ test2( str, $I0)
+ $I0 = .CCLASS_UPPERCASE|.CCLASS_PUNCTUATION
+ test2( str, $I0)
+.end
+.sub test2
+ .param string str
+ .param int code
+
+ $I1 = length str
+ set $I0, 0
+loop:
+ $I2 = is_cclass code, str, $I0
+ print $I2
+ inc $I0
+ if $I0 <= $I1 goto loop
+ print "\n"
+.end
+CODE
+00010100000000
+11000000000000
+11010100000000
+00000011000000
+11010011000000
+00100000001100
+11011111101010
+11011111100010
+00000000001100
+00100000010100
+00001000100010
+11010111000000
+00100000000000
+11011111000000
+00100000001100
+11100000001100
+00011100100010
+OUT
Modified: trunk/t/p6rules/metachars.t
==============================================================================
--- trunk/t/p6rules/metachars.t (original)
+++ trunk/t/p6rules/metachars.t Thu Nov 3 12:26:14 2005
@@ -132,40 +132,45 @@ p6rule_is ("1abc", '\1abc', 'retired me
## setup for unicode whitespace tests
## see http://www.unicode.org/Public/UNIDATA/PropList.txt for White_Space list
my $ws= {
- horizontal_ascii => [qw/ \u0009 \u0020 /],
+ horizontal_iso_8859_1 => [qw/ \u0009 \u0020 \u00a0 /],
horizontal_unicode => [qw/
- \u00a0 \u1680 \u180e \u2000 \u2001 \u2002 \u2003
+ \u1680 \u180e \u2000 \u2001 \u2002 \u2003
\u2004 \u2005 \u2006 \u2007 \u2008 \u2009
\u200a \u202f \u205f \u3000
/],
- vertical_ascii => [qw/ \u000a \u000b \u000c \u000d /],
- vertical_unicode => [qw/ \u0085 \u2028 \u2029 /]
+ vertical_iso_8859_1 => [qw/ \u000a \u000b \u000c \u000d \u0085 /],
+ vertical_unicode => [qw/ \u2028 \u2029 /]
};
push @{ $ws->{horizontal} } =>
- @{ $ws->{horizontal_ascii} }, @{ $ws->{horizontal_unicode} };
+ @{ $ws->{horizontal_iso_8859_1} }, @{ $ws->{horizontal_unicode} };
push @{ $ws->{vertical} } =>
- @{ $ws->{vertical_ascii} }, @{ $ws->{vertical_unicode} };
+ @{ $ws->{vertical_iso_8859_1} }, @{ $ws->{vertical_unicode} };
-push @{ $ws->{whitespace_ascii} } =>
- @{ $ws->{horizontal_ascii} }, @{ $ws->{vertical_ascii} };
+push @{ $ws->{whitespace_iso_8859_1} } =>
+ @{ $ws->{horizontal_iso_8859_1} }, @{ $ws->{vertical_iso_8859_1} };
push @{ $ws->{whitespace_unicode} } =>
@{ $ws->{horizontal_unicode} }, @{ $ws->{vertical_unicode} };
push @{ $ws->{whitespace} } =>
- @{ $ws->{whitespace_ascii} }, @{ $ws->{whitespace_unicode} };
+ @{ $ws->{whitespace_iso_8859_1} }, @{ $ws->{whitespace_unicode} };
## \s -- match unicode whitespace
## \h and \H -- horizontal whitespace, including unicode
## \v and \V -- vertical whitespace, including unicode
-p6rule_is (join('', @{$ws->{whitespace_ascii}}), '^ \s+ $', 'ascii whitespace (\s)');
-p6rule_is (join('', @{$ws->{horizontal_ascii}}), '^ \h+ $', 'ascii horizontal whitespace (\h)');
-p6rule_is (join('', @{$ws->{vertical_ascii}}), '^ \v+ $', 'ascii vertical whitespace (\v)');
-p6rule_isnt(join('', @{$ws->{vertical_ascii}}), '^ \h+ $', 'ascii horizontal whitespace (\h)');
-p6rule_isnt(join('', @{$ws->{horizontal_ascii}}), '^ \v+ $', 'ascii vertical whitespace (\v)');
+p6rule_is (join('', @{$ws->{whitespace_iso_8859_1}}), '^ \s+ $',
+ '0-255 whitespace (\s)');
+p6rule_is (join('', @{$ws->{horizontal_iso_8859_1}}), '^ \h+ $',
+ '0-255 horizontal whitespace (\h)');
+p6rule_is (join('', @{$ws->{vertical_iso_8859_1}}), '^ \v+ $',
+ '0-255 vertical whitespace (\v)');
+p6rule_isnt(join('', @{$ws->{vertical_iso_8859_1}}), '^ \h+ $',
+ '0-255 horizontal whitespace (\h)');
+p6rule_isnt(join('', @{$ws->{horizontal_iso_8859_1}}), '^ \v+ $',
+ '0-255 vertical whitespace (\v)');
SKIP: {
skip 'unicode support unavailable' => 5
unless $PConfig{has_icu};