Author: pmichaud
Date: Thu Nov  3 12:26:14 2005
New Revision: 9758

Modified:
   trunk/charset/gen_tables.pl
   trunk/charset/tables.c
   trunk/t/op/string_cclass.t
   trunk/t/p6rules/metachars.t
Log:
* Fixed charset/gen_tables.pl,tables.c to treat \u00a0 and \u0085 as whitespace
* Updated t/p6rules/metachars.t with whitespace rules for above
* Added test for unicode find_not_cclass bug to t/op/string_cclass.t


Modified: trunk/charset/gen_tables.pl
==============================================================================
--- trunk/charset/gen_tables.pl (original)
+++ trunk/charset/gen_tables.pl Thu Nov  3 12:26:14 2005
@@ -51,7 +51,7 @@ sub classify {
     $ret |= 0x0004 if $chr =~ /^[[:alpha:]]$/;  # CCLASS_ALPHABETIC
     $ret |= 0x0008 if $chr =~ /^[[:digit:]]$/;  # CCLASS_NUMERIC        
     $ret |= 0x0010 if $chr =~ /^[[:xdigit:]]$/; # CCLASS_HEXADECIMAL    
-    $ret |= 0x0020 if $chr =~ /^[[:space:]]$/;  # CCLASS_WHITESPACE     
+    $ret |= 0x0020 if $chr =~ /^[[:space:]\x85\xa0]$/;  # CCLASS_WHITESPACE    
 
     $ret |= 0x0040 if $chr =~ /^[[:print:]]$/;  # CCLASS_PRINTING       
     $ret |= 0x0080 if $chr =~ /^[[:graph:]]$/;  # CCLASS_GRAPHICAL      
     $ret |= 0x0100 if $chr =~ /^[[:blank:]]$/;  # CCLASS_BLANK  

Modified: trunk/charset/tables.c
==============================================================================
--- trunk/charset/tables.c      (original)
+++ trunk/charset/tables.c      Thu Nov  3 12:26:14 2005
@@ -31,11 +31,11 @@ const PARROT_CCLASS_FLAGS Parrot_ascii_t
 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
 0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 128-135 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0020, 0x0000, 0x0000, /* 128-135 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 136-143 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 144-151 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 152-159 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
+0x0020, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 168-175 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 176-183 */
 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 184-191 */
@@ -65,11 +65,11 @@ const PARROT_CCLASS_FLAGS Parrot_iso_885
 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
 0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 128-135 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0220, 0x0200, 0x0200, /* 128-135 */
 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 136-143 */
 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 144-151 */
 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 152-159 */
-0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
+0x04e0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
 0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 168-175 */
 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x28c6, 0x04c0, 0x04c0, /* 176-183 */
 0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 184-191 */

Modified: trunk/t/op/string_cclass.t
==============================================================================
--- trunk/t/op/string_cclass.t  (original)
+++ trunk/t/op/string_cclass.t  Thu Nov  3 12:26:14 2005
@@ -18,7 +18,7 @@ Tests find_cclass find_not_cclass, is_cc
 
 use strict;
 
-use Parrot::Test tests => 10;
+use Parrot::Test tests => 11;
 use Parrot::Config;
 
 pir_output_is(<<'CODE', <<'OUT', "find_cclass, ascii");
@@ -396,4 +396,65 @@ CODE
 9 3 6
 OUT
 
+pir_output_is(<<'CODE', <<'OUT', "is_cclass, unicode first codepage");
+.include "cclass.pasm"
+.sub main :main
+    $S1 = unicode:"ab\nC_X34.\0 \t!"
+    test1( $S1 )
+.end
+.sub test1
+    .param string str
+    test2( str, .CCLASS_UPPERCASE)
+    test2( str, .CCLASS_LOWERCASE)
+    test2( str, .CCLASS_ALPHABETIC)
+    test2( str, .CCLASS_NUMERIC)
+    test2( str, .CCLASS_HEXADECIMAL)
+    test2( str, .CCLASS_WHITESPACE)
+    test2( str, .CCLASS_PRINTING)
+    test2( str, .CCLASS_GRAPHICAL)
+    test2( str, .CCLASS_BLANK)
+    test2( str, .CCLASS_CONTROL)
+    test2( str, .CCLASS_PUNCTUATION)
+    test2( str, .CCLASS_ALPHANUMERIC)
+    test2( str, .CCLASS_NEWLINE)
+    test2( str, .CCLASS_WORD)
 
+    $I0 = .CCLASS_NEWLINE|.CCLASS_WHITESPACE
+    test2( str, $I0)
+    $I0 = .CCLASS_WHITESPACE|.CCLASS_LOWERCASE
+    test2( str, $I0)
+    $I0 = .CCLASS_UPPERCASE|.CCLASS_PUNCTUATION
+    test2( str, $I0)
+.end
+.sub test2
+    .param string str
+    .param int code
+
+    $I1 = length str
+    set $I0, 0
+loop:
+    $I2 = is_cclass code, str, $I0
+    print $I2
+    inc $I0
+    if $I0 <= $I1 goto loop
+    print "\n"
+.end
+CODE
+00010100000000
+11000000000000
+11010100000000
+00000011000000
+11010011000000
+00100000001100
+11011111101010
+11011111100010
+00000000001100
+00100000010100
+00001000100010
+11010111000000
+00100000000000
+11011111000000
+00100000001100
+11100000001100
+00011100100010
+OUT

Modified: trunk/t/p6rules/metachars.t
==============================================================================
--- trunk/t/p6rules/metachars.t (original)
+++ trunk/t/p6rules/metachars.t Thu Nov  3 12:26:14 2005
@@ -132,40 +132,45 @@ p6rule_is  ("1abc", '\1abc', 'retired me
 ## setup for unicode whitespace tests
 ## see http://www.unicode.org/Public/UNIDATA/PropList.txt for White_Space list
 my $ws= {
-       horizontal_ascii => [qw/ \u0009 \u0020 /],
+       horizontal_iso_8859_1 => [qw/ \u0009 \u0020 \u00a0 /],
        horizontal_unicode => [qw/
-           \u00a0 \u1680 \u180e \u2000 \u2001 \u2002 \u2003 
+           \u1680 \u180e \u2000 \u2001 \u2002 \u2003 
            \u2004 \u2005 \u2006 \u2007 \u2008 \u2009 
            \u200a \u202f \u205f \u3000
        /],
-       vertical_ascii => [qw/ \u000a \u000b \u000c \u000d /],
-       vertical_unicode => [qw/ \u0085 \u2028 \u2029 /] 
+       vertical_iso_8859_1 => [qw/ \u000a \u000b \u000c \u000d \u0085 /],
+       vertical_unicode => [qw/ \u2028 \u2029 /] 
 };
 
 push @{ $ws->{horizontal} } =>
-       @{ $ws->{horizontal_ascii} }, @{ $ws->{horizontal_unicode} };
+       @{ $ws->{horizontal_iso_8859_1} }, @{ $ws->{horizontal_unicode} };
 
 push @{ $ws->{vertical} } =>
-       @{ $ws->{vertical_ascii} }, @{ $ws->{vertical_unicode} };
+       @{ $ws->{vertical_iso_8859_1} }, @{ $ws->{vertical_unicode} };
 
-push @{ $ws->{whitespace_ascii} } =>
-       @{ $ws->{horizontal_ascii} }, @{ $ws->{vertical_ascii} };
+push @{ $ws->{whitespace_iso_8859_1} } =>
+       @{ $ws->{horizontal_iso_8859_1} }, @{ $ws->{vertical_iso_8859_1} };
 
 push @{ $ws->{whitespace_unicode} } =>
        @{ $ws->{horizontal_unicode} }, @{ $ws->{vertical_unicode} };
 
 push @{ $ws->{whitespace} } =>
-       @{ $ws->{whitespace_ascii} }, @{ $ws->{whitespace_unicode} };
+       @{ $ws->{whitespace_iso_8859_1} }, @{ $ws->{whitespace_unicode} };
 
 
 ## \s -- match unicode whitespace
 ## \h and \H -- horizontal whitespace, including unicode
 ## \v and \V -- vertical whitespace, including unicode
-p6rule_is  (join('', @{$ws->{whitespace_ascii}}), '^ \s+ $', 'ascii whitespace (\s)');
-p6rule_is  (join('', @{$ws->{horizontal_ascii}}), '^ \h+ $', 'ascii horizontal whitespace (\h)');
-p6rule_is  (join('', @{$ws->{vertical_ascii}}), '^ \v+ $', 'ascii vertical whitespace (\v)');
-p6rule_isnt(join('', @{$ws->{vertical_ascii}}), '^ \h+ $', 'ascii horizontal whitespace (\h)');
-p6rule_isnt(join('', @{$ws->{horizontal_ascii}}), '^ \v+ $', 'ascii vertical whitespace (\v)');
+p6rule_is  (join('', @{$ws->{whitespace_iso_8859_1}}), '^ \s+ $', 
+    '0-255 whitespace (\s)');
+p6rule_is  (join('', @{$ws->{horizontal_iso_8859_1}}), '^ \h+ $', 
+    '0-255 horizontal whitespace (\h)');
+p6rule_is  (join('', @{$ws->{vertical_iso_8859_1}}), '^ \v+ $', 
+    '0-255 vertical whitespace (\v)');
+p6rule_isnt(join('', @{$ws->{vertical_iso_8859_1}}), '^ \h+ $', 
+    '0-255 horizontal whitespace (\h)');
+p6rule_isnt(join('', @{$ws->{horizontal_iso_8859_1}}), '^ \v+ $', 
+    '0-255 vertical whitespace (\v)');
 SKIP: {
        skip 'unicode support unavailable' => 5
                unless $PConfig{has_icu};

Reply via email to