[svn:parrot] r9737 - in trunk: . charset include/parrot t/op

leo Thu, 03 Nov 2005 02:57:58 -0800

Author: leo
Date: Thu Nov  3 02:57:38 2005
New Revision: 9737

Modified:
   trunk/DEPRECATED
   trunk/charset/unicode.c
   trunk/include/parrot/cclass.h
   trunk/t/op/string_cclass.t
Log:
Implement unicode is_cclass


* almost all character classes done (except newline)
* test unicode whitespace
* deprecate Parrot_char_is_* API


Modified: trunk/DEPRECATED
==============================================================================
--- trunk/DEPRECATED    (original)
+++ trunk/DEPRECATED    Thu Nov  3 02:57:38 2005
@@ -6,6 +6,15 @@ All are gone. Please read the history of
 
 =back
 
+=head1 Deprecated APIs
+
+=over 4
+
+All Parrot_char_is_* functions from src/string_primitives.c will
+be removed. Please use Parrot_string_is_cclass() instead.
+
+=back
+
 =head1 Deprecated methods
 
 =over 4

Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c     (original)
+++ trunk/charset/unicode.c     Thu Nov  3 02:57:38 2005
@@ -22,6 +22,12 @@ This file implements the charset functio
 #  undef EXCEPTION
 #endif
 
+#if PARROT_HAS_ICU
+#include <unicode/ucnv.h>
+#include <unicode/utypes.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#endif
 #define EXCEPTION(err, str) \
     real_exception(interpreter, NULL, err, str)
 
@@ -198,8 +204,68 @@ validate(Interp *interpreter, STRING *sr
 static INTVAL
 is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING *source_string, UINTVAL offset)
 {
-    UNIMPL;
-    return 0;
+    UINTVAL codepoint;
+    int result, bit, mask;
+
+    if (offset >= source_string->strlen)
+        return 0;
+    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+#if PARROT_HAS_ICU
+    for (result = 0, mask = enum_cclass_uppercase;
+            mask <= enum_cclass_word ; mask <<= 1) {
+        bit = mask & flags;
+        switch (bit) {
+            case 0: continue;
+            case enum_cclass_uppercase:
+                    result |= u_isupper(codepoint);
+                    break;
+            case enum_cclass_lowercase:
+                    result |= u_islower(codepoint);
+                    break;
+            case enum_cclass_alphabetic:
+                    result |= u_isalpha(codepoint);
+                    break;
+            case enum_cclass_numeric:
+                    result |= u_isdigit(codepoint);
+                    /* XXX which one
+                       result |= u_charDigitValue(codepoint);
+                       */
+                    break;
+            case enum_cclass_hexadecimal:
+                    result |= u_isxdigit(codepoint);
+                    break;
+            case enum_cclass_whitespace:
+                    result |= u_isspace(codepoint);
+                    break;
+            case enum_cclass_printing:
+                    result |= u_isprint(codepoint);
+                    break;
+            case enum_cclass_graphical:
+                    result |= u_isgraph(codepoint);
+                    break;
+            case enum_cclass_blank:
+                    result |= u_isblank(codepoint);
+                    break;
+            case enum_cclass_control:
+                    result |= u_iscntrl(codepoint);
+                    break;
+            case enum_cclass_alphanumeric:
+                    result |= u_isalnum(codepoint);
+                    break;
+            default:
+                    UNIMPL;
+        }
+        /* more bits? */
+        if (~ (flags ^ ~mask) == 0)
+            break;
+    }
+    return result;
+#else
+    if (codepoint >= 128)
+        real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+                "no ICU lib loaded");
+    return (Parrot_ascii_typetable[codepoint] & flags) ? 1 : 0;
+#endif
 }
 
 static INTVAL

Modified: trunk/include/parrot/cclass.h
==============================================================================
--- trunk/include/parrot/cclass.h       (original)
+++ trunk/include/parrot/cclass.h       Thu Nov  3 02:57:38 2005
@@ -10,8 +10,8 @@
 
 /* &gen_from_enum(cclass.pasm) subst(s/enum_cclass_(\w+)/uc("CCLASS_$1")/e) */
 typedef enum {                         /* ASCII characters matching this class: */
-enum_cclass_any = 0x0000,              /* all */
-enum_cclass_none = 0xffff,             /* none */
+enum_cclass_any = 0xffff,              /* all */
+enum_cclass_none = 0x0000,             /* none */
 enum_cclass_uppercase = 0x0001,                /* A-Z */
 enum_cclass_lowercase = 0x0002,                /* a-z */
 enum_cclass_alphabetic = 0x0004,       /* a-z, A-Z */

Modified: trunk/t/op/string_cclass.t
==============================================================================
--- trunk/t/op/string_cclass.t  (original)
+++ trunk/t/op/string_cclass.t  Thu Nov  3 02:57:38 2005
@@ -18,7 +18,7 @@ Tests find_cclass find_not_cclass, is_cc
 
 use strict;
 
-use Parrot::Test tests => 6;
+use Parrot::Test tests => 7;
 
 pir_output_is(<<'CODE', <<'OUT', "find_cclass, ascii");
 .include "cclass.pasm"
@@ -273,3 +273,57 @@ CODE
 11100000001100
 00011100100010
 OUT
+
+## setup for unicode whitespace tests
+## see http://www.unicode.org/Public/UNIDATA/PropList.txt for White_Space list
+## see also t/p6rules/metachars.t
+my $ws= {
+       horizontal_ascii => [qw/ \u0009 \u0020 \u00a0 /],
+       horizontal_unicode => [qw/
+               \u1680 \u180e \u2000 \u2001 \u2002 \u2003 \u2004 \u2005
+               \u2006 \u2007 \u2008 \u2009 \u200a \u202f \u205f \u3000
+       /],
+       vertical_ascii => [qw/ \u000a \u000b \u000c \u000d \u0085 /],
+       vertical_unicode => [qw/ \u2028 \u2029 /],
+};
+
+push @{ $ws->{horizontal} } =>
+       @{ $ws->{horizontal_ascii} }, @{ $ws->{horizontal_unicode} };
+
+push @{ $ws->{vertical} } =>
+       @{ $ws->{vertical_ascii} }, @{ $ws->{vertical_unicode} };
+
+push @{ $ws->{whitespace_ascii} } =>
+       @{ $ws->{horizontal_ascii} }, @{ $ws->{vertical_ascii} };
+
+push @{ $ws->{whitespace_unicode} } =>
+       @{ $ws->{horizontal_unicode} }, @{ $ws->{vertical_unicode} };
+
+push @{ $ws->{whitespace} } =>
+       @{ $ws->{whitespace_ascii} }, @{ $ws->{whitespace_unicode} };
+
+sub string {
+    my $which = shift;
+    'unicode:"' . join('',  @{$ws->{$which}}) . '"';
+}
+
+my $all_ws = string('whitespace');
+pir_output_is(<<"CODE", <<'OUT', "unicode whitespace");
+.sub main :main
+.include "cclass.pasm"
+   .local int result, char, len, i
+   .local string s
+   s = $all_ws
+   len = length s
+   i = 0
+loop:
+   result = is_cclass .CCLASS_WHITESPACE, s, i
+   print result
+   inc i
+   if i < len goto loop
+   print "\\n"
+.end
+CODE
+11111111111111111111111111
+OUT
+

[svn:parrot] r9737 - in trunk: . charset include/parrot t/op

Reply via email to