Author: leo
Date: Thu Nov 3 02:57:38 2005
New Revision: 9737
Modified:
trunk/DEPRECATED
trunk/charset/unicode.c
trunk/include/parrot/cclass.h
trunk/t/op/string_cclass.t
Log:
Implement unicode is_cclass
* almost all character classes done (except newline)
* test unicode whitespace
* deprecate Parrot_char_is_* API
Modified: trunk/DEPRECATED
==============================================================================
--- trunk/DEPRECATED (original)
+++ trunk/DEPRECATED Thu Nov 3 02:57:38 2005
@@ -6,6 +6,15 @@ All are gone. Please read the history of
=back
+=head1 Deprecated APIs
+
+=over 4
+
+All Parrot_char_is_* functions from src/string_primitives.c will
+be removed. Please use Parrot_string_is_cclass() instead.
+
+=back
+
=head1 Deprecated methods
=over 4
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Thu Nov 3 02:57:38 2005
@@ -22,6 +22,12 @@ This file implements the charset functio
# undef EXCEPTION
#endif
+#if PARROT_HAS_ICU
+#include <unicode/ucnv.h>
+#include <unicode/utypes.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#endif
#define EXCEPTION(err, str) \
real_exception(interpreter, NULL, err, str)
@@ -198,8 +204,68 @@ validate(Interp *interpreter, STRING *sr
static INTVAL
is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset)
{
- UNIMPL;
- return 0;
+ UINTVAL codepoint; /* code point read from source_string at offset */
+ int result, bit, mask; /* ICU path: result ORs the ICU answers for each class bit set in flags */
+
+ if (offset >= source_string->strlen)
+ return 0; /* offset is at or past the end of the string: report no match */
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+#if PARROT_HAS_ICU
+ for (result = 0, mask = enum_cclass_uppercase;
+ mask <= enum_cclass_word ; mask <<= 1) { /* one class bit per pass, uppercase through word */
+ bit = mask & flags;
+ switch (bit) {
+ case 0: continue; /* class not requested; go straight to the next bit */
+ case enum_cclass_uppercase:
+ result |= u_isupper(codepoint);
+ break;
+ case enum_cclass_lowercase:
+ result |= u_islower(codepoint);
+ break;
+ case enum_cclass_alphabetic:
+ result |= u_isalpha(codepoint);
+ break;
+ case enum_cclass_numeric:
+ result |= u_isdigit(codepoint);
+ /* XXX which one
+ result |= u_charDigitValue(codepoint);
+ */
+ break;
+ case enum_cclass_hexadecimal:
+ result |= u_isxdigit(codepoint);
+ break;
+ case enum_cclass_whitespace:
+ result |= u_isspace(codepoint);
+ break;
+ case enum_cclass_printing:
+ result |= u_isprint(codepoint);
+ break;
+ case enum_cclass_graphical:
+ result |= u_isgraph(codepoint);
+ break;
+ case enum_cclass_blank:
+ result |= u_isblank(codepoint);
+ break;
+ case enum_cclass_control:
+ result |= u_iscntrl(codepoint);
+ break;
+ case enum_cclass_alphanumeric:
+ result |= u_isalnum(codepoint);
+ break;
+ default:
+ UNIMPL; /* a requested class bit in the loop range that has no case above */
+ }
+ /* more bits? ~(flags ^ ~mask) equals flags ^ mask, so this stops early only when flags is exactly the current bit */
+ if (~ (flags ^ ~mask) == 0)
+ break;
+ }
+ return result;
+#else
+ if (codepoint >= 128) /* without ICU, code points >= 128 are not classified; raise an error instead */
+ real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+ "no ICU lib loaded");
+ return (Parrot_ascii_typetable[codepoint] & flags) ? 1 : 0;
+#endif
}
static INTVAL
Modified: trunk/include/parrot/cclass.h
==============================================================================
--- trunk/include/parrot/cclass.h (original)
+++ trunk/include/parrot/cclass.h Thu Nov 3 02:57:38 2005
@@ -10,8 +10,8 @@
/* &gen_from_enum(cclass.pasm) subst(s/enum_cclass_(\w+)/uc("CCLASS_$1")/e) */
typedef enum { /* ASCII characters matching this
class: */
-enum_cclass_any = 0x0000, /* all */
-enum_cclass_none = 0xffff, /* none */
+enum_cclass_any = 0xffff, /* all */
+enum_cclass_none = 0x0000, /* none */
enum_cclass_uppercase = 0x0001, /* A-Z */
enum_cclass_lowercase = 0x0002, /* a-z */
enum_cclass_alphabetic = 0x0004, /* a-z, A-Z */
Modified: trunk/t/op/string_cclass.t
==============================================================================
--- trunk/t/op/string_cclass.t (original)
+++ trunk/t/op/string_cclass.t Thu Nov 3 02:57:38 2005
@@ -18,7 +18,7 @@ Tests find_cclass find_not_cclass, is_cc
use strict;
-use Parrot::Test tests => 6;
+use Parrot::Test tests => 7;
pir_output_is(<<'CODE', <<'OUT', "find_cclass, ascii");
.include "cclass.pasm"
@@ -273,3 +273,57 @@ CODE
11100000001100
00011100100010
OUT
+
+## setup for unicode whitespace tests
+## see http://www.unicode.org/Public/UNIDATA/PropList.txt for White_Space list
+## see also t/p6rules/metachars.t
+my $ws= { # each list holds literal \uXXXX escape text (qw// does not interpolate)
+ horizontal_ascii => [qw/ \u0009 \u0020 \u00a0 /], # tab, space, no-break space (U+00A0 is outside ASCII despite the key name)
+ horizontal_unicode => [qw/
+ \u1680 \u180e \u2000 \u2001 \u2002 \u2003 \u2004 \u2005
+ \u2006 \u2007 \u2008 \u2009 \u200a \u202f \u205f \u3000
+ /],
+ vertical_ascii => [qw/ \u000a \u000b \u000c \u000d \u0085 /], # LF, VT, FF, CR, NEL (U+0085 is outside ASCII despite the key name)
+ vertical_unicode => [qw/ \u2028 \u2029 /], # line separator, paragraph separator
+};
+
+push @{ $ws->{horizontal} } => # horizontal = horizontal_ascii followed by horizontal_unicode
+ @{ $ws->{horizontal_ascii} }, @{ $ws->{horizontal_unicode} };
+
+push @{ $ws->{vertical} } => # vertical = vertical_ascii followed by vertical_unicode
+ @{ $ws->{vertical_ascii} }, @{ $ws->{vertical_unicode} };
+
+push @{ $ws->{whitespace_ascii} } => # both *_ascii lists, horizontal first
+ @{ $ws->{horizontal_ascii} }, @{ $ws->{vertical_ascii} };
+
+push @{ $ws->{whitespace_unicode} } => # both *_unicode lists, horizontal first
+ @{ $ws->{horizontal_unicode} }, @{ $ws->{vertical_unicode} };
+
+push @{ $ws->{whitespace} } => # every entry: whitespace_ascii followed by whitespace_unicode
+ @{ $ws->{whitespace_ascii} }, @{ $ws->{whitespace_unicode} };
+
+sub string { # returns the text unicode:"..." wrapping the concatenated entries of the named $ws list
+ my $which = shift;
+ 'unicode:"' . join('', @{$ws->{$which}}) . '"';
+}
+
+my $all_ws = string('whitespace'); # all 26 entries (3 + 16 + 5 + 2) as one PIR string literal
+pir_output_is(<<"CODE", <<'OUT', "unicode whitespace"); # "CODE" interpolates: $all_ws is expanded and \\n becomes \n in the PIR; expects one 1 per code point
+.sub main :main
+.include "cclass.pasm"
+ .local int result, char, len, i
+ .local string s
+ s = $all_ws
+ len = length s
+ i = 0
+loop:
+ result = is_cclass .CCLASS_WHITESPACE, s, i
+ print result
+ inc i
+ if i < len goto loop
+ print "\\n"
+.end
+CODE
+11111111111111111111111111
+OUT
+