cvsuser 03/11/04 08:22:33
Modified: chartypes unicode.c usascii.c
config/gen parrot_include.pl
include/parrot chartype.h exceptions.h
src chartype.c string.c
Log:
Improve support for dynamically loaded DBCS chartypes
Add infrastructure for character classification functions
Revision Changes Path
1.19 +17 -2 parrot/chartypes/unicode.c
Index: unicode.c
===================================================================
RCS file: /cvs/public/parrot/chartypes/unicode.c,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -w -r1.18 -r1.19
--- unicode.c 7 Sep 2003 09:01:18 -0000 1.18
+++ unicode.c 4 Nov 2003 16:22:27 -0000 1.19
@@ -1,7 +1,7 @@
/* unicode.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: unicode.c,v 1.18 2003/09/07 09:01:18 petergibbs Exp $
+ * $Id: unicode.c,v 1.19 2003/11/04 16:22:27 petergibbs Exp $
* Overview:
* This defines the US-ASCII character type routines.
* Data Structure and Algorithms:
@@ -46,12 +46,27 @@
{ NULL, NULL, NULL }
};
+static INTVAL
+unicode_is_charclass(const struct parrot_chartype_t *type, const Parrot_UInt c,
+ const unsigned int class)
+{
+ switch (class) {
+ case enum_charclass_digit:
+ return chartype_is_digit_mapn(type, c, class);
+ }
+ internal_exception(INVALID_CHARCLASS, "Unknown character class <%d>\n",
+ class);
+ return 0;
+}
CHARTYPE unicode_chartype = {
enum_chartype_unicode,
"unicode",
"utf32",
+ {
chartype_is_digit_mapn, /* is_digit() */
+ unicode_is_charclass /* is_charclass() - slow version */
+ },
chartype_get_digit_mapn, /* get_digit() */
unicode_digit_map, /* digit_map */
NULL, /* unicode_map */
1.16 +49 -2 parrot/chartypes/usascii.c
Index: usascii.c
===================================================================
RCS file: /cvs/public/parrot/chartypes/usascii.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -w -r1.15 -r1.16
--- usascii.c 17 Sep 2003 20:41:41 -0000 1.15
+++ usascii.c 4 Nov 2003 16:22:27 -0000 1.16
@@ -1,7 +1,7 @@
/* usascii.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: usascii.c,v 1.15 2003/09/17 20:41:41 petergibbs Exp $
+ * $Id: usascii.c,v 1.16 2003/11/04 16:22:27 petergibbs Exp $
* Overview:
* This defines the US-ASCII character type routines.
* Data Structure and Algorithms:
@@ -25,11 +25,58 @@
return c;
}
+static INTVAL
+usascii_is_charclass(const struct parrot_chartype_t *type, const Parrot_UInt c,
+ const unsigned int class)
+{
+ switch (class) {
+ case enum_charclass_alnum:
+ return (c >= 0x30 && c <= 0x39) ||
+ (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
+ case enum_charclass_alpha:
+ return (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
+ case enum_charclass_ascii:
+ return c < 128;
+ case enum_charclass_blank:
+ return c == 0x20 || c == 0x09;
+ case enum_charclass_cntrl:
+ return c < 0x20 || c == 0x7F;
+ case enum_charclass_digit:
+ return c >= 0x30 && c <= 0x39;
+ case enum_charclass_graph:
+ return c > 0x20 && c < 0x7F;
+ case enum_charclass_lower:
+ return c >= 0x61 && c <= 0x7A;
+ case enum_charclass_print:
+ return c >= 0x20 && c < 0x7F;
+ case enum_charclass_punct:
+ return (c >= 0x21 && c <= 0x2F) ||
+ (c >= 0x3A && c <= 0x40) ||
+ (c >= 0x5B && c <= 0x60) ||
+ (c >= 0x7B && c <= 0x7E);
+ case enum_charclass_space:
+ return c == 0x20 || c == 0x0C || c == 0x0A || c == 0x0D ||
+ c == 0x09 || c == 0x0B;
+ case enum_charclass_upper:
+ return c >= 0x41 && c <= 0x5A;
+ case enum_charclass_xdigit:
+ return (c >= 0x30 && c <= 0x39) ||
+ (c >= 0x41 && c <= 0x46) ||
+ (c >= 0x61 && c <= 0x66);
+ }
+ internal_exception(INVALID_CHARCLASS, "Unknown character class <%d>\n",
+ class);
+ return 0;
+}
+
CHARTYPE usascii_chartype = {
enum_chartype_usascii,
"usascii",
"singlebyte",
+ {
chartype_is_digit_map1, /* is_digit() */
+ usascii_is_charclass /* is_charclass() - slow version */
+ },
chartype_get_digit_map1, /* get_digit() */
&usascii_digit_map, /* digit_map */
NULL, /* unicode_map */
1.7 +28 -3 parrot/config/gen/parrot_include.pl
Index: parrot_include.pl
===================================================================
RCS file: /cvs/public/parrot/config/gen/parrot_include.pl,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -w -r1.6 -r1.7
--- parrot_include.pl 23 Oct 2003 19:21:24 -0000 1.6
+++ parrot_include.pl 4 Nov 2003 16:22:29 -0000 1.7
@@ -33,10 +33,11 @@
for my $f (@files) {
my $in_def = ''; # in #define='def', in enum='enum'
my ($inc, $prefix, $last_val, $subst, %values);
+ my (%var, $match, $block);
open F, "<$f" or die "Can't open $f\n";
while (<F>) {
if (m!
- &gen_from_(enum|def)\((.*?)\)
+ &gen_from_(enum|def|template)\((.*?)\)
(\s+prefix\((\w+)\))?
(\s+subst\((s/.*?/.*?/\w*)\))?
!x
@@ -49,6 +50,7 @@
$last_val = -1;
%values = ();
open INC, ">$inc.tmp" or die "Can't write $inc.tmp";
+ print INC "/*\n" if $inc =~ /\.h/;
print INC <<"EOF";
# DO NOT EDIT THIS FILE.
#
@@ -58,12 +60,16 @@
# Any changes made here will be lost.
#
EOF
+ print INC "*/\n" if $inc =~ /\.h/;
next;
}
if (/&end_gen/) {
close INC;
- move_if_diff("$inc.tmp", "$destdir/$inc");
- push(@generated, "$destdir/$inc");
+ my $destfile = ($inc =~ m[/]) ? "$inc" : "$destdir/$inc";
+ #move_if_diff("$inc.tmp", "$destdir/$inc");
+ #push(@generated, "$destdir/$inc");
+ move_if_diff("$inc.tmp", "$destfile");
+ push(@generated, "$destfile");
$in_def = '';
next;
}
@@ -96,6 +102,25 @@
eval $subst if ($subst ne '');
print INC ".constant $_\n";
}
+ }
+ elsif ($in_def eq 'template') {
+ if (/match{(.*)}/) {
+ $match = $1;
+ next;
+ }
+ if (/eval{{/) {
+ while (<F>) {
+ last if /}}/;
+ $block .= $_;
+ }
+ next;
+ }
+ if (/$match/) {
+ select INC;
+ eval $block;
+ select STDOUT;
+ die $@ if $@;
+ }
}
}
1.22 +55 -5 parrot/include/parrot/chartype.h
Index: chartype.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/chartype.h,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -w -r1.21 -r1.22
--- chartype.h 4 Nov 2003 14:51:40 -0000 1.21
+++ chartype.h 4 Nov 2003 16:22:31 -0000 1.22
@@ -1,7 +1,7 @@
/* chartype.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: chartype.h,v 1.21 2003/11/04 14:51:40 boemmels Exp $
+ * $Id: chartype.h,v 1.22 2003/11/04 16:22:31 petergibbs Exp $
* Overview:
* This is the api header for the string character type subsystem
* Data Structure and Algorithms:
@@ -31,6 +31,55 @@
/* &end_gen */
/*
+ * Character classifications
+ */
+/*
+ &gen_from_template(include/parrot/charclass.h)
+ match{enum_charclass_(\w+)}
+ eval{{
+ if ($1 eq 'SLOW') {
+ $var{slow} = 1;
+ }
+ elsif ($1 eq 'MAX') {
+ }
+ elsif ($var{slow}) {
+ print "#define Parrot_char_is_$1(ct,c) \\\n";
+ print " ct->is_charclass[enum_charclass_SLOW](ct,c,enum_charclass_$1)\n";
+ }
+ else {
+ print "#define Parrot_char_is_$1(ct,c) \\\n";
+ print " ct->is_charclass[enum_charclass_$1](ct,c,enum_charclass_$1)\n";
+ }
+ }}
+ */
+enum {
+ enum_charclass_digit,
+ enum_charclass_SLOW,
+ enum_charclass_alnum,
+ enum_charclass_alpha,
+ enum_charclass_ascii,
+ enum_charclass_blank,
+ enum_charclass_cntrl,
+ enum_charclass_graph,
+ enum_charclass_lower,
+ enum_charclass_print,
+ enum_charclass_punct,
+ enum_charclass_space,
+ enum_charclass_upper,
+ enum_charclass_xdigit,
+ enum_charclass_MAX
+};
+/* &end_gen */
+
+/* Generated file containing access macros */
+#include "parrot/charclass.h"
+
+typedef Parrot_Int (*Parrot_is_charclass)
+ (const struct parrot_chartype_t *type,
+ const Parrot_UInt c,
+ const unsigned int charclass);
+
+/*
* Character code to digit value translation map
*/
struct chartype_digit_map_t {
@@ -59,8 +108,7 @@
Parrot_Int index;
const char *name;
const char *default_encoding;
- Parrot_Int (*is_digit)
- (const struct parrot_chartype_t *type, Parrot_UInt c);
+ Parrot_is_charclass is_charclass[enum_charclass_SLOW+1];
Parrot_Int (*get_digit)
(const struct parrot_chartype_t *type, Parrot_UInt c);
const struct chartype_digit_map_t *digit_map;
@@ -83,9 +131,11 @@
const CHARTYPE * chartype_lookup_index(INTVAL n);
INTVAL chartype_find_chartype(const char *name);
-Parrot_Int chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c);
+Parrot_Int chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c,
+ const unsigned int class);
+Parrot_Int chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c,
+ const unsigned int class);
Parrot_Int chartype_get_digit_map1(const CHARTYPE* type, const UINTVAL c);
-Parrot_Int chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c);
Parrot_Int chartype_get_digit_mapn(const CHARTYPE* type, const UINTVAL c);
Parrot_UInt chartype_transcode_nop(const struct parrot_chartype_t *from,
const struct parrot_chartype_t *to,
1.42 +2 -1 parrot/include/parrot/exceptions.h
Index: exceptions.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/exceptions.h,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -w -r1.41 -r1.42
--- exceptions.h 28 Oct 2003 03:06:39 -0000 1.41
+++ exceptions.h 4 Nov 2003 16:22:31 -0000 1.42
@@ -1,7 +1,7 @@
/* exceptions.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: exceptions.h,v 1.41 2003/10/28 03:06:39 mrjoltcola Exp $
+ * $Id: exceptions.h,v 1.42 2003/11/04 16:22:31 petergibbs Exp $
* Overview:
* define the internal interpreter exceptions
* Data Structure and Algorithms:
@@ -37,6 +37,7 @@
#define INVALID_CHARACTER 1
#define INVALID_CHARTYPE 1
#define INVALID_ENCODING 1
+#define INVALID_CHARCLASS 1
#define NEG_REPEAT 1
#define NEG_SUBSTR 1
#define NEG_SLEEP 1
1.19 +29 -6 parrot/src/chartype.c
Index: chartype.c
===================================================================
RCS file: /cvs/public/parrot/src/chartype.c,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -w -r1.18 -r1.19
--- chartype.c 3 Nov 2003 15:05:09 -0000 1.18
+++ chartype.c 4 Nov 2003 16:22:33 -0000 1.19
@@ -1,7 +1,7 @@
/* chartype.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: chartype.c,v 1.18 2003/11/03 15:05:09 petergibbs Exp $
+ * $Id: chartype.c,v 1.19 2003/11/04 16:22:33 petergibbs Exp $
* Overview:
* This defines the string character type subsystem
* Data Structure and Algorithms:
@@ -121,6 +121,25 @@
}
/*
+ * Digit mapping via translation to Unicode
+ * XXX Replace by generation of custom digit mapping table
+ */
+static Parrot_Int
+chartype_is_digit_Unicode(const CHARTYPE* type, const UINTVAL c,
+ unsigned int class)
+{
+ UINTVAL uc = chartype_to_unicode_cparray(type, NULL, c);
+ return chartype_is_digit_mapn(&unicode_chartype, uc, enum_charclass_digit);
+}
+
+static Parrot_Int
+chartype_get_digit_Unicode(const CHARTYPE* type, const UINTVAL c)
+{
+ UINTVAL uc = chartype_to_unicode_cparray(type, NULL, c);
+ return chartype_get_digit_mapn(&unicode_chartype, uc);
+}
+
+/*
* Create chartype from mapping file
* Still TODO:
* Handle more encodings (singlebyte & dbcs implemented so far)
@@ -195,9 +214,11 @@
}
type->from_unicode = chartype_from_unicode_cparray;
type->to_unicode = chartype_to_unicode_cparray;
- type->is_digit = chartype_is_digit_map1;
- type->get_digit = chartype_get_digit_map1;
- type->digit_map = &default_digit_map;
+ /* XXX Should generate a custom digit mapping table */
+ if (enum_charclass_digit < enum_charclass_SLOW)
+ type->is_charclass[enum_charclass_digit] = chartype_is_digit_Unicode;
+ type->get_digit = chartype_get_digit_Unicode;
+ type->digit_map = NULL;
map = mem_sys_allocate(sizeof(struct chartype_unicode_map_t));
map->n1 = one2one;
map->cparray = cparray;
@@ -258,7 +279,8 @@
* Generic versions of the digit handling functions
*/
Parrot_Int
-chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c)
+chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c,
+ unsigned int class)
{
return c >= type->digit_map->first_code && c <= type->digit_map->last_code;
}
@@ -275,7 +297,8 @@
}
Parrot_Int
-chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c)
+chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c,
+ unsigned int class)
{
const struct chartype_digit_map_t *map = type->digit_map;
while (map->first_value >= 0) {
1.154 +4 -4 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.153
retrieving revision 1.154
diff -u -w -r1.153 -r1.154
--- string.c 3 Nov 2003 12:54:48 -0000 1.153
+++ string.c 4 Nov 2003 16:22:33 -0000 1.154
@@ -1,7 +1,7 @@
/* string.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string.c,v 1.153 2003/11/03 12:54:48 petergibbs Exp $
+ * $Id: string.c,v 1.154 2003/11/04 16:22:33 petergibbs Exp $
* Overview:
* This is the api definitions for the string subsystem
* Data Structure and Algorithms:
@@ -1232,7 +1232,7 @@
UINTVAL c = s->encoding->decode(s->strstart);
- if (s->type->is_digit(s->type,c)
+ if (Parrot_char_is_digit(s->type,c)
&& s->type->get_digit(s->type,c) == 0) {
return 0;
}
@@ -1305,7 +1305,7 @@
while (start < end) {
UINTVAL c = s->encoding->decode(start);
- if (s->type->is_digit(s->type,c)) {
+ if (Parrot_char_is_digit(s->type,c)) {
in_number = 1;
i = i * 10 + (c - '0');
}
@@ -1354,7 +1354,7 @@
while (start < end) {
UINTVAL c = s->encoding->decode(start);
- INTVAL df = s->type->is_digit(s->type,c);
+ INTVAL df = Parrot_char_is_digit(s->type,c);
if (df && !digit_family)
digit_family = df;