cvsuser     03/11/04 08:22:33

  Modified:    chartypes unicode.c usascii.c
               config/gen parrot_include.pl
               include/parrot chartype.h exceptions.h
               src      chartype.c string.c
  Log:
  Improve support for dynamically loaded DBCS chartypes
  Add infrastructure for character classification functions
  
  Revision  Changes    Path
  1.19      +17 -2     parrot/chartypes/unicode.c
  
  Index: unicode.c
  ===================================================================
  RCS file: /cvs/public/parrot/chartypes/unicode.c,v
  retrieving revision 1.18
  retrieving revision 1.19
  diff -u -w -r1.18 -r1.19
  --- unicode.c 7 Sep 2003 09:01:18 -0000       1.18
  +++ unicode.c 4 Nov 2003 16:22:27 -0000       1.19
  @@ -1,7 +1,7 @@
   /* unicode.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: unicode.c,v 1.18 2003/09/07 09:01:18 petergibbs Exp $
  + *     $Id: unicode.c,v 1.19 2003/11/04 16:22:27 petergibbs Exp $
    *  Overview:
    *     This defines the US-ASCII character type routines.
    *  Data Structure and Algorithms:
  @@ -46,12 +46,27 @@
       { NULL, NULL, NULL }
   };
   
  +static INTVAL
  +unicode_is_charclass(const struct parrot_chartype_t *type, const Parrot_UInt c, 
  +                     const unsigned int class)
  +{
  +    switch (class) {
  +        case enum_charclass_digit:
  +            return chartype_is_digit_mapn(type, c, class);
  +    }
  +    internal_exception(INVALID_CHARCLASS, "Unknown character class <%d>\n", 
  +                       class);
  +    return 0;
  +}
   
   CHARTYPE unicode_chartype = {
       enum_chartype_unicode,
       "unicode",
       "utf32",
  +    { 
       chartype_is_digit_mapn,          /* is_digit() */
  +        unicode_is_charclass         /* is_charclass() - slow version */
  +    },
       chartype_get_digit_mapn,         /* get_digit() */
       unicode_digit_map,               /* digit_map */
       NULL,                            /* unicode_map */
  
  
  
  1.16      +49 -2     parrot/chartypes/usascii.c
  
  Index: usascii.c
  ===================================================================
  RCS file: /cvs/public/parrot/chartypes/usascii.c,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -w -r1.15 -r1.16
  --- usascii.c 17 Sep 2003 20:41:41 -0000      1.15
  +++ usascii.c 4 Nov 2003 16:22:27 -0000       1.16
  @@ -1,7 +1,7 @@
   /* usascii.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: usascii.c,v 1.15 2003/09/17 20:41:41 petergibbs Exp $
  + *     $Id: usascii.c,v 1.16 2003/11/04 16:22:27 petergibbs Exp $
    *  Overview:
    *     This defines the US-ASCII character type routines.
    *  Data Structure and Algorithms:
  @@ -25,11 +25,58 @@
       return c;
   }
   
  +static INTVAL
  +usascii_is_charclass(const struct parrot_chartype_t *type, const Parrot_UInt c, 
  +                     const unsigned int class)
  +{
  +    switch (class) {
  +        case enum_charclass_alnum:
  +            return (c >= 0x30 && c <= 0x39) ||
  +                   (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
  +        case enum_charclass_alpha:
  +            return (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
  +        case enum_charclass_ascii:
  +            return c < 128;
  +        case enum_charclass_blank:
  +            return c == 0x20 || c == 0x09;
  +        case enum_charclass_cntrl:
  +            return c < 0x20 || c == 0x7F;
  +        case enum_charclass_digit:
  +            return c >= 0x30 && c <= 0x39;
  +        case enum_charclass_graph:
  +            return c > 0x20 && c < 0x7F;
  +        case enum_charclass_lower:
  +            return c >= 0x61 && c <= 0x7A;
  +        case enum_charclass_print:
  +            return c >= 0x20 && c < 0x7F;
  +        case enum_charclass_punct:
  +            return (c >= 0x21 && c <= 0x2F) ||
  +                   (c >= 0x3A && c <= 0x40) ||
  +                   (c >= 0x5B && c <= 0x60) ||
  +                   (c >= 0x7B && c <= 0x7E);
  +        case enum_charclass_space:
  +            return c == 0x20 || c == 0x0C || c == 0x0A || c == 0x0D ||
  +                   c == 0x09 || c == 0x0B;
  +        case enum_charclass_upper:
  +            return c >= 0x41 && c <= 0x5A;
  +        case enum_charclass_xdigit:
  +            return (c >= 0x30 && c <= 0x39) ||
  +                   (c >= 0x41 && c <= 0x46) ||
  +                   (c >= 0x61 && c <= 0x66);
  +    }
  +    internal_exception(INVALID_CHARCLASS, "Unknown character class <%d>\n", 
  +                       class);
  +    return 0;
  +}
  +
   CHARTYPE usascii_chartype = {
       enum_chartype_usascii,
       "usascii",
       "singlebyte",
  +    { 
       chartype_is_digit_map1,          /* is_digit() */
  +        usascii_is_charclass         /* is_charclass() - slow version */
  +    },
       chartype_get_digit_map1,         /* get_digit() */
       &usascii_digit_map,              /* digit_map */
       NULL,                            /* unicode_map */
  
  
  
  1.7       +28 -3     parrot/config/gen/parrot_include.pl
  
  Index: parrot_include.pl
  ===================================================================
  RCS file: /cvs/public/parrot/config/gen/parrot_include.pl,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -w -r1.6 -r1.7
  --- parrot_include.pl 23 Oct 2003 19:21:24 -0000      1.6
  +++ parrot_include.pl 4 Nov 2003 16:22:29 -0000       1.7
  @@ -33,10 +33,11 @@
       for my $f (@files) {
        my $in_def = ''; # in #define='def', in enum='enum'
        my ($inc, $prefix, $last_val, $subst, %values);
  +        my (%var, $match, $block);
        open F, "<$f" or die "Can't open $f\n";
        while (<F>) {
            if (m!
  -             &gen_from_(enum|def)\((.*?)\)
  +             &gen_from_(enum|def|template)\((.*?)\)
                (\s+prefix\((\w+)\))?
                (\s+subst\((s/.*?/.*?/\w*)\))?
                !x
  @@ -49,6 +50,7 @@
                $last_val = -1;
                %values = ();
                open INC, ">$inc.tmp" or die "Can't write $inc.tmp";
  +    print INC "/*\n" if $inc =~ /\.h/;
                print INC <<"EOF";
   # DO NOT EDIT THIS FILE.
   #
  @@ -58,12 +60,16 @@
   # Any changes made here will be lost.
   #
   EOF
  +    print INC "*/\n" if $inc =~ /\.h/;
                next;
            }
            if (/&end_gen/) {
                close INC;
  -             move_if_diff("$inc.tmp", "$destdir/$inc");
  -             push(@generated, "$destdir/$inc");
  +                my $destfile = ($inc =~ m[/]) ? "$inc" : "$destdir/$inc";
  +             #move_if_diff("$inc.tmp", "$destdir/$inc");
  +             #push(@generated, "$destdir/$inc");
  +             move_if_diff("$inc.tmp", "$destfile");
  +             push(@generated, "$destfile");
                $in_def = '';
                next;
            }
  @@ -96,6 +102,25 @@
                    eval $subst if ($subst ne '');
                    print INC ".constant $_\n";
                }
  +            }
  +         elsif ($in_def eq 'template') {
  +                if (/match{(.*)}/) {
  +                    $match = $1;
  +                    next;
  +                }
  +                if (/eval{{/) {
  +                    while (<F>) {
  +                        last if /}}/;
  +                        $block .= $_;
  +                    }
  +                    next;
  +                }
  +                if (/$match/) {
  +                    select INC;
  +                    eval $block;
  +                    select STDOUT;
  +                    die $@ if $@;
  +                }
            }
   
        }
  
  
  
  1.22      +55 -5     parrot/include/parrot/chartype.h
  
  Index: chartype.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/chartype.h,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -w -r1.21 -r1.22
  --- chartype.h        4 Nov 2003 14:51:40 -0000       1.21
  +++ chartype.h        4 Nov 2003 16:22:31 -0000       1.22
  @@ -1,7 +1,7 @@
   /* chartype.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: chartype.h,v 1.21 2003/11/04 14:51:40 boemmels Exp $
  + *     $Id: chartype.h,v 1.22 2003/11/04 16:22:31 petergibbs Exp $
    *  Overview:
    *     This is the api header for the string character type subsystem
    *  Data Structure and Algorithms:
  @@ -31,6 +31,55 @@
   /* &end_gen */
   
   /*
  + * Character classifications 
  + */
  +/* 
  + &gen_from_template(include/parrot/charclass.h)
  +  match{enum_charclass_(\w+)}
  +  eval{{
  +    if ($1 eq 'SLOW') {
  +        $var{slow} = 1;
  +    }
  +    elsif ($1 eq 'MAX') {
  +    }
  +    elsif ($var{slow}) {
  +      print "#define Parrot_char_is_$1(ct,c) \\\n";
  +      print " ct->is_charclass[enum_charclass_SLOW](ct,c,enum_charclass_$1)\n";
  +    }
  +    else {
  +      print "#define Parrot_char_is_$1(ct,c) \\\n";
  +      print " ct->is_charclass[enum_charclass_$1](ct,c,enum_charclass_$1)\n";
  +    }
  +  }}
  + */
  +enum {
  +    enum_charclass_digit,
  +    enum_charclass_SLOW,
  +    enum_charclass_alnum,
  +    enum_charclass_alpha,
  +    enum_charclass_ascii,
  +    enum_charclass_blank,
  +    enum_charclass_cntrl,
  +    enum_charclass_graph,
  +    enum_charclass_lower,
  +    enum_charclass_print,
  +    enum_charclass_punct,
  +    enum_charclass_space,
  +    enum_charclass_upper,
  +    enum_charclass_xdigit,
  +    enum_charclass_MAX
  +};
  +/* &end_gen */
  +
  +/* Generated file containing access macros */
  +#include "parrot/charclass.h"
  +
  +typedef Parrot_Int (*Parrot_is_charclass)
  +           (const struct parrot_chartype_t *type,
  +            const Parrot_UInt c,
  +            const unsigned int charclass);
  +
  +/*
    * Character code to digit value translation map
    */
   struct chartype_digit_map_t {
  @@ -59,8 +108,7 @@
       Parrot_Int index;
       const char *name;
       const char *default_encoding;
  -    Parrot_Int (*is_digit)
  -        (const struct parrot_chartype_t *type, Parrot_UInt c);
  +    Parrot_is_charclass is_charclass[enum_charclass_SLOW+1];
       Parrot_Int (*get_digit)
           (const struct parrot_chartype_t *type, Parrot_UInt c);
       const struct chartype_digit_map_t *digit_map;
  @@ -83,9 +131,11 @@
   const CHARTYPE * chartype_lookup_index(INTVAL n);
   INTVAL chartype_find_chartype(const char *name);
   
  -Parrot_Int chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c);
  +Parrot_Int chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c,
  +                                  const unsigned int class);
  +Parrot_Int chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c,
  +                                  const unsigned int class);
   Parrot_Int chartype_get_digit_map1(const CHARTYPE* type, const UINTVAL c);
  -Parrot_Int chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c);
   Parrot_Int chartype_get_digit_mapn(const CHARTYPE* type, const UINTVAL c);
   Parrot_UInt chartype_transcode_nop(const struct parrot_chartype_t *from,
                                      const struct parrot_chartype_t *to,
  
  
  
  1.42      +2 -1      parrot/include/parrot/exceptions.h
  
  Index: exceptions.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/exceptions.h,v
  retrieving revision 1.41
  retrieving revision 1.42
  diff -u -w -r1.41 -r1.42
  --- exceptions.h      28 Oct 2003 03:06:39 -0000      1.41
  +++ exceptions.h      4 Nov 2003 16:22:31 -0000       1.42
  @@ -1,7 +1,7 @@
   /* exceptions.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: exceptions.h,v 1.41 2003/10/28 03:06:39 mrjoltcola Exp $
  + *     $Id: exceptions.h,v 1.42 2003/11/04 16:22:31 petergibbs Exp $
    *  Overview:
    *     define the internal interpreter exceptions
    *  Data Structure and Algorithms:
  @@ -37,6 +37,7 @@
   #define INVALID_CHARACTER 1
   #define INVALID_CHARTYPE 1
   #define INVALID_ENCODING 1
  +#define INVALID_CHARCLASS 1
   #define NEG_REPEAT 1
   #define NEG_SUBSTR 1
   #define NEG_SLEEP 1
  
  
  
  1.19      +29 -6     parrot/src/chartype.c
  
  Index: chartype.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/chartype.c,v
  retrieving revision 1.18
  retrieving revision 1.19
  diff -u -w -r1.18 -r1.19
  --- chartype.c        3 Nov 2003 15:05:09 -0000       1.18
  +++ chartype.c        4 Nov 2003 16:22:33 -0000       1.19
  @@ -1,7 +1,7 @@
   /* chartype.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: chartype.c,v 1.18 2003/11/03 15:05:09 petergibbs Exp $
  + *     $Id: chartype.c,v 1.19 2003/11/04 16:22:33 petergibbs Exp $
    *  Overview:
    *     This defines the string character type subsystem
    *  Data Structure and Algorithms:
  @@ -121,6 +121,25 @@
   }
   
   /*
  + * Digit mapping via translation to Unicode
  + * XXX Replace by generation of custom digit mapping table
  + */
  +static Parrot_Int
  +chartype_is_digit_Unicode(const CHARTYPE* type, const UINTVAL c, 
  +                          unsigned int class)
  +{
  +    UINTVAL uc = chartype_to_unicode_cparray(type, NULL, c);
  +    return chartype_is_digit_mapn(&unicode_chartype, uc, enum_charclass_digit);
  +}
  +
  +static Parrot_Int
  +chartype_get_digit_Unicode(const CHARTYPE* type, const UINTVAL c)
  +{
  +    UINTVAL uc = chartype_to_unicode_cparray(type, NULL, c);
  +    return chartype_get_digit_mapn(&unicode_chartype, uc);
  +}
  +
  +/*
    * Create chartype from mapping file
    * Still TODO:
    *   Handle more encodings (singlebyte & dbcs implemented so far)
  @@ -195,9 +214,11 @@
       }
       type->from_unicode = chartype_from_unicode_cparray;
       type->to_unicode = chartype_to_unicode_cparray;
  -    type->is_digit = chartype_is_digit_map1;
  -    type->get_digit = chartype_get_digit_map1;
  -    type->digit_map = &default_digit_map;
  +    /* XXX Should generate a custom digit mapping table */
  +    if (enum_charclass_digit < enum_charclass_SLOW)
  +        type->is_charclass[enum_charclass_digit] = chartype_is_digit_Unicode;
  +    type->get_digit = chartype_get_digit_Unicode;
  +    type->digit_map = NULL;
       map = mem_sys_allocate(sizeof(struct chartype_unicode_map_t));
       map->n1 = one2one;
       map->cparray = cparray;
  @@ -258,7 +279,8 @@
    * Generic versions of the digit handling functions
    */
   Parrot_Int
  -chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c)
  +chartype_is_digit_map1(const CHARTYPE* type, const UINTVAL c, 
  +                       unsigned int class)
   {
       return c >= type->digit_map->first_code && c <= type->digit_map->last_code;
   }
  @@ -275,7 +297,8 @@
   }
   
   Parrot_Int
  -chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c)
  +chartype_is_digit_mapn(const CHARTYPE* type, const UINTVAL c,
  +                       unsigned int class)
   {
       const struct chartype_digit_map_t *map = type->digit_map;
       while (map->first_value >= 0) {
  
  
  
  1.154     +4 -4      parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.153
  retrieving revision 1.154
  diff -u -w -r1.153 -r1.154
  --- string.c  3 Nov 2003 12:54:48 -0000       1.153
  +++ string.c  4 Nov 2003 16:22:33 -0000       1.154
  @@ -1,7 +1,7 @@
   /* string.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string.c,v 1.153 2003/11/03 12:54:48 petergibbs Exp $
  + *     $Id: string.c,v 1.154 2003/11/04 16:22:33 petergibbs Exp $
    *  Overview:
    *     This is the api definitions for the string subsystem
    *  Data Structure and Algorithms:
  @@ -1232,7 +1232,7 @@
   
           UINTVAL c = s->encoding->decode(s->strstart);
   
  -        if (s->type->is_digit(s->type,c)
  +        if (Parrot_char_is_digit(s->type,c)
            && s->type->get_digit(s->type,c) == 0) {
               return 0;
           }
  @@ -1305,7 +1305,7 @@
           while (start < end) {
               UINTVAL c = s->encoding->decode(start);
   
  -            if (s->type->is_digit(s->type,c)) {
  +            if (Parrot_char_is_digit(s->type,c)) {
                   in_number = 1;
                   i = i * 10 + (c - '0');
               }
  @@ -1354,7 +1354,7 @@
   
           while (start < end) {
               UINTVAL c = s->encoding->decode(start);
  -            INTVAL df = s->type->is_digit(s->type,c);
  +            INTVAL df = Parrot_char_is_digit(s->type,c);
   
               if (df && !digit_family)
                   digit_family = df;
  
  
  

Reply via email to