cvsuser     04/07/09 01:43:13

  Modified:    classes  perlstring.pmc
               imcc     imcc.l imcc.y pbc.c symreg.c symreg.h
               include/parrot string_funcs.h
               languages/python pie-thon.pl
               languages/python/t/basic 03_types.t
               pf       pf_items.c
               src      string.c
  Log:
  Pie-thon 40 - unicode string parsing and repr()
  
  Revision  Changes    Path
  1.82      +12 -7     parrot/classes/perlstring.pmc
  
  Index: perlstring.pmc
  ===================================================================
  RCS file: /cvs/public/parrot/classes/perlstring.pmc,v
  retrieving revision 1.81
  retrieving revision 1.82
  diff -u -w -r1.81 -r1.82
  --- perlstring.pmc    9 Jul 2004 05:23:38 -0000       1.81
  +++ perlstring.pmc    9 Jul 2004 08:42:47 -0000       1.82
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: perlstring.pmc,v 1.81 2004/07/09 05:23:38 leo Exp $
  +$Id: perlstring.pmc,v 1.82 2004/07/09 08:42:47 leo Exp $
   
   =head1 NAME
   
  @@ -150,12 +150,17 @@
   */
   
       STRING* get_repr() {
  -        STRING *q = const_string(interpreter, "'");
  -        STRING *s = DYNSELF.get_string();
  -        STRING *repr = string_copy(interpreter, q);
  -        s = string_append(interpreter, q, s, 0);
  -        s = string_append(interpreter, s, q, 0);
  -        return s;
  +        STRING *start, *s, *q, *repr;
  +        q = const_string(interpreter, "'");
  +        s = DYNSELF.get_string();
  +        if (PObj_get_FLAGS(s) & PObj_private7_FLAG)
  +            start = const_string(interpreter, "u'");
  +        else
  +            start = q;
  +        repr = string_copy(interpreter, start);
  +        repr = string_append(interpreter, repr, s, 0);
  +        repr = string_append(interpreter, repr, q, 0);
  +        return repr;
       }
   
   /*
  
  
  
  1.108     +10 -0     parrot/imcc/imcc.l
  
  Index: imcc.l
  ===================================================================
  RCS file: /cvs/public/parrot/imcc/imcc.l,v
  retrieving revision 1.107
  retrieving revision 1.108
  diff -u -w -r1.107 -r1.108
  --- imcc.l    2 Jul 2004 20:36:07 -0000       1.107
  +++ imcc.l    9 Jul 2004 08:42:53 -0000       1.108
  @@ -90,6 +90,10 @@
   LABELLETTERDIGIT     ([EMAIL PROTECTED]|"::")
   ID              {LETTER}{LABELLETTERDIGIT}*
   STRINGCONSTANT  \"(\\\"|[^"\n]*)*\"
  +ENCCHAR         {LETTER}|{DIGIT}|"-"
  +ENCCHARS        {ENCCHAR}*
  +ENC             {LETTER}{ENCCHARS}":"
  +UNICODE         {ENC}{STRINGCONSTANT}
   CHARCONSTANT    \'[^'\n]*\'
   RANKSPEC        \[[,]*\]
   EOL          \r?\n
  @@ -426,6 +430,12 @@
           return(STRINGC); /* XXX delete quotes, -> emit, pbc */
       }
   
  +<*>{UNICODE} {
  +        char *p = strchr(yytext, '"');
  +     valp->s = str_dup(p);    /* enc:"..." */
  +        /* TODO pass charset */
  +        return(USTRINGC); /* XXX delete quotes, -> emit, pbc */
  +    }
   <*>{CHARCONSTANT} {
           valp->s = str_dup(yytext); /* XXX delete quotes, -> emit, pbc */
           return(STRINGC);
  
  
  
  1.142     +3 -1      parrot/imcc/imcc.y
  
  Index: imcc.y
  ===================================================================
  RCS file: /cvs/public/parrot/imcc/imcc.y,v
  retrieving revision 1.141
  retrieving revision 1.142
  diff -u -w -r1.141 -r1.142
  --- imcc.y    6 Jul 2004 13:00:35 -0000       1.141
  +++ imcc.y    9 Jul 2004 08:42:53 -0000       1.142
  @@ -277,7 +277,8 @@
   %token <t> PROTOTYPED NON_PROTOTYPED MAIN LOAD IMMEDIATE POSTCOMP METHOD
   %token <s> LABEL
   %token <t> EMIT EOM
  -%token <s> IREG NREG SREG PREG IDENTIFIER STRINGC INTC FLOATC REG MACRO ENDM
  +%token <s> IREG NREG SREG PREG IDENTIFIER REG MACRO ENDM
  +%token <s> STRINGC INTC FLOATC USTRINGC
   %token <s> PARROT_OP
   %type <t> type newsub ptr
   %type <i> program class class_body member_decls member_decl field_decl
  @@ -1064,6 +1065,7 @@
        INTC          {  $$ = mk_const($1, 'I'); }
      | FLOATC        {  $$ = mk_const($1, 'N'); }
      | STRINGC       {  $$ = mk_const($1, 'S'); }
  +   | USTRINGC      {  $$ = mk_const($1, 'U'); }
      ;
   
   string:
  
  
  
  1.84      +16 -8     parrot/imcc/pbc.c
  
  Index: pbc.c
  ===================================================================
  RCS file: /cvs/public/parrot/imcc/pbc.c,v
  retrieving revision 1.83
  retrieving revision 1.84
  diff -u -w -r1.83 -r1.84
  --- pbc.c     6 Jul 2004 13:00:35 -0000       1.83
  +++ pbc.c     9 Jul 2004 08:42:53 -0000       1.84
  @@ -68,7 +68,7 @@
   } globals;
   
   
  -static int add_const_str(Interp *, char *str);
  +static int add_const_str(Interp *, SymReg *r);
   
   static void imcc_globals_destroy(int ex, void *param);
   static opcode_t build_key(Interp *interpreter, SymReg *reg);
  @@ -421,7 +421,7 @@
               code_size += 2;
               /* add inter_segment jump */
               r[0] = mk_const(glabel, 'S');
  -            r[0]->color = add_const_str(interpreter, glabel);
  +            r[0]->color = add_const_str(interpreter, r[0]);
               INS(interpreter, unit, "branch_cs", "", r, 1, 0, 1);
           }
       }
  @@ -489,26 +489,34 @@
   
   /* add constant string to constant_table */
   static int
  -add_const_str(Interp *interpreter, char *str)
  +add_const_str(Interp *interpreter, SymReg *r)
   {
       int k;
  -    char *buf = str;
  +    char *buf = r->name;
       STRING *s = NULL;
  +    char *charset = NULL;
   
       /*
        * TODO strip delimiters in lexer, this needs adjustment in printint strings
        */
       if (*buf == '"') {
           buf++;
  -        s = string_unescape_cstring(interpreter, buf, '"');
  +        if (r->type & VT_UNICODE) {
  +            /*
  +             * not really a charset but our reprensentation
  +             */
  +            charset = "iso-8859-1"; /* still begin with ascii */
  +        }
  +        s = string_unescape_cstring(interpreter, buf, '"', charset);
       }
  -    else if (*buf == '\'') {
  +    else if (*buf == '\'') {   /* TODO handle python raw strings */
           buf++;
           s = string_make(interpreter, buf, strlen(buf) - 1, "iso-8859-1",
                                                PObj_constant_FLAG);
       }
       else {
  -        s = string_unescape_cstring(interpreter, buf, 0);
  +        /* unquoted bare name - ascii only for now */
  +        s = string_unescape_cstring(interpreter, buf, 0, NULL);
       }
   
       k = PDB_extend_const_table(interpreter);
  @@ -751,7 +759,7 @@
                   r->color = atol(r->name);
               break;
           case 'S':
  -            r->color = add_const_str(interpreter, r->name);
  +            r->color = add_const_str(interpreter, r);
               break;
           case 'N':
               r->color = add_const_num(interpreter, r->name);
  
  
  
  1.54      +4 -0      parrot/imcc/symreg.c
  
  Index: symreg.c
  ===================================================================
  RCS file: /cvs/public/parrot/imcc/symreg.c,v
  retrieving revision 1.53
  retrieving revision 1.54
  diff -u -w -r1.53 -r1.54
  --- symreg.c  17 Jun 2004 08:59:27 -0000      1.53
  +++ symreg.c  9 Jul 2004 08:42:53 -0000       1.54
  @@ -317,6 +317,10 @@
   {
       SymReg * r = _mk_symreg(hsh, name, t);
       r->type = VTCONST;
  +    if (t == 'U') {
  +        r->set = 'S';
  +        r->type |= VT_UNICODE;
  +    }
       r->use_count++;
       return r;
   }
  
  
  
  1.53      +2 -1      parrot/imcc/symreg.h
  
  Index: symreg.h
  ===================================================================
  RCS file: /cvs/public/parrot/imcc/symreg.h,v
  retrieving revision 1.52
  retrieving revision 1.53
  diff -u -w -r1.52 -r1.53
  --- symreg.h  7 Jul 2004 11:02:24 -0000       1.52
  +++ symreg.h  9 Jul 2004 08:42:53 -0000       1.53
  @@ -24,7 +24,8 @@
       VT_END_SLICE   = PF_VT_END_SLICE   ,
       VT_START_ZERO  = PF_VT_START_ZERO  ,   /* .. y 0..start */
       VT_END_INF     = PF_VT_END_INF     ,   /* x..  start..inf */
  -    VT_SLICE_BITS  = PF_VT_SLICE_BITS
  +    VT_SLICE_BITS  = PF_VT_SLICE_BITS,
  +    VT_UNICODE  = 1 << 16       /* unicode string constant */
   };
   
   /* this VARTYPE needs register allocation and such */
  
  
  
  1.42      +3 -3      parrot/include/parrot/string_funcs.h
  
  Index: string_funcs.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
  retrieving revision 1.41
  retrieving revision 1.42
  diff -u -w -r1.41 -r1.42
  --- string_funcs.h    18 Jun 2004 15:14:52 -0000      1.41
  +++ string_funcs.h    9 Jul 2004 08:42:57 -0000       1.42
  @@ -1,7 +1,7 @@
   /* string_funcs.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string_funcs.h,v 1.41 2004/06/18 15:14:52 leo Exp $
  + *     $Id: string_funcs.h,v 1.42 2004/07/09 08:42:57 leo Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -86,8 +86,8 @@
   UINTVAL string_decode_and_advance(struct string_iterator_t *i);
   
   size_t string_hash(Interp *interpreter, STRING *s);
  -STRING * string_unescape_cstring(struct Parrot_Interp *, char *cstring,
  -                                                                     char 
delimiter);
  +STRING * string_unescape_cstring(struct Parrot_Interp *,
  +        const char *cstring, char delimiter, const char *enc_or_charset);
   
   STRING *string_upcase(struct Parrot_Interp *, const STRING *);
   STRING *string_downcase(struct Parrot_Interp *, const STRING *);
  
  
  
  1.24      +37 -19    parrot/languages/python/pie-thon.pl
  
  Index: pie-thon.pl
  ===================================================================
  RCS file: /cvs/public/parrot/languages/python/pie-thon.pl,v
  retrieving revision 1.23
  retrieving revision 1.24
  diff -u -w -r1.23 -r1.24
  --- pie-thon.pl       8 Jul 2004 16:11:39 -0000       1.23
  +++ pie-thon.pl       9 Jul 2004 08:43:01 -0000       1.24
  @@ -163,7 +163,7 @@
                for (my $i = 0; $i < $n; $i++) {
                    my ($a, $def) = split(/=/, $args[$i]);
                    $a =~ s/\s//g;
  -                 $a = "'$a'";   # quote argument
  +                 $a = qq!"$a"!;   # quote argument
                    $def_arg_names{$f}{$a} = $i;
                    # print STDERR "def $f($a = $i)\n";
                }
  @@ -309,12 +309,15 @@
       elsif ($c =~ /^[+-]?\d+$/) {     # int
        $t = 'I';
       }
  -    elsif ($c =~ /^\d+[lL]$/) {      # bigint   XXX
  +    elsif ($c =~ /^\d+[lL]$/) {      # bigint
        $t = 'B';
       }
  -    elsif ($c =~ /^'.*'$/) { # string
  +    elsif ($c =~ /^'.*'$/) { # string consts are single quoted by dis
        $t = 'S';
       }
  +    elsif ($c =~ /^u'.*'$/) {        # unicode-string TODO r raw
  +     $t = 'U';
  +    }
       elsif (is_num($c)) {        # num
        $t = 'N';
       }
  @@ -337,6 +340,8 @@
   
   sub LOAD_CONST {
       my ($n, $c, $cmt) = @_;
  +    my $typ = typ($c);
  +    if ($typ eq 'P') {
       if ($c =~ /^[_a-zA-Z]/ && !$names{$c}) { # True, False ...
        print <<EOC;
        .local pmc $c $cmt
  @@ -344,7 +349,7 @@
   EOC
        $names{$c} = 1;
       }
  -    elsif (typ($c) eq 'P') {
  +     else {
        my $typ = $DEFVAR;
        if (is_imag($c)) {
            $typ = '.Complex';
  @@ -358,7 +363,8 @@
        push @stack, [$n, $pmc, 'P'];
        return;
       }
  -    elsif (typ($c) eq 'B') {
  +    }
  +    elsif ($typ eq 'B') {   # bigint
        my $typ = $DEFVAR;
           my $pmc = temp('P');
        $c =~ s/[lL]$//;
  @@ -369,12 +375,25 @@
        push @stack, [$n, $pmc, 'P'];
        return;
       }
  +    elsif ($typ =~ /[US]/) {   # strings
  +     # parrot has double quoted escapes
  +     $c =~ s/"/\\"/g;        # XXX unescape
  +     my $u = defined $1 ? $1 : "";
  +     if ($c =~ /^(u|U)?'(.*)'/) {
  +         my $u = defined $1 ? "u:" : "";
  +         my $s = $2;
  +         $c =~ s/.*/$u"$s"/;
  +     }
  +     print <<EOC;
  +     \t$cmt
  +EOC
  +    }
       else {
        print <<EOC;
        \t$cmt
   EOC
       }
  -    push @stack, [$n, $c, typ($c)];
  +    push @stack, [$n, $c, $typ];
   }
   sub STORE_NAME {
       my ($n, $c, $cmt) = @_;
  @@ -854,7 +873,6 @@
        my $val = pop @stack;
        my $arg = pop @stack;
        my $arg_name = $arg->[1];
  -     $val = $val;
        $j = $def_arg_names{$name}{$arg_name};
        print <<EOC;
        # func $name named arg $j name $arg_name val $val->[1]
  
  
  
  1.6       +8 -2      parrot/languages/python/t/basic/03_types.t
  
  Index: 03_types.t
  ===================================================================
  RCS file: /cvs/public/parrot/languages/python/t/basic/03_types.t,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -w -r1.5 -r1.6
  --- 03_types.t        9 Jul 2004 05:23:40 -0000       1.5
  +++ 03_types.t        9 Jul 2004 08:43:04 -0000       1.6
  @@ -1,9 +1,9 @@
  -# $Id: 03_types.t,v 1.5 2004/07/09 05:23:40 leo Exp $
  +# $Id: 03_types.t,v 1.6 2004/07/09 08:43:04 leo Exp $
   
   use strict;
   use lib '../../lib';
   
  -use Parrot::Test tests => 5;
  +use Parrot::Test tests => 6;
   
   sub test {
       language_output_is('python', $_[0], '', $_[1]);
  @@ -45,3 +45,9 @@
       print `"ab"`, "ab"
   CODE
   
  +test(<<'CODE', 'repr of strings');
  +if __name__ == '__main__':
  +    print `"ab"`, "ab"
  +    print `u"ab"`, u"ab"
  +CODE
  +
  
  
  
  1.18      +8 -8      parrot/pf/pf_items.c
  
  Index: pf_items.c
  ===================================================================
  RCS file: /cvs/public/parrot/pf/pf_items.c,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -w -r1.17 -r1.18
  --- pf_items.c        14 Apr 2004 08:26:22 -0000      1.17
  +++ pf_items.c        9 Jul 2004 08:43:07 -0000       1.18
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: pf_items.c,v 1.17 2004/04/14 08:26:22 leo Exp $
  +$Id: pf_items.c,v 1.18 2004/07/09 08:43:07 leo Exp $
   
   =head1 NAME
   
  @@ -505,7 +505,7 @@
   
       flags = PF_fetch_opcode(pf, cursor);
       /* don't let PBC mess our internals - only constant or not */
  -    flags &= PObj_constant_FLAG;
  +    flags &= (PObj_constant_FLAG | PObj_private7_FLAG);
       representation = PF_fetch_opcode(pf, cursor);
   
       /* These may need to be separate */
  @@ -589,7 +589,7 @@
           padded_size += sizeof(opcode_t) - (padded_size % sizeof(opcode_t));
       }
   
  -    *cursor++ = PObj_get_FLAGS(s); /* only constant_FLAG */
  +    *cursor++ = PObj_get_FLAGS(s); /* only constant_FLAG and private7 */
       *cursor++ = s->representation;
       *cursor++ = s->bufused;
   
  
  
  
  1.207     +11 -6     parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.206
  retrieving revision 1.207
  diff -u -w -r1.206 -r1.207
  --- string.c  18 Jun 2004 15:14:56 -0000      1.206
  +++ string.c  9 Jul 2004 08:43:13 -0000       1.207
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: string.c,v 1.206 2004/06/18 15:14:56 leo Exp $
  +$Id: string.c,v 1.207 2004/07/09 08:43:13 leo Exp $
   
   =head1 NAME
   
  @@ -243,7 +243,7 @@
   string_init(Parrot_Interp interpreter)
   {
       size_t i;
  -    char *data_dir;
  +    const char *data_dir;
       int free_data_dir = 0;
   
       /* DEFAULT_ICU_DATA_DIR is configured at build time, or it may be
  @@ -2895,7 +2895,7 @@
   
   =item C<STRING *
   string_unescape_cstring(Interp * interpreter,
  -    char *cstring, char delimiter)>
  +    char *cstring, char delimiter, char *charset)>
   
   Unescapes the specified C string. These sequences are covered:
   
  @@ -2987,7 +2987,7 @@
   
   STRING *
   string_unescape_cstring(Interp * interpreter,
  -    char *cstring, char delimiter)
  +    const char *cstring, char delimiter, const char *charset)
   {
       size_t clength = strlen(cstring);
       STRING *result;
  @@ -2995,11 +2995,16 @@
       Parrot_UInt4 r;
       Parrot_unescape_cb char_at;
       char_setter_func set_char_at;
  +    UINTVAL flags;
   
       if (delimiter && clength)
           --clength;
  -    result = string_make(interpreter, cstring, clength, "iso-8859-1",
  -            PObj_constant_FLAG);
  +    flags = PObj_constant_FLAG;
  +    if (!charset)
  +        charset = "iso-8859-1";
  +    else
  +        flags |= PObj_private7_FLAG;  /* Pythonic unicode flag */
  +    result = string_make(interpreter, cstring, clength, charset, flags);
       char_at     = set_char_getter(result);
       set_char_at = set_char_setter(result);
   
  
  
  

Reply via email to