cvsuser 04/07/09 01:43:13
Modified: classes perlstring.pmc
imcc imcc.l imcc.y pbc.c symreg.c symreg.h
include/parrot string_funcs.h
languages/python pie-thon.pl
languages/python/t/basic 03_types.t
pf pf_items.c
src string.c
Log:
Pie-thon 40 - unicode string parsing and repr()
Revision Changes Path
1.82 +12 -7 parrot/classes/perlstring.pmc
Index: perlstring.pmc
===================================================================
RCS file: /cvs/public/parrot/classes/perlstring.pmc,v
retrieving revision 1.81
retrieving revision 1.82
diff -u -w -r1.81 -r1.82
--- perlstring.pmc 9 Jul 2004 05:23:38 -0000 1.81
+++ perlstring.pmc 9 Jul 2004 08:42:47 -0000 1.82
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: perlstring.pmc,v 1.81 2004/07/09 05:23:38 leo Exp $
+$Id: perlstring.pmc,v 1.82 2004/07/09 08:42:47 leo Exp $
=head1 NAME
@@ -150,12 +150,17 @@
*/
STRING* get_repr() {
- STRING *q = const_string(interpreter, "'");
- STRING *s = DYNSELF.get_string();
- STRING *repr = string_copy(interpreter, q);
- s = string_append(interpreter, q, s, 0);
- s = string_append(interpreter, s, q, 0);
- return s;
+ STRING *start, *s, *q, *repr;
+ q = const_string(interpreter, "'");
+ s = DYNSELF.get_string();
+ if (PObj_get_FLAGS(s) & PObj_private7_FLAG)
+ start = const_string(interpreter, "u'");
+ else
+ start = q;
+ repr = string_copy(interpreter, start);
+ repr = string_append(interpreter, repr, s, 0);
+ repr = string_append(interpreter, repr, q, 0);
+ return repr;
}
/*
1.108 +10 -0 parrot/imcc/imcc.l
Index: imcc.l
===================================================================
RCS file: /cvs/public/parrot/imcc/imcc.l,v
retrieving revision 1.107
retrieving revision 1.108
diff -u -w -r1.107 -r1.108
--- imcc.l 2 Jul 2004 20:36:07 -0000 1.107
+++ imcc.l 9 Jul 2004 08:42:53 -0000 1.108
@@ -90,6 +90,10 @@
LABELLETTERDIGIT ([EMAIL PROTECTED]|"::")
ID {LETTER}{LABELLETTERDIGIT}*
STRINGCONSTANT \"(\\\"|[^"\n]*)*\"
+ENCCHAR {LETTER}|{DIGIT}|"-"
+ENCCHARS {ENCCHAR}*
+ENC {LETTER}{ENCCHARS}":"
+UNICODE {ENC}{STRINGCONSTANT}
CHARCONSTANT \'[^'\n]*\'
RANKSPEC \[[,]*\]
EOL \r?\n
@@ -426,6 +430,12 @@
return(STRINGC); /* XXX delete quotes, -> emit, pbc */
}
+<*>{UNICODE} {
+ char *p = strchr(yytext, '"');
+ valp->s = str_dup(p); /* enc:"..." */
+ /* TODO pass charset */
+ return(USTRINGC); /* XXX delete quotes, -> emit, pbc */
+ }
<*>{CHARCONSTANT} {
valp->s = str_dup(yytext); /* XXX delete quotes, -> emit, pbc */
return(STRINGC);
1.142 +3 -1 parrot/imcc/imcc.y
Index: imcc.y
===================================================================
RCS file: /cvs/public/parrot/imcc/imcc.y,v
retrieving revision 1.141
retrieving revision 1.142
diff -u -w -r1.141 -r1.142
--- imcc.y 6 Jul 2004 13:00:35 -0000 1.141
+++ imcc.y 9 Jul 2004 08:42:53 -0000 1.142
@@ -277,7 +277,8 @@
%token <t> PROTOTYPED NON_PROTOTYPED MAIN LOAD IMMEDIATE POSTCOMP METHOD
%token <s> LABEL
%token <t> EMIT EOM
-%token <s> IREG NREG SREG PREG IDENTIFIER STRINGC INTC FLOATC REG MACRO ENDM
+%token <s> IREG NREG SREG PREG IDENTIFIER REG MACRO ENDM
+%token <s> STRINGC INTC FLOATC USTRINGC
%token <s> PARROT_OP
%type <t> type newsub ptr
%type <i> program class class_body member_decls member_decl field_decl
@@ -1064,6 +1065,7 @@
INTC { $$ = mk_const($1, 'I'); }
| FLOATC { $$ = mk_const($1, 'N'); }
| STRINGC { $$ = mk_const($1, 'S'); }
+ | USTRINGC { $$ = mk_const($1, 'U'); }
;
string:
1.84 +16 -8 parrot/imcc/pbc.c
Index: pbc.c
===================================================================
RCS file: /cvs/public/parrot/imcc/pbc.c,v
retrieving revision 1.83
retrieving revision 1.84
diff -u -w -r1.83 -r1.84
--- pbc.c 6 Jul 2004 13:00:35 -0000 1.83
+++ pbc.c 9 Jul 2004 08:42:53 -0000 1.84
@@ -68,7 +68,7 @@
} globals;
-static int add_const_str(Interp *, char *str);
+static int add_const_str(Interp *, SymReg *r);
static void imcc_globals_destroy(int ex, void *param);
static opcode_t build_key(Interp *interpreter, SymReg *reg);
@@ -421,7 +421,7 @@
code_size += 2;
/* add inter_segment jump */
r[0] = mk_const(glabel, 'S');
- r[0]->color = add_const_str(interpreter, glabel);
+ r[0]->color = add_const_str(interpreter, r[0]);
INS(interpreter, unit, "branch_cs", "", r, 1, 0, 1);
}
}
@@ -489,26 +489,34 @@
/* add constant string to constant_table */
static int
-add_const_str(Interp *interpreter, char *str)
+add_const_str(Interp *interpreter, SymReg *r)
{
int k;
- char *buf = str;
+ char *buf = r->name;
STRING *s = NULL;
+ char *charset = NULL;
/*
* TODO strip delimiters in lexer, this needs adjustment in printint strings
*/
if (*buf == '"') {
buf++;
- s = string_unescape_cstring(interpreter, buf, '"');
+ if (r->type & VT_UNICODE) {
+ /*
+ * not really a charset but our reprensentation
+ */
+ charset = "iso-8859-1"; /* still begin with ascii */
+ }
+ s = string_unescape_cstring(interpreter, buf, '"', charset);
}
- else if (*buf == '\'') {
+ else if (*buf == '\'') { /* TODO handle python raw strings */
buf++;
s = string_make(interpreter, buf, strlen(buf) - 1, "iso-8859-1",
PObj_constant_FLAG);
}
else {
- s = string_unescape_cstring(interpreter, buf, 0);
+ /* unquoted bare name - ascii only for now */
+ s = string_unescape_cstring(interpreter, buf, 0, NULL);
}
k = PDB_extend_const_table(interpreter);
@@ -751,7 +759,7 @@
r->color = atol(r->name);
break;
case 'S':
- r->color = add_const_str(interpreter, r->name);
+ r->color = add_const_str(interpreter, r);
break;
case 'N':
r->color = add_const_num(interpreter, r->name);
1.54 +4 -0 parrot/imcc/symreg.c
Index: symreg.c
===================================================================
RCS file: /cvs/public/parrot/imcc/symreg.c,v
retrieving revision 1.53
retrieving revision 1.54
diff -u -w -r1.53 -r1.54
--- symreg.c 17 Jun 2004 08:59:27 -0000 1.53
+++ symreg.c 9 Jul 2004 08:42:53 -0000 1.54
@@ -317,6 +317,10 @@
{
SymReg * r = _mk_symreg(hsh, name, t);
r->type = VTCONST;
+ if (t == 'U') {
+ r->set = 'S';
+ r->type |= VT_UNICODE;
+ }
r->use_count++;
return r;
}
1.53 +2 -1 parrot/imcc/symreg.h
Index: symreg.h
===================================================================
RCS file: /cvs/public/parrot/imcc/symreg.h,v
retrieving revision 1.52
retrieving revision 1.53
diff -u -w -r1.52 -r1.53
--- symreg.h 7 Jul 2004 11:02:24 -0000 1.52
+++ symreg.h 9 Jul 2004 08:42:53 -0000 1.53
@@ -24,7 +24,8 @@
VT_END_SLICE = PF_VT_END_SLICE ,
VT_START_ZERO = PF_VT_START_ZERO , /* .. y 0..start */
VT_END_INF = PF_VT_END_INF , /* x.. start..inf */
- VT_SLICE_BITS = PF_VT_SLICE_BITS
+ VT_SLICE_BITS = PF_VT_SLICE_BITS,
+ VT_UNICODE = 1 << 16 /* unicode string constant */
};
/* this VARTYPE needs register allocation and such */
1.42 +3 -3 parrot/include/parrot/string_funcs.h
Index: string_funcs.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -w -r1.41 -r1.42
--- string_funcs.h 18 Jun 2004 15:14:52 -0000 1.41
+++ string_funcs.h 9 Jul 2004 08:42:57 -0000 1.42
@@ -1,7 +1,7 @@
/* string_funcs.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string_funcs.h,v 1.41 2004/06/18 15:14:52 leo Exp $
+ * $Id: string_funcs.h,v 1.42 2004/07/09 08:42:57 leo Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -86,8 +86,8 @@
UINTVAL string_decode_and_advance(struct string_iterator_t *i);
size_t string_hash(Interp *interpreter, STRING *s);
-STRING * string_unescape_cstring(struct Parrot_Interp *, char *cstring,
- char
delimiter);
+STRING * string_unescape_cstring(struct Parrot_Interp *,
+ const char *cstring, char delimiter, const char *enc_or_charset);
STRING *string_upcase(struct Parrot_Interp *, const STRING *);
STRING *string_downcase(struct Parrot_Interp *, const STRING *);
1.24 +37 -19 parrot/languages/python/pie-thon.pl
Index: pie-thon.pl
===================================================================
RCS file: /cvs/public/parrot/languages/python/pie-thon.pl,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -w -r1.23 -r1.24
--- pie-thon.pl 8 Jul 2004 16:11:39 -0000 1.23
+++ pie-thon.pl 9 Jul 2004 08:43:01 -0000 1.24
@@ -163,7 +163,7 @@
for (my $i = 0; $i < $n; $i++) {
my ($a, $def) = split(/=/, $args[$i]);
$a =~ s/\s//g;
- $a = "'$a'"; # quote argument
+ $a = qq!"$a"!; # quote argument
$def_arg_names{$f}{$a} = $i;
# print STDERR "def $f($a = $i)\n";
}
@@ -309,12 +309,15 @@
elsif ($c =~ /^[+-]?\d+$/) { # int
$t = 'I';
}
- elsif ($c =~ /^\d+[lL]$/) { # bigint XXX
+ elsif ($c =~ /^\d+[lL]$/) { # bigint
$t = 'B';
}
- elsif ($c =~ /^'.*'$/) { # string
+ elsif ($c =~ /^'.*'$/) { # string consts are single quoted by dis
$t = 'S';
}
+ elsif ($c =~ /^u'.*'$/) { # unicode-string TODO r raw
+ $t = 'U';
+ }
elsif (is_num($c)) { # num
$t = 'N';
}
@@ -337,6 +340,8 @@
sub LOAD_CONST {
my ($n, $c, $cmt) = @_;
+ my $typ = typ($c);
+ if ($typ eq 'P') {
if ($c =~ /^[_a-zA-Z]/ && !$names{$c}) { # True, False ...
print <<EOC;
.local pmc $c $cmt
@@ -344,7 +349,7 @@
EOC
$names{$c} = 1;
}
- elsif (typ($c) eq 'P') {
+ else {
my $typ = $DEFVAR;
if (is_imag($c)) {
$typ = '.Complex';
@@ -358,7 +363,8 @@
push @stack, [$n, $pmc, 'P'];
return;
}
- elsif (typ($c) eq 'B') {
+ }
+ elsif ($typ eq 'B') { # bigint
my $typ = $DEFVAR;
my $pmc = temp('P');
$c =~ s/[lL]$//;
@@ -369,12 +375,25 @@
push @stack, [$n, $pmc, 'P'];
return;
}
+ elsif ($typ =~ /[US]/) { # strings
+ # parrot has double quoted escapes
+ $c =~ s/"/\\"/g; # XXX unescape
+ my $u = defined $1 ? $1 : "";
+ if ($c =~ /^(u|U)?'(.*)'/) {
+ my $u = defined $1 ? "u:" : "";
+ my $s = $2;
+ $c =~ s/.*/$u"$s"/;
+ }
+ print <<EOC;
+ \t$cmt
+EOC
+ }
else {
print <<EOC;
\t$cmt
EOC
}
- push @stack, [$n, $c, typ($c)];
+ push @stack, [$n, $c, $typ];
}
sub STORE_NAME {
my ($n, $c, $cmt) = @_;
@@ -854,7 +873,6 @@
my $val = pop @stack;
my $arg = pop @stack;
my $arg_name = $arg->[1];
- $val = $val;
$j = $def_arg_names{$name}{$arg_name};
print <<EOC;
# func $name named arg $j name $arg_name val $val->[1]
1.6 +8 -2 parrot/languages/python/t/basic/03_types.t
Index: 03_types.t
===================================================================
RCS file: /cvs/public/parrot/languages/python/t/basic/03_types.t,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -w -r1.5 -r1.6
--- 03_types.t 9 Jul 2004 05:23:40 -0000 1.5
+++ 03_types.t 9 Jul 2004 08:43:04 -0000 1.6
@@ -1,9 +1,9 @@
-# $Id: 03_types.t,v 1.5 2004/07/09 05:23:40 leo Exp $
+# $Id: 03_types.t,v 1.6 2004/07/09 08:43:04 leo Exp $
use strict;
use lib '../../lib';
-use Parrot::Test tests => 5;
+use Parrot::Test tests => 6;
sub test {
language_output_is('python', $_[0], '', $_[1]);
@@ -45,3 +45,9 @@
print `"ab"`, "ab"
CODE
+test(<<'CODE', 'repr of strings');
+if __name__ == '__main__':
+ print `"ab"`, "ab"
+ print `u"ab"`, u"ab"
+CODE
+
1.18 +8 -8 parrot/pf/pf_items.c
Index: pf_items.c
===================================================================
RCS file: /cvs/public/parrot/pf/pf_items.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -w -r1.17 -r1.18
--- pf_items.c 14 Apr 2004 08:26:22 -0000 1.17
+++ pf_items.c 9 Jul 2004 08:43:07 -0000 1.18
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: pf_items.c,v 1.17 2004/04/14 08:26:22 leo Exp $
+$Id: pf_items.c,v 1.18 2004/07/09 08:43:07 leo Exp $
=head1 NAME
@@ -505,7 +505,7 @@
flags = PF_fetch_opcode(pf, cursor);
/* don't let PBC mess our internals - only constant or not */
- flags &= PObj_constant_FLAG;
+ flags &= (PObj_constant_FLAG | PObj_private7_FLAG);
representation = PF_fetch_opcode(pf, cursor);
/* These may need to be separate */
@@ -589,7 +589,7 @@
padded_size += sizeof(opcode_t) - (padded_size % sizeof(opcode_t));
}
- *cursor++ = PObj_get_FLAGS(s); /* only constant_FLAG */
+ *cursor++ = PObj_get_FLAGS(s); /* only constant_FLAG and private7 */
*cursor++ = s->representation;
*cursor++ = s->bufused;
1.207 +11 -6 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.206
retrieving revision 1.207
diff -u -w -r1.206 -r1.207
--- string.c 18 Jun 2004 15:14:56 -0000 1.206
+++ string.c 9 Jul 2004 08:43:13 -0000 1.207
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.206 2004/06/18 15:14:56 leo Exp $
+$Id: string.c,v 1.207 2004/07/09 08:43:13 leo Exp $
=head1 NAME
@@ -243,7 +243,7 @@
string_init(Parrot_Interp interpreter)
{
size_t i;
- char *data_dir;
+ const char *data_dir;
int free_data_dir = 0;
/* DEFAULT_ICU_DATA_DIR is configured at build time, or it may be
@@ -2895,7 +2895,7 @@
=item C<STRING *
string_unescape_cstring(Interp * interpreter,
- char *cstring, char delimiter)>
+ char *cstring, char delimiter, char *charset)>
Unescapes the specified C string. These sequences are covered:
@@ -2987,7 +2987,7 @@
STRING *
string_unescape_cstring(Interp * interpreter,
- char *cstring, char delimiter)
+ const char *cstring, char delimiter, const char *charset)
{
size_t clength = strlen(cstring);
STRING *result;
@@ -2995,11 +2995,16 @@
Parrot_UInt4 r;
Parrot_unescape_cb char_at;
char_setter_func set_char_at;
+ UINTVAL flags;
if (delimiter && clength)
--clength;
- result = string_make(interpreter, cstring, clength, "iso-8859-1",
- PObj_constant_FLAG);
+ flags = PObj_constant_FLAG;
+ if (!charset)
+ charset = "iso-8859-1";
+ else
+ flags |= PObj_private7_FLAG; /* Pythonic unicode flag */
+ result = string_make(interpreter, cstring, clength, charset, flags);
char_at = set_char_getter(result);
set_char_at = set_char_setter(result);