cvsuser 05/03/02 05:47:33
Modified: charset ascii.c iso-8859-1.c unicode.c
ops ops.num string.ops
src string.c string_primitives.c
t/op string_cs.t
Log:
Strings. Finally. 13 - some unicode conversions
* convert to and from unicode - utf8 only
* fix string unescaping, use a string iter
* new bytelength opcode
Revision Changes Path
1.18 +29 -4 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -r1.17 -r1.18
--- ascii.c 2 Mar 2005 10:43:13 -0000 1.17
+++ ascii.c 2 Mar 2005 13:47:28 -0000 1.18
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.17 2005/03/02 10:43:13 leo Exp $
+$Id: ascii.c,v 1.18 2005/03/02 13:47:28 leo Exp $
=head1 NAME
@@ -137,10 +137,35 @@
}
STRING *
-ascii_to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
+ascii_to_unicode(Interp *interpreter, STRING *src, STRING *dest)
{
- internal_exception(UNIMPLEMENTED,
- "to_unicode for iso-8859-1 not implemented");
+ UINTVAL offs, c;
+ String_iter iter;
+
+ if (dest) {
+ dest->charset = Parrot_unicode_charset_ptr;
+ dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, dest);
+ Parrot_reallocate_string(interpreter, dest, src->strlen);
+ ENCODING_ITER_INIT(interpreter, dest, &iter);
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = ENCODING_GET_BYTE(interpreter, src, offs);
+ if (iter.bytepos >= PObj_buflen(dest) - 4) {
+ UINTVAL need = (src->strlen - offs) * 1.5;
+ if (need < 16)
+ need = 16;
+ Parrot_reallocate_string(interpreter, dest,
+ PObj_buflen(dest) + need);
+ }
+ iter.set_and_advance(interpreter, &iter, c);
+ }
+ dest->bufused = iter.bytepos;
+ dest->strlen = iter.charpos;
+ return dest;
+ }
+ else {
+ internal_exception(UNIMPLEMENTED,
+ "to_unicode inplace for iso-8859-1 not implemented");
+ }
return NULL;
}
1.13 +23 -4 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -r1.12 -r1.13
--- iso-8859-1.c 1 Mar 2005 17:25:44 -0000 1.12
+++ iso-8859-1.c 2 Mar 2005 13:47:28 -0000 1.13
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.12 2005/03/01 17:25:44 leo Exp $
+$Id: iso-8859-1.c,v 1.13 2005/03/02 13:47:28 leo Exp $
=head1 NAME
@@ -64,10 +64,29 @@
}
static STRING *
-from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
+from_charset(Interp *interpreter, STRING *src, STRING *dest)
{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
+ UINTVAL offs, c;
+ String_iter iter;
+
+ if (dest) {
+ Parrot_reallocate_string(interpreter, dest, src->strlen);
+ dest->bufused = src->strlen;
+ dest->strlen = src->strlen;
+ }
+ ENCODING_ITER_INIT(interpreter, src, &iter);
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = iter.get_and_advance(interpreter, &iter);
+ if (c >= 0x100) {
+ EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
+ }
+ if (dest)
+ ENCODING_SET_BYTE(interpreter, dest, offs, c);
+ }
+ if (dest)
+ return dest;
+ src->charset = Parrot_ascii_charset_ptr;
+ return src;
}
static STRING *
1.2 +22 -5 parrot/charset/unicode.c
Index: unicode.c
===================================================================
RCS file: /cvs/public/parrot/charset/unicode.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- unicode.c 2 Mar 2005 09:03:25 -0000 1.1
+++ unicode.c 2 Mar 2005 13:47:28 -0000 1.2
@@ -1,6 +1,6 @@
/*
Copyright: 2005 The Perl Foundation. All Rights Reserved.
-$Id: unicode.c,v 1.1 2005/03/02 09:03:25 leo Exp $
+$Id: unicode.c,v 1.2 2005/03/02 13:47:28 leo Exp $
=head1 NAME
@@ -50,10 +50,19 @@
}
static STRING*
-to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING
*dest)
+to_charset(Interp *interpreter, STRING *src,
+ CHARSET *new_charset, STRING *dest)
{
- UNIMPL;
- return NULL;
+ charset_converter_t conversion_func;
+
+ if ((conversion_func = Parrot_find_charset_converter(interpreter,
+ src->charset, new_charset))) {
+ return conversion_func(interpreter, src, dest);
+ }
+ else {
+ return new_charset->from_charset(interpreter, src, dest);
+
+ }
}
static STRING*
@@ -64,8 +73,16 @@
}
static STRING*
-from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
+from_charset(Interp *interpreter, STRING *src, STRING *dest)
{
+ if (src->charset == Parrot_unicode_charset_ptr) {
+ if (!dest) {
+ /* inplace ok */
+ return src;
+ }
+ Parrot_reuse_COW_reference(interpreter, src, dest);
+ return dest;
+ }
UNIMPL;
return NULL;
}
1.61 +2 -0 parrot/ops/ops.num
Index: ops.num
===================================================================
RCS file: /cvs/public/parrot/ops/ops.num,v
retrieving revision 1.60
retrieving revision 1.61
diff -u -r1.60 -r1.61
--- ops.num 1 Mar 2005 14:19:47 -0000 1.60
+++ ops.num 2 Mar 2005 13:47:29 -0000 1.61
@@ -1428,3 +1428,5 @@
trans_charset_s_s_ic 1398
trans_charset_s_sc_i 1399
trans_charset_s_sc_ic 1400
+bytelength_i_s 1401
+bytelength_i_sc 1402
1.36 +17 -0 parrot/ops/string.ops
Index: string.ops
===================================================================
RCS file: /cvs/public/parrot/ops/string.ops,v
retrieving revision 1.35
retrieving revision 1.36
diff -u -r1.35 -r1.36
--- string.ops 1 Mar 2005 14:19:47 -0000 1.35
+++ string.ops 2 Mar 2005 13:47:29 -0000 1.36
@@ -155,6 +155,10 @@
Set $1 to the length (in characters) of the string in $2.
+=item B<bytelength>(out INT, in STR)
+
+Set $1 to the length (in bytes) of the string in $2.
+
=cut
inline op length(out INT, in STR) :base_mem {
@@ -162,6 +166,19 @@
goto NEXT();
}
+inline op bytelength(out INT, in STR) :base_mem {
+ UINTVAL n;
+ STRING *s = $2;
+ if (!s)
+ n = 0;
+ else {
+ n = s->bufused;
+ assert(n == ENCODING_BYTES(interpreter, $2));
+ }
+ $1 = n;
+ goto NEXT();
+}
+
=item B<pin>(inout STR)
Make the memory in $1 immobile. This memory will I<not> be moved by
1.242 +22 -19 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.241
retrieving revision 1.242
diff -u -r1.241 -r1.242
--- string.c 2 Mar 2005 10:43:16 -0000 1.241
+++ string.c 2 Mar 2005 13:47:32 -0000 1.242
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.241 2005/03/02 10:43:16 leo Exp $
+$Id: string.c,v 1.242 2005/03/02 13:47:32 leo Exp $
=head1 NAME
@@ -318,12 +318,6 @@
mem_sys_free((void*)data_dir); /* cast away the constness */
}
-/* --- Perhaps these should be uncommented - Leo
- encoding_init();
- chartype_init();
- string_native_type = chartype_lookup("usascii");
- string_unicode_type = chartype_lookup("unicode");
-*/
/*
* initialize the constant string table
*/
@@ -615,16 +609,8 @@
"string_make: no charset name specified");
}
- if (strcmp(charset_name, "iso-8859-1") == 0 ) {
- charset = Parrot_iso_8859_1_charset_ptr;
- }
- else if (strcmp(charset_name, "ascii") == 0 ) {
- charset = Parrot_ascii_charset_ptr;
- }
- else if (strcmp(charset_name, "binary") == 0 ) {
- charset = Parrot_binary_charset_ptr;
- }
- else {
+ charset = Parrot_find_charset(interpreter, charset_name);
+ if (!charset) {
internal_exception(UNIMPLEMENTED,
"Can't make '%s' charset strings", charset_name);
}
@@ -2284,6 +2270,8 @@
UINTVAL offs, d;
Parrot_UInt4 r;
UINTVAL flags;
+ String_iter iter;
+ ENCODING *encoding;
if (delimiter && clength)
--clength;
@@ -2293,6 +2281,15 @@
else
flags |= PObj_private7_FLAG; /* Pythonic unicode flag */
result = string_make(interpreter, cstring, clength, charset, flags);
+ result->strlen = clength;
+
+ ENCODING_ITER_INIT(interpreter, result, &iter);
+ /*
+ * reset encoding so that the destination encoding isn't used
+ * TODO if an encoding is given too just use it
+ */
+ encoding = result->encoding;
+ result->encoding = Parrot_fixed_8_encoding_ptr;
for (offs = d = 0; offs < clength; ++offs) {
r = CHARSET_GET_CODEPOINT(interpreter, result, offs);
@@ -2306,13 +2303,19 @@
--offs;
}
if (d == offs) {
+ /* we did it in place - no action */
++d;
+ iter.bytepos++;
+ iter.charpos++;
continue;
}
- CHARSET_SET_CODEPOINT(interpreter, result, d++, r);
+ assert(d < offs);
+ iter.set_and_advance(interpreter, &iter, r);
+ ++d;
}
+ result->encoding = encoding;
result->strlen = d;
- result->bufused = string_max_bytes(interpreter, result, d);
+ result->bufused = iter.bytepos;
return result;
}
1.11 +11 -11 parrot/src/string_primitives.c
Index: string_primitives.c
===================================================================
RCS file: /cvs/public/parrot/src/string_primitives.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- string_primitives.c 27 Feb 2005 09:58:47 -0000 1.10
+++ string_primitives.c 2 Mar 2005 13:47:32 -0000 1.11
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: string_primitives.c,v 1.10 2005/02/27 09:58:47 leo Exp $
+$Id: string_primitives.c,v 1.11 2005/03/02 13:47:32 leo Exp $
=head1 NAME
@@ -154,11 +154,11 @@
UINTVAL charcount = 0;
UINTVAL len = string_length(interpreter, string);
/* Well, not right now */
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
switch (codepoint) {
case 'x':
++*offset;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar = codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -171,7 +171,7 @@
++*offset;
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -188,7 +188,7 @@
return workchar;
case 'c':
++*offset;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= 'A' && codepoint <= 'Z') {
workchar = codepoint - 'A' + 1;
} else {
@@ -198,7 +198,7 @@
return workchar;
case 'u':
++*offset;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar = codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -212,7 +212,7 @@
for (charcount = 1; charcount < 4; charcount++) {
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string,
*offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -230,7 +230,7 @@
return workchar;
case 'U':
++*offset;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar = codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -244,7 +244,7 @@
for (charcount = 1; charcount < 8; charcount++) {
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string,
*offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
} else if (codepoint >= 'a' && codepoint <= 'f') {
@@ -274,7 +274,7 @@
++*offset;
if (*offset < len) {
workchar *= 8;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '7') {
workchar += codepoint - '0';
} else {
@@ -286,7 +286,7 @@
++*offset;
if (*offset < len) {
workchar *= 8;
- codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
+ codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
if (codepoint >= '0' && codepoint <= '7') {
workchar += codepoint - '0';
} else {
1.8 +46 -2 parrot/t/op/string_cs.t
Index: string_cs.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string_cs.t,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- string_cs.t 1 Mar 2005 15:41:31 -0000 1.7
+++ string_cs.t 2 Mar 2005 13:47:33 -0000 1.8
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: string_cs.t,v 1.7 2005/03/01 15:41:31 leo Exp $
+# $Id: string_cs.t,v 1.8 2005/03/02 13:47:33 leo Exp $
=head1 NAME
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 26;
+use Parrot::Test tests => 28;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -401,3 +401,47 @@
iso-8859-1
OUTPUT
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to unicode");
+ set S0, "abc_�_"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ length I2, S1
+ print I2
+ print "\n"
+ end
+CODE
+abc_\xc3\xa4_
+unicode
+6
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i unicode to iso-8859-1");
+ set S0, unicode:"abc_\xe4_"
+ bytelength I2, S0 # XXX its 7 for utf8 only
+ print I2
+ print "\n"
+ find_charset I0, "iso-8859-1"
+ trans_charset S1, S0, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ length I2, S1
+ print I2
+ print "\n"
+ end
+CODE
+7
+abc_�_
+iso-8859-1
+6
+OUTPUT
+