cvsuser 05/03/02 09:00:52
Modified: charset unicode.c
encodings utf8.c
io io_utf8.c
src string.c
Log:
Strings. Finally. 15 - utf8 transcoding
* implement utf8.copy_to_encoding
* use it in utf8 IO filter
* remove constness warning
Revision Changes Path
1.4 +10 -7 parrot/charset/unicode.c
Index: unicode.c
===================================================================
RCS file: /cvs/public/parrot/charset/unicode.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- unicode.c 2 Mar 2005 15:32:58 -0000 1.3
+++ unicode.c 2 Mar 2005 17:00:49 -0000 1.4
@@ -1,6 +1,6 @@
/*
Copyright: 2005 The Perl Foundation. All Rights Reserved.
-$Id: unicode.c,v 1.3 2005/03/02 15:32:58 leo Exp $
+$Id: unicode.c,v 1.4 2005/03/02 17:00:49 leo Exp $
=head1 NAME
@@ -165,7 +165,6 @@
return 0;
}
-/* Binary's always valid */
static UINTVAL
validate(Interp *interpreter, STRING *source_string)
{
@@ -287,12 +286,16 @@
static STRING *
string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
{
- STRING *return_string;
+ STRING *dest;
+ String_iter iter;
- return_string = string_make(interpreter, "", 1, "unicode", 0);
- return_string->strlen = 1;
- ENCODING_SET_CODEPOINT(interpreter, return_string, 0, codepoint);
- return return_string;
+ dest = string_make(interpreter, "", 1, "unicode", 0);
+ dest->strlen = 1;
+ ENCODING_ITER_INIT(interpreter, dest, &iter);
+ iter.set_and_advance(interpreter, &iter, codepoint);
+ dest->bufused = iter.bytepos;
+
+ return dest;
}
static size_t
1.24 +46 -8 parrot/encodings/utf8.c
Index: utf8.c
===================================================================
RCS file: /cvs/public/parrot/encodings/utf8.c,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- utf8.c 2 Mar 2005 15:32:59 -0000 1.23
+++ utf8.c 2 Mar 2005 17:00:50 -0000 1.24
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: utf8.c,v 1.23 2005/03/02 15:32:59 leo Exp $
+$Id: utf8.c,v 1.24 2005/03/02 17:00:50 leo Exp $
=head1 NAME
@@ -306,23 +306,55 @@
/* This function needs to go through and get all the code points one
- by one and turn them into a byte */
+ by one and turn them into a utf8 sequence */
static void
to_encoding(Interp *interpreter, STRING *src)
{
+ if (src->encoding == Parrot_utf8_encoding_ptr)
+ return;
UNIMPL;
}
static STRING *
copy_to_encoding(Interp *interpreter, STRING *src)
{
- STRING *return_string = NULL;
-
- UNIMPL;
- return return_string;
+ STRING *dest;
+ String_iter src_iter, dest_iter;
+ UINTVAL offs, c;
+
+ if (src->encoding == Parrot_utf8_encoding_ptr)
+ return string_copy(interpreter, src);
+
+ /*
+ * TODO adapt string creation functions
+ */
+ dest = string_make_empty(interpreter, enum_stringrep_one, src->strlen);
+ dest->charset = Parrot_unicode_charset_ptr;
+ dest->encoding = Parrot_utf8_encoding_ptr;
+ dest->strlen = src->strlen;
+
+ if (!src->strlen)
+ return dest;
+
+ ENCODING_ITER_INIT(interpreter, src, &src_iter);
+ ENCODING_ITER_INIT(interpreter, dest, &dest_iter);
+
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = src_iter.get_and_advance(interpreter, &src_iter);
+ if (dest_iter.bytepos >= PObj_buflen(dest) - 4) {
+ UINTVAL need = (src->strlen - offs) * 1.5;
+ if (need < 16)
+ need = 16;
+ Parrot_reallocate_string(interpreter, dest,
+ PObj_buflen(dest) + need);
+ }
+ dest_iter.set_and_advance(interpreter, &dest_iter, c);
+ }
+ assert(dest->strlen == dest_iter.charpos);
+ dest->bufused = dest_iter.bytepos;
+ return dest;
}
-/* codepoints are bytes, so delegate */
static UINTVAL
get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
{
@@ -337,9 +369,15 @@
UINTVAL offset, UINTVAL codepoint)
{
const void *start;
+ void *p;
+ union {
+ const void * __c_ptr;
+ void * __ptr;
+ } __ptr_u;
start = utf8_skip_forward(src->strstart, offset);
- utf8_encode(start, codepoint);
+ p = const_cast(start);
+ utf8_encode(p, codepoint);
}
static UINTVAL
1.4 +8 -14 parrot/io/io_utf8.c
Index: io_utf8.c
===================================================================
RCS file: /cvs/public/parrot/io/io_utf8.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- io_utf8.c 14 Feb 2005 11:34:22 -0000 1.3
+++ io_utf8.c 2 Mar 2005 17:00:51 -0000 1.4
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: io_utf8.c,v 1.3 2005/02/14 11:34:22 leo Exp $
+$Id: io_utf8.c,v 1.4 2005/03/02 17:00:51 leo Exp $
=head1 NAME
@@ -47,19 +47,13 @@
static size_t
PIO_utf8_write(theINTERP, ParrotIOLayer *l, ParrotIO *io, STRING *s)
{
- STRING n;
- size_t idx, length = string_length(interpreter, s);
- char *buffer = malloc(4*length);
- char *cursor = buffer;
-
- for( idx = 0; idx < length; ++idx )
- {
- cursor = Parrot_utf8_encode(cursor, string_index(interpreter, s, idx));
- }
-
- n.strstart = buffer;
- n.bufused = cursor - buffer;
- return PIO_write_down(interpreter, l->down, io, &n);
+ STRING *dest;
+
+ if (s->encoding == Parrot_utf8_encoding_ptr)
+ return PIO_write_down(interpreter, l->down, io, s);
+
+ dest = Parrot_utf8_encoding_ptr->copy_to_encoding(interpreter, s);
+ return PIO_write_down(interpreter, l->down, io, dest);
}
static const ParrotIOLayerAPI pio_utf8_layer_api = {
1.244 +4 -1 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.243
retrieving revision 1.244
diff -u -r1.243 -r1.244
--- string.c 2 Mar 2005 15:32:59 -0000 1.243
+++ string.c 2 Mar 2005 17:00:51 -0000 1.244
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.243 2005/03/02 15:32:59 leo Exp $
+$Id: string.c,v 1.244 2005/03/02 17:00:51 leo Exp $
=head1 NAME
@@ -384,6 +384,9 @@
s = new_string_header(interpreter, 0);
+ /*
+ * TODO adapt string creation functions
+ */
if (representation == enum_stringrep_one) {
s->charset = PARROT_DEFAULT_CHARSET;
s->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, s);;