Author: leo
Date: Fri Nov 11 06:27:10 2005
New Revision: 9901
Modified:
trunk/charset/unicode.c
trunk/encodings/fixed_8.c
trunk/encodings/fixed_8.h
trunk/encodings/ucs2.c
trunk/encodings/utf16.c
trunk/encodings/utf8.c
trunk/include/parrot/encoding.h
trunk/io/io_utf8.c
trunk/src/string.c
Log:
unicode encodings - simplify to_encoding
* actually to_encoding is a few lines longer now
* but does the work of copy_to_encoding now too
* this gets rid of code duplication and half
of the unimplemented conversions
* start adding doc
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Fri Nov 11 06:27:10 2005
@@ -123,7 +123,7 @@ downcase(Interp *interpreter, STRING *sr
UErrorCode err;
int dest_len, src_len;
- Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+ src = Parrot_utf16_encoding_ptr->to_encoding(interpreter, src, NULL);
/*
U_CAPI int32_t U_EXPORT2
u_strToLower(UChar *dest, int32_t destCapacity,
Modified: trunk/encodings/fixed_8.c
==============================================================================
--- trunk/encodings/fixed_8.c (original)
+++ trunk/encodings/fixed_8.c Fri Nov 11 06:27:10 2005
@@ -19,23 +19,14 @@ This file implements the encoding functi
#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl fixed_8")
-/* This function needs to go through and get all the code points one
- by one and turn them into a byte */
-static void
-to_encoding(Interp *interpreter, STRING *source_string)
-{
- UNIMPL;
-}
-
static STRING *
-copy_to_encoding(Interp *interpreter, STRING *source_string)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
{
- STRING *return_string = NULL;
-
UNIMPL;
- return return_string;
+ return NULL;
}
+
/* codepoints are bytes, so delegate */
static UINTVAL
get_codepoint(Interp *interpreter, const STRING *source_string,
@@ -214,7 +205,6 @@ Parrot_encoding_fixed_8_init(Interp *int
"fixed_8",
1, /* Max bytes per codepoint */
to_encoding,
- copy_to_encoding,
get_codepoint,
set_codepoint,
get_byte,
Modified: trunk/encodings/fixed_8.h
==============================================================================
--- trunk/encodings/fixed_8.h (original)
+++ trunk/encodings/fixed_8.h Fri Nov 11 06:27:10 2005
@@ -13,8 +13,6 @@
#if !defined(PARROT_ENCODING_FIXED_8_H_GUARD)
#define PARROT_ENCODING_FIXED_8_H_GUARD
-static void to_encoding(Interp *interpreter, STRING *source_string);
-static STRING *copy_to_encoding(Interp *interpreter, STRING *source_string);
static UINTVAL get_codepoint(Interp *interpreter, const STRING *source_string,
UINTVAL offset);
static void set_codepoint(Interp *interpreter, STRING *source_string, UINTVAL
offset, UINTVAL codepoint);
static UINTVAL get_byte(Interp *interpreter, const STRING *source_string,
UINTVAL offset);
Modified: trunk/encodings/ucs2.c
==============================================================================
--- trunk/encodings/ucs2.c (original)
+++ trunk/encodings/ucs2.c Fri Nov 11 06:27:10 2005
@@ -33,37 +33,18 @@ UCS-2 encoding with the help of the ICU
static void iter_init(Interp *, String *src, String_iter *iter);
-static void
-to_encoding(Interp *interpreter, STRING *src)
-{
- if (src->encoding == Parrot_ucs2_encoding_ptr)
- return;
- Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
- /*
- * conversion to utf16 downgrads to ucs-2 if possible - check result
- */
- if (src->encoding == Parrot_utf16_encoding_ptr) {
- real_exception(interpreter, NULL, E_UnicodeError,
- "can't convert string with surrogates to ucs2");
- }
-}
-
static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
{
- STRING *dest;
-
- if (src->encoding == Parrot_ucs2_encoding_ptr)
- return string_copy(interpreter, src);
- dest = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, src);
+ STRING *result = Parrot_utf16_encoding_ptr->to_encoding(interpreter, src,
dest);
/*
* conversion to utf16 downgrads to ucs-2 if possible - check result
*/
- if (dest->encoding == Parrot_utf16_encoding_ptr) {
+ if (result->encoding == Parrot_utf16_encoding_ptr) {
real_exception(interpreter, NULL, E_UnicodeError,
"can't convert string with surrogates to ucs2");
}
- return dest;
+ return result;
}
static UINTVAL
@@ -258,7 +239,6 @@ Parrot_encoding_ucs2_init(Interp *interp
"ucs2",
2, /* Max bytes per codepoint 0 .. 0x10ffff */
to_encoding,
- copy_to_encoding,
get_codepoint,
set_codepoint,
get_byte,
Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c (original)
+++ trunk/encodings/utf16.c Fri Nov 11 06:27:10 2005
@@ -32,10 +32,20 @@ UTF-16 encoding with the help of the ICU
static void iter_init(Interp *, String *src, String_iter *iter);
-/* This function needs to go through and get all the code points one
- by one and turn them into a utf16 sequence */
-static void
-to_encoding(Interp *interpreter, STRING *src)
+
+/*
+
+=item C<static STRING *to_encoding(Interp *, STRING *src, STRING *dest)>
+
+Convert string C<src> to this encoding. If C<dest> is set
+fill it with the converted result, else operate inplace.
+
+=cut
+
+*/
+
+static STRING *
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
{
#if PARROT_HAS_ICU
UErrorCode err;
@@ -43,18 +53,29 @@ to_encoding(Interp *interpreter, STRING
UChar *p;
#endif
int src_len;
+ int in_place = dest == NULL;
+ STRING *result;
if (src->encoding == Parrot_utf16_encoding_ptr ||
- src->encoding == Parrot_ucs2_encoding_ptr)
- return;
+ src->encoding == Parrot_ucs2_encoding_ptr)
+ return in_place ? src : string_copy(interpreter, src);
/*
* TODO adapt string creation functions
*/
- src->charset = Parrot_unicode_charset_ptr;
- src->encoding = Parrot_utf16_encoding_ptr;
src_len = src->strlen;
- if (!src_len)
- return;
+ if (in_place) {
+ result = src;
+ }
+ else {
+ result = dest;
+ }
+ result->charset = Parrot_unicode_charset_ptr;
+ result->encoding = Parrot_utf16_encoding_ptr;
+ result->strlen = src_len;
+ if (!src_len) {
+ result->encoding = Parrot_ucs2_encoding_ptr;
+ return result;
+ }
/*
u_strFromUTF8(UChar *dest,
int32_t destCapacity,
@@ -62,11 +83,18 @@ to_encoding(Interp *interpreter, STRING
const char *src,
int32_t srcLength,
UErrorCode *pErrorCode);
- */
+ */
#if PARROT_HAS_ICU
- /* need intermediate memory */
- p = mem_sys_allocate(src_len * sizeof(UChar));
- if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+ if (in_place) {
+ /* need intermediate memory */
+ p = mem_sys_allocate(src_len * sizeof(UChar));
+ }
+ else {
+ Parrot_reallocate_string(interpreter, dest, sizeof(UChar) * src_len);
+ p = dest->strstart;
+ }
+ if (src->charset == Parrot_iso_8859_1_charset_ptr ||
+ src->charset == Parrot_ascii_charset_ptr) {
for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len) {
p[dest_len] = (UChar)((unsigned char*)src->strstart)[dest_len];
}
@@ -79,80 +107,32 @@ to_encoding(Interp *interpreter, STRING
/*
* have to resize - required len in UChars is in dest_len
*/
- p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+ if (in_place)
+ p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+ else {
+ Parrot_reallocate_string(interpreter, dest, sizeof(UChar) *
dest_len);
+ p = dest->strstart;
+ }
u_strFromUTF8(p, dest_len,
&dest_len, src->strstart, src->bufused, &err);
assert(U_SUCCESS(err));
}
}
- src->bufused = dest_len * sizeof(UChar);
- Parrot_reallocate_string(interpreter, src, src->bufused);
- memcpy(src->strstart, p, src->bufused);
- mem_sys_free(p);
-
- /* downgrade if possible */
- if (dest_len == (int)src->strlen)
- src->encoding = Parrot_ucs2_encoding_ptr;
-#else
- real_exception(interpreter, NULL, E_LibraryNotLoadedError,
- "no ICU lib loaded");
-#endif
-}
-
-static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
-{
- STRING *dest;
-#if PARROT_HAS_ICU
- UErrorCode err;
- int dest_len;
-#endif
- int src_len;
-
- if (src->encoding == Parrot_utf16_encoding_ptr ||
- src->encoding == Parrot_ucs2_encoding_ptr)
- return string_copy(interpreter, src);
-
- src_len = src->strlen;
- if (!src_len) {
- return string_make_direct(interpreter, NULL, 0,
- Parrot_utf16_encoding_ptr,
- Parrot_unicode_charset_ptr, 0);
-
- }
-#if PARROT_HAS_ICU
- dest = string_make_direct(interpreter, NULL, sizeof(UChar) * src_len,
- Parrot_utf16_encoding_ptr,
- Parrot_unicode_charset_ptr, 0);
- dest->strlen = src_len;
- if (src->charset == Parrot_iso_8859_1_charset_ptr) {
- UChar *p = (UChar*) dest->strstart;
- for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len, ++p) {
- *p = (UChar)((unsigned char*)src->strstart)[dest_len];
- }
- }
- else {
- err = U_ZERO_ERROR;
- u_strFromUTF8(dest->strstart, src_len,
- &dest_len, src->strstart, src->bufused, &err);
- if (!U_SUCCESS(err)) {
- Parrot_allocate_string(interpreter, dest, sizeof(UChar) *
dest_len);
- u_strFromUTF8(dest->strstart, dest_len,
- &dest_len, src->strstart, src->bufused, &err);
- assert(U_SUCCESS(err));
- }
+ result->bufused = dest_len * sizeof(UChar);
+ if (in_place) {
+ Parrot_reallocate_string(interpreter, src, src->bufused);
+ memcpy(src->strstart, p, src->bufused);
+ mem_sys_free(p);
}
- dest->bufused = dest_len * sizeof(UChar);
+
/* downgrade if possible */
if (dest_len == (int)src->strlen)
- dest->encoding = Parrot_ucs2_encoding_ptr;
-
+ result->encoding = Parrot_ucs2_encoding_ptr;
#else
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
"no ICU lib loaded");
#endif
-
- return dest;
+ return result;
}
static UINTVAL
@@ -367,7 +347,6 @@ Parrot_encoding_utf16_init(Interp *inter
"utf16",
4, /* Max bytes per codepoint 0 .. 0x10ffff */
to_encoding,
- copy_to_encoding,
get_codepoint,
set_codepoint,
get_byte,
Modified: trunk/encodings/utf8.c
==============================================================================
--- trunk/encodings/utf8.c (original)
+++ trunk/encodings/utf8.c Fri Nov 11 06:27:10 2005
@@ -307,54 +307,78 @@ utf8_set_position(Interp *interpreter, S
}
-/* This function needs to go through and get all the code points one
- by one and turn them into a utf8 sequence */
-static void
-to_encoding(Interp *interpreter, STRING *src)
-{
- if (src->encoding == Parrot_utf8_encoding_ptr)
- return;
- UNIMPL;
-}
-
static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
{
- STRING *dest;
- String_iter src_iter, dest_iter;
- UINTVAL offs, c;
+ STRING *result;
+ String_iter src_iter;
+ UINTVAL offs, c, dest_len, dest_pos, src_len;
+ int in_place = dest == NULL;
+ unsigned char *new_pos, *pos, *p;
if (src->encoding == Parrot_utf8_encoding_ptr)
- return string_copy(interpreter, src);
+ return in_place ? src : string_copy(interpreter, src);
+ src_len = src->strlen;
+ if (in_place) {
+ result = src;
+ }
+ else {
+ result = dest;
+ }
- /*
- * TODO adapt string creation functions
- */
- dest = string_make_empty(interpreter, enum_stringrep_one, src->strlen);
- dest->charset = Parrot_unicode_charset_ptr;
- dest->encoding = Parrot_utf8_encoding_ptr;
- dest->strlen = src->strlen;
+ result->charset = Parrot_unicode_charset_ptr;
+ result->encoding = Parrot_utf8_encoding_ptr;
+ result->strlen = src_len;
if (!src->strlen)
return dest;
- ENCODING_ITER_INIT(interpreter, src, &src_iter);
- ENCODING_ITER_INIT(interpreter, dest, &dest_iter);
-
- for (offs = 0; offs < src->strlen; ++offs) {
- c = src_iter.get_and_advance(interpreter, &src_iter);
- if (dest_iter.bytepos >= PObj_buflen(dest) - 4) {
- UINTVAL need = (src->strlen - offs) * 1.5;
- if (need < 16)
- need = 16;
- Parrot_reallocate_string(interpreter, dest,
- PObj_buflen(dest) + need);
+ if (in_place) {
+ /* need intermediate memory */
+ p = mem_sys_allocate(src_len);
+ }
+ else {
+ Parrot_reallocate_string(interpreter, dest, src_len);
+ p = dest->strstart;
+ }
+ if (src->charset == Parrot_iso_8859_1_charset_ptr ||
+ src->charset == Parrot_ascii_charset_ptr) {
+ for (dest_len = 0; dest_len < src_len; ++dest_len) {
+ p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
+ }
+ result->bufused = dest_len;
+ }
+ else {
+ ENCODING_ITER_INIT(interpreter, src, &src_iter);
+ dest_len = src_len;
+ dest_pos = 0;
+ for (offs = 0; offs < src_len; ++offs) {
+ c = src_iter.get_and_advance(interpreter, &src_iter);
+ if (dest_len - dest_pos < 6) {
+ UINTVAL need = (src->strlen - offs) * 1.5;
+ if (need < 16)
+ need = 16;
+ dest_len += need;
+ if (in_place)
+ p = mem_sys_realloc(p, dest_len);
+ else {
+ Parrot_reallocate_string(interpreter, dest, dest_len);
+ p = dest->strstart;
+ }
+ }
+
+ pos = p + dest_pos;
+ new_pos = utf8_encode(pos, c);
+ dest_pos += (new_pos - pos);
}
- dest_iter.set_and_advance(interpreter, &dest_iter, c);
+ result->bufused = dest_pos;
+ }
+ if (in_place) {
+ Parrot_reallocate_string(interpreter, src, src->bufused);
+ memcpy(src->strstart, p, src->bufused);
+ mem_sys_free(p);
}
- assert(dest->strlen == dest_iter.charpos);
- dest->bufused = dest_iter.bytepos;
- return dest;
+ return result;
}
static UINTVAL
@@ -533,7 +557,6 @@ Parrot_encoding_utf8_init(Interp *interp
"utf8",
4, /* Max bytes per codepoint 0 .. 0x10ffff */
to_encoding,
- copy_to_encoding,
get_codepoint,
set_codepoint,
get_byte,
Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h (original)
+++ trunk/include/parrot/encoding.h Fri Nov 11 06:27:10 2005
@@ -15,8 +15,7 @@
#include "parrot/parrot.h"
-typedef void (*encoding_to_encoding_t)(Interp*, STRING *string);
-typedef STRING *(*encoding_copy_to_encoding_t)(Interp*, STRING *string);
+typedef STRING * (*encoding_to_encoding_t)(Interp*, STRING *src, STRING *dest);
typedef UINTVAL (*encoding_get_codepoint_t)(Interp*, const STRING *src,
UINTVAL offset);
typedef void (*encoding_set_codepoint_t)(Interp*, STRING *src, UINTVAL offset,
UINTVAL codepoint);
typedef UINTVAL (*encoding_get_byte_t)(Interp*, const STRING *src, UINTVAL
offset);
@@ -42,7 +41,6 @@ struct _encoding {
const char *name;
UINTVAL max_bytes_per_codepoint;
encoding_to_encoding_t to_encoding;
- encoding_copy_to_encoding_t copy_to_encoding;
encoding_get_codepoint_t get_codepoint;
encoding_set_codepoint_t set_codepoint;
encoding_get_byte_t get_byte;
@@ -91,10 +89,6 @@ const char * Parrot_encoding_c_name(Inte
#define ENCODING_MAX_BYTES_PER_CODEPOINT(i, src) \
((ENCODING *)src->encoding)->max_bytes_per_codepoint
-#define ENCODING_TO_ENCODING(i, src, offset, count) \
- ((ENCODING *)src->encoding)->to_encoding(i, src, offset, count)
-#define ENCODING_COPY_TO_ENCODING(i, src) \
- ((ENCODING *)src->encoding)->copy_to_encoding(i, src)
#define ENCODING_GET_CODEPOINT(i, src, offset) \
((ENCODING *)src->encoding)->get_codepoint(i, src, offset)
#define ENCODING_SET_CODEPOINT(i, src, offset, codepoint) \
Modified: trunk/io/io_utf8.c
==============================================================================
--- trunk/io/io_utf8.c (original)
+++ trunk/io/io_utf8.c Fri Nov 11 06:27:10 2005
@@ -52,7 +52,8 @@ PIO_utf8_write(theINTERP, ParrotIOLayer
if (s->encoding == Parrot_utf8_encoding_ptr)
return PIO_write_down(interpreter, l->down, io, s);
- dest = Parrot_utf8_encoding_ptr->copy_to_encoding(interpreter, s);
+ dest = Parrot_utf8_encoding_ptr->to_encoding(interpreter, s,
+ new_string_header(interpreter, 0));
return PIO_write_down(interpreter, l->down, io, dest);
}
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Fri Nov 11 06:27:10 2005
@@ -484,8 +484,9 @@ string_append(Interp *interpreter,
a->charset = cs;
else {
/* upgrade to utf16 */
- Parrot_utf16_encoding_ptr->to_encoding(interpreter, a);
- b = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, b);
+ Parrot_utf16_encoding_ptr->to_encoding(interpreter, a, NULL);
+ b = Parrot_utf16_encoding_ptr->to_encoding(interpreter, b,
+ new_string_header(interpreter, 0));
/*
* result could be mixed ucs2 / utf16
*/
@@ -1178,8 +1179,9 @@ string_replace(Interp *interpreter, STRI
/* may have different reps..... */
if ( !(cs = string_rep_compatible(interpreter, src, rep, &enc))) {
- Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
- rep = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, rep);
+ Parrot_utf16_encoding_ptr->to_encoding(interpreter, src, NULL);
+ rep = Parrot_utf16_encoding_ptr->to_encoding(interpreter, rep,
+ new_string_header(interpreter, 0));
}
else {
src->charset = cs;
@@ -2659,8 +2661,12 @@ Parrot_string_find_not_cclass(Interp *in
Parrot_string_trans_charset(Interp *interpreter, STRING *src,
INTVAL charset_nr, STRING *dest)>
-If C<dest> == NULL convert C<src> to the given charset inplace, else
-return a copy of C<src> with the charset in dest.
+=item C< STRING*
+Parrot_string_trans_encoding(Interp *interpreter, STRING *src,
+ INTVAL charset_nr, STRING *dest)>
+
+If C<dest> == NULL convert C<src> to the given charset or encoding inplace,
+else return a copy of C<src> with the charset/encoding in dest.
=cut
@@ -2718,20 +2724,18 @@ Parrot_string_trans_encoding(Interp *int
* operation is desired
*/
if (dest) {
+ dest->encoding = new_encoding;
if (new_encoding == src->encoding) {
Parrot_reuse_COW_reference(interpreter, src, dest);
- dest->encoding = new_encoding;
return dest;
}
- return new_encoding->copy_to_encoding(interpreter, src);
}
else {
if (new_encoding == src->encoding) {
return src;
}
}
- new_encoding->to_encoding(interpreter, src);
- return src;
+ return new_encoding->to_encoding(interpreter, src, dest);
}
/*