The attached patch is a first stab at implementing string transcoding and the Unicode string types.
The transcoder currently only maps one UTF encoding to another; there is no attempt to implement mapping to or from native strings, as I wasn't sure what the plan was for that. Presumably we will have to determine what the native character set is at configure time and then generate some code to map between that and Unicode somehow? There are currently no proper tests, because there is no way to generate anything other than a native string using the current assembler. There is a small C test harness (trans-test.c) which I have used to validate the transcoder to a certain extent. This patch also fixes a bug in the existing native strings, where string_native_compute_strlen was returning the number of bytes that had been allocated (buflen) rather than the number that were in use (bufused). Tom -- Tom Hughes ([EMAIL PROTECTED]) http://www.compton.nu/
diff -urNw --exclude CVS parrot/Makefile.in parrot-transcode/Makefile.in --- parrot/Makefile.in Sun Oct 7 15:58:56 2001 +++ parrot-transcode/Makefile.in Sun Oct 7 16:08:49 2001 @@ -4,7 +4,7 @@ INC=include/parrot H_FILES = $(INC)/config.h $(INC)/exceptions.h $(INC)/io.h $(INC)/op.h $(INC)/register.h $(INC)/string.h $(INC)/events.h $(INC)/interpreter.h $(INC)/memory.h $(INC)/parrot.h $(INC)/stacks.h $(INC)/packfile.h $(INC)/global_setup.h $(INC)/vtable.h -O_FILES = global_setup$(O) interpreter$(O) parrot$(O) register$(O) basic_opcodes$(O) memory$(O) packfile$(O) string$(O) strnative$(O) +O_FILES = global_setup$(O) interpreter$(O) parrot$(O) register$(O) basic_opcodes$(O) +memory$(O) packfile$(O) string$(O) strnative$(O) strutf8$(O) strutf16$(O) +strutf32$(O) transcode$(O) #DO NOT ADD C COMPILER FLAGS HERE #Add them in Configure.pl--look for the @@ -32,8 +32,8 @@ $(TEST_PROG): test_main$(O) $(O_FILES) interp_guts$(O) op_info$(O) $(CC) $(CFLAGS) -o $(TEST_PROG) $(O_FILES) interp_guts$(O) op_info$(O) test_main$(O) $(C_LIBS) -$(PDUMP): pdump$(O) packfile$(O) memory$(O) global_setup$(O) string$(O) strnative$(O) - $(CC) $(CFLAGS) -o $(PDUMP) pdump$(O) packfile$(O) memory$(O) global_setup$(O) string$(O) strnative$(O) $(C_LIBS) +$(PDUMP): pdump$(O) packfile$(O) memory$(O) global_setup$(O) string$(O) strnative$(O) +strutf8$(O) strutf16$(O) strutf32$(O) transcode$(O) + $(CC) $(CFLAGS) -o $(PDUMP) pdump$(O) packfile$(O) memory$(O) global_setup$(O) +string$(O) strnative$(O) strutf8$(O) strutf16$(O) strutf32$(O) transcode$(O) $(C_LIBS) test_main$(O): $(H_FILES) $(INC)/interp_guts.h @@ -42,6 +42,14 @@ string$(O): $(H_FILES) strnative$(O): $(H_FILES) + +strutf8$(O): $(H_FILES) + +strutf16$(O): $(H_FILES) + +strutf32$(O): $(H_FILES) + +transcode$(O): $(H_FILES) $(INC)/interp_guts.h interp_guts.c $(INC)/op_info.h op_info.c: opcode_table build_interp_starter.pl $(PERL) build_interp_starter.pl diff -urNw --exclude CVS parrot/global_setup.c parrot-transcode/global_setup.c --- 
parrot/global_setup.c Sun Sep 16 12:32:21 2001 +++ parrot-transcode/global_setup.c Sat Oct 6 15:43:20 2001 @@ -17,6 +17,7 @@ void init_world() { string_init(); /* Set up the string subsystem */ + transcode_init(); /* Set up the transcoding subsystem */ } /* diff -urNw --exclude CVS parrot/include/parrot/exceptions.h parrot-transcode/include/parrot/exceptions.h --- parrot/include/parrot/exceptions.h Mon Sep 24 22:40:32 2001 +++ parrot-transcode/include/parrot/exceptions.h Sun Oct 7 15:36:46 2001 @@ -17,6 +17,9 @@ #define NO_REG_FRAMES 1 #define SUBSTR_OUT_OF_STRING 1 +#define MALFORMED_UTF8 1 +#define MALFORMED_UTF16 1 +#define MALFORMED_UTF32 1 #endif diff -urNw --exclude CVS parrot/include/parrot/parrot.h parrot-transcode/include/parrot/parrot.h --- parrot/include/parrot/parrot.h Sat Oct 6 15:10:50 2001 +++ parrot-transcode/include/parrot/parrot.h Sun Oct 7 15:21:57 2001 @@ -66,6 +66,7 @@ #include "parrot/global_setup.h" #include "parrot/string.h" +#include "parrot/transcode.h" #include "parrot/vtable.h" #include "parrot/interpreter.h" #include "parrot/register.h" diff -urNw --exclude CVS parrot/include/parrot/string.h parrot-transcode/include/parrot/string.h --- parrot/include/parrot/string.h Tue Oct 2 22:02:00 2001 +++ parrot-transcode/include/parrot/string.h Sun Oct 7 15:21:46 2001 @@ -85,6 +85,9 @@ VAR_SCOPE STRING_VTABLE Parrot_string_vtable[enc_max]; #include "parrot/strnative.h" +#include "parrot/strutf8.h" +#include "parrot/strutf16.h" +#include "parrot/strutf32.h" #endif /* diff -urNw --exclude CVS parrot/include/parrot/strutf16.h parrot-transcode/include/parrot/strutf16.h --- parrot/include/parrot/strutf16.h Thu Jan 1 01:00:00 1970 +++ parrot-transcode/include/parrot/strutf16.h Sun Oct 7 15:21:02 2001 @@ -0,0 +1,29 @@ +/* strutf16.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * UTF-16 string handling functions header + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#if 
!defined(PARROT_STRUTF16_H_GUARD) +#define PARROT_STRUTF16_H_GUARD + +STRING_VTABLE +string_utf16_vtable (void); + +#endif + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/include/parrot/strutf32.h parrot-transcode/include/parrot/strutf32.h --- parrot/include/parrot/strutf32.h Thu Jan 1 01:00:00 1970 +++ parrot-transcode/include/parrot/strutf32.h Sun Oct 7 15:21:07 2001 @@ -0,0 +1,29 @@ +/* strutf32.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * UTF-32 string handling functions header + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#if !defined(PARROT_STRUTF32_H_GUARD) +#define PARROT_STRUTF32_H_GUARD + +STRING_VTABLE +string_utf32_vtable (void); + +#endif + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/include/parrot/strutf8.h parrot-transcode/include/parrot/strutf8.h --- parrot/include/parrot/strutf8.h Thu Jan 1 01:00:00 1970 +++ parrot-transcode/include/parrot/strutf8.h Sun Oct 7 15:20:51 2001 @@ -0,0 +1,29 @@ +/* strutf8.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * UTF-8 string handling functions header + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#if !defined(PARROT_STRUTF8_H_GUARD) +#define PARROT_STRUTF8_H_GUARD + +STRING_VTABLE +string_utf8_vtable (void); + +#endif + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/include/parrot/transcode.h parrot-transcode/include/parrot/transcode.h --- parrot/include/parrot/transcode.h Thu Jan 1 01:00:00 1970 +++ 
parrot-transcode/include/parrot/transcode.h Sun Oct 7 15:24:15 2001 @@ -0,0 +1,33 @@ +/* transcode.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * This is the api header for the transcoding subsystem + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#if !defined(PARROT_TRANSCODE_H_GUARD) +#define PARROT_TRANSCODE_H_GUARD + +typedef STRING* (*transcode_t)(STRING* from, STRING* to); + +VAR_SCOPE transcode_t Parrot_transcode_table[enc_max][enc_max]; + +void +transcode_init(void); + +#endif + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/include/parrot/unicode.h parrot-transcode/include/parrot/unicode.h --- parrot/include/parrot/unicode.h Thu Jan 1 01:00:00 1970 +++ parrot-transcode/include/parrot/unicode.h Sun Oct 7 15:47:25 2001 @@ -0,0 +1,96 @@ +/* unicode.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * Unicode support header + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#if !defined(PARROT_UNICODE_H_GUARD) +#define PARROT_UNICODE_H_GUARD + +typedef unsigned char utf8_t; +typedef unsigned short utf16_t; +typedef unsigned long utf32_t; + +#define UNICODE_SURROGATE_FIRST 0xD800u +#define UNICODE_SURROGATE_LAST 0xDFFFu +#define UNICODE_HIGH_SURROGATE_FIRST 0xD800u +#define UNICODE_HIGH_SURROGATE_LAST 0xDBFFu +#define UNICODE_HIGH_SURROGATE_SHIFT 10 +#define UNICODE_LOW_SURROGATE_FIRST 0xDC00u +#define UNICODE_LOW_SURROGATE_LAST 0xDFFFu +#define UNICODE_LOW_SURROGATE_MASK 0x3FFu + +#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \ + (c) <= UNICODE_SURROGATE_LAST) +#define UNICODE_IS_HIGH_SURROGATE(c) ((c) >= UNICODE_HIGH_SURROGATE_FIRST && \ + (c) <= UNICODE_HIGH_SURROGATE_LAST) +#define UNICODE_IS_LOW_SURROGATE(c) ((c) >= 
UNICODE_LOW_SURROGATE_FIRST && \ + (c) <= UNICODE_LOW_SURROGATE_LAST) +#define UNICODE_IS_INVARIANT(c) ((c) < 0x80u) + +#define UNICODE_HIGH_SURROGATE(c) \ + ((((c) - 0x10000u) >> UNICODE_HIGH_SURROGATE_SHIFT) + UNICODE_HIGH_SURROGATE_FIRST) +#define UNICODE_LOW_SURROGATE(c) \ + ((((c) - 0x10000u) & UNICODE_LOW_SURROGATE_MASK) + UNICODE_LOW_SURROGATE_FIRST) +#define UNICODE_DECODE_SURROGATE(high,low) \ + ((((high) - UNICODE_HIGH_SURROGATE_FIRST) << UNICODE_HIGH_SURROGATE_SHIFT) + \ + ((low) - UNICODE_LOW_SURROGATE_FIRST) + 0x10000u) + +#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ + (uv) < 0x800 ? 2 : \ + (uv) < 0x10000 ? 3 : 4 ) + +#define UTF16SKIP(s) ( UNICODE_IS_HIGH_SURROGATE(*s) ? 2 : 1 ) + +/* + + The following table is from Unicode 3.1. + + Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte + + U+0000..U+007F 00..7F + U+0080..U+07FF C2..DF 80..BF + U+0800..U+0FFF E0 A0..BF 80..BF + U+1000..U+FFFF E1..EF 80..BF 80..BF + U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + + */ + +#define UTF8_IS_START(c) ((c) >= 0xC0u && (c) <= 0xFDu) +#define UTF8_IS_CONTINUATION(c) ((c) >= 0x80u && (c) <= 0xBFu) +#define UTF8_IS_CONTINUED(c) ((c) & 0x80u) + +#define UTF8_START_MARK(len) (0xFEu << (7-len)) +#define UTF8_START_MASK(len) (0x1Fu >> (len-2)) + +#define UTF8_CONTINUATION_MARK 0x80u +#define UTF8_ACCUMULATION_SHIFT 6 +#define UTF8_CONTINUATION_MASK 0x3Fu +#define UTF8_ACCUMULATE(old, new) (((old) << UTF8_ACCUMULATION_SHIFT) | ((new) +& UTF8_CONTINUATION_MASK)) + +extern const char Parrot_utf8skip[256]; + +#define UTF8SKIP(s) Parrot_utf8skip[*(s)] + +#define UTF8_MAXLEN 4 +#define UTF16_MAXLEN 4 + +#endif + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ Binary files parrot/pdump and parrot-transcode/pdump differ diff -urNw --exclude CVS parrot/string.c parrot-transcode/string.c --- 
parrot/string.c Wed Oct 3 23:31:19 2001 +++ parrot-transcode/string.c Sun Oct 7 11:44:44 2001 @@ -20,6 +20,9 @@ void string_init(void) { Parrot_string_vtable[enc_native] = string_native_vtable(); + Parrot_string_vtable[enc_utf8] = string_utf8_vtable(); + Parrot_string_vtable[enc_utf16] = string_utf16_vtable(); + Parrot_string_vtable[enc_utf32] = string_utf32_vtable(); } /*=for api string string_make diff -urNw --exclude CVS parrot/strnative.c parrot-transcode/strnative.c --- parrot/strnative.c Sat Oct 6 11:17:39 2001 +++ parrot-transcode/strnative.c Sat Oct 6 23:34:49 2001 @@ -23,7 +23,7 @@ */ static INTVAL string_native_compute_strlen (STRING *s) { - return s->buflen; + return s->bufused; } /*=for api string_native string_native_max_bytes @@ -41,6 +41,9 @@ string_native_concat(STRING* a, STRING* b, INTVAL flags) { if (flags && a->encoding != b->encoding) { /* Transcode */ + STRING* t = b; + b = string_make(NULL, 0, enc_native, 0, 0); + (Parrot_transcode_table[t->encoding->which][enc_native])(t, b); } /* b is now in native format */ string_grow(a, a->strlen + b->strlen); diff -urNw --exclude CVS parrot/strutf16.c parrot-transcode/strutf16.c --- parrot/strutf16.c Thu Jan 1 01:00:00 1970 +++ parrot-transcode/strutf16.c Sun Oct 7 15:58:09 2001 @@ -0,0 +1,150 @@ +/* strutf16.c + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * This defines the UTF-16 string routines. 
+ * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#include "parrot/parrot.h" +#include "parrot/unicode.h" + +/* Functions for handling strings in UTF-16 format */ + +/*=for api string_utf16 string_utf16_compute_strlen + return the length of s +*/ +static INTVAL +string_utf16_compute_strlen (STRING *s) { + INTVAL strlen = 0; + utf16_t *start = s->bufstart; + utf16_t *end = start + s->bufused / sizeof(utf16_t); + + while (start < end) { + start += UTF16SKIP(start); + strlen++; + } + + if (start > end) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, "Unaligned end in UTF-16 string\n"); + } + + return strlen; +} + +/*=for api string_utf16 string_utf16_max_bytes + return the max bytes needed for x characters. +*/ +static INTVAL +string_utf16_max_bytes (INTVAL x) { + return x * UTF16_MAXLEN; +} + +/*=for api string_utf16 string_utf16_concat + concatenate two strings +*/ +static STRING* +string_utf16_concat(STRING* a, STRING* b, INTVAL flags) { + if (flags && a->encoding != b->encoding) { + /* Transcode */ + STRING* t = b; + b = string_make(NULL, 0, enc_utf16, 0, 0); + (Parrot_transcode_table[t->encoding->which][enc_utf16])(t, b); + } + /* b is now in UTF-16 format */ + string_grow(a, a->strlen + b->strlen); + mem_sys_memcopy((void*)((ptrcast_t)a->bufstart + a->bufused), b->bufstart, +b->bufused); + a->strlen = a->strlen + b->strlen; + a->bufused = a->bufused + b->bufused; + return a; +} + +/*=for api string_utf16 string_utf16_chopn + remove the last n characters from s +*/ +static STRING* +string_utf16_chopn(STRING* s, INTVAL n) { + utf16_t *start = s->bufstart; + utf16_t *end = start + s->bufused; + + s->strlen -= n; + + while (end >= start && n--) { + end--; + if (UNICODE_IS_LOW_SURROGATE(*end)) { + end--; + if (!UNICODE_IS_HIGH_SURROGATE(*end)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); + } + } + else if (UNICODE_IS_HIGH_SURROGATE(*end)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); 
+ } + } + + if (n > 0) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, "Malformed UTF-16 string\n"); + } + + s->bufused = end - start; + + return s; +} + +/*=for api string_utf16 string_utf16_substr + substring out length characters from src starting from offset + and store in dest. Grow dest if needed. Return dest +*/ +static STRING* +string_utf16_substr(STRING* src, INTVAL offset, INTVAL length, STRING* dest) +{ + utf16_t *start; + utf16_t *end; + + if (dest->encoding->which != enc_utf16) { + /* It is now, matey. */ + dest->encoding = &(Parrot_string_vtable[enc_utf16]); + } + + /* Offset and length have already been "normalized" */ + string_grow(dest, length); + dest->strlen = length; + for (start = src->bufstart; offset > 0; offset--) start += UTF16SKIP(start); + for (end = start; length > 0; length--) end += UTF16SKIP(end); + mem_sys_memcopy(dest->bufstart, start, end - start); + dest->bufused = end - start; + + return dest; +} + +/*=for api string_utf16 string_utf16_vtable + return the vtable for the native string +*/ +STRING_VTABLE +string_utf16_vtable (void) { + STRING_VTABLE sv = { + enc_utf16, + string_utf16_compute_strlen, + string_utf16_max_bytes, + string_utf16_concat, + string_utf16_chopn, + string_utf16_substr, + }; + return sv; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/strutf32.c parrot-transcode/strutf32.c --- parrot/strutf32.c Thu Jan 1 01:00:00 1970 +++ parrot-transcode/strutf32.c Sat Oct 6 17:38:24 2001 @@ -0,0 +1,107 @@ +/* strutf32.c + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * This defines the UTF-32 string routines. 
+ * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#include "parrot/parrot.h" + +/* Functions for handling strings in UTF-32 format */ + +/*=for api string_utf32 string_utf32_compute_strlen + return the length of s +*/ +static INTVAL +string_utf32_compute_strlen (STRING *s) { + return s->buflen / 4; +} + +/*=for api string_utf32 string_utf32_max_bytes + return the max bytes needed for x characters. +*/ +static INTVAL +string_utf32_max_bytes (INTVAL x) { + return x * 4; +} + +/*=for api string_utf32 string_utf32_concat + concatenate two strings +*/ +static STRING* +string_utf32_concat(STRING* a, STRING* b, INTVAL flags) { + if (flags && a->encoding != b->encoding) { + /* Transcode */ + STRING* t = b; + b = string_make(NULL, 0, enc_utf32, 0, 0); + (Parrot_transcode_table[t->encoding->which][enc_utf32])(t, b); + } + /* b is now in UTF-32 format */ + string_grow(a, a->strlen + b->strlen); + mem_sys_memcopy((void*)((ptrcast_t)a->bufstart + a->bufused), b->bufstart, +b->bufused); + a->strlen = a->strlen + b->strlen; + a->bufused = a->bufused + b->bufused; + return a; +} + +/*=for api string_utf32 string_utf32_chopn + remove the last n characters from s +*/ +static STRING* +string_utf32_chopn(STRING* s, INTVAL n) { + s->bufused -= n * 4; + s->strlen -= n * 4; + return s; +} + +/*=for api string_utf32 string_utf32_substr + substring out length characters from src starting from offset + and store in dest. Grow dest if needed. Return dest +*/ +static STRING* +string_utf32_substr(STRING* src, INTVAL offset, INTVAL length, STRING* dest) +{ + if (dest->encoding->which != enc_utf32) { + /* It is now, matey. 
*/ + dest->encoding = &(Parrot_string_vtable[enc_utf32]); + } + + /* Offset and length have already been "normalized" */ + string_grow(dest, length); + mem_sys_memcopy(dest->bufstart, (void*)((ptrcast_t)src->bufstart + offset * 4), +length * 4); + dest->strlen = length; + dest->bufused = length * 4; + + return dest; +} + +/*=for api string_utf32 string_utf32_vtable + return the vtable for the native string +*/ +STRING_VTABLE +string_utf32_vtable (void) { + STRING_VTABLE sv = { + enc_utf32, + string_utf32_compute_strlen, + string_utf32_max_bytes, + string_utf32_concat, + string_utf32_chopn, + string_utf32_substr, + }; + return sv; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ diff -urNw --exclude CVS parrot/strutf8.c parrot-transcode/strutf8.c --- parrot/strutf8.c Thu Jan 1 01:00:00 1970 +++ parrot-transcode/strutf8.c Sun Oct 7 15:47:09 2001 @@ -0,0 +1,151 @@ +/* strutf8.c + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * This defines the UTF-8 string routines. + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#include "parrot/parrot.h" +#include "parrot/unicode.h" + +const char Parrot_utf8skip[256] = { +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */ +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* scripts */ +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6, /* cjk etc. 
*/ +}; + +/* Functions for handling strings in UTF-8 format */ + +/*=for api string_utf8 string_utf8_compute_strlen + return the length of s +*/ +static INTVAL +string_utf8_compute_strlen (STRING *s) { + INTVAL strlen = 0; + utf8_t *start = s->bufstart; + utf8_t *end = start + s->bufused; + + while (start < end) { + start += UTF8SKIP(start); + strlen++; + } + + if (start > end) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, "Unaligned end in UTF-8 string\n"); + } + + return strlen; +} + +/*=for api string_utf8 string_utf8_max_bytes + return the max bytes needed for x characters. +*/ +static INTVAL +string_utf8_max_bytes (INTVAL x) { + return x * UTF8_MAXLEN; +} + +/*=for api string_utf8 string_utf8_concat + concatenate two strings +*/ +static STRING* +string_utf8_concat(STRING* a, STRING* b, INTVAL flags) { + if (flags && a->encoding != b->encoding) { + /* Transcode */ + STRING* t = b; + b = string_make(NULL, 0, enc_utf8, 0, 0); + (Parrot_transcode_table[t->encoding->which][enc_utf8])(t, b); + } + /* b is now in UTF-8 format */ + string_grow(a, a->strlen + b->strlen); + mem_sys_memcopy((void*)((ptrcast_t)a->bufstart + a->bufused), b->bufstart, +b->bufused); + a->strlen = a->strlen + b->strlen; + a->bufused = a->bufused + b->bufused; + return a; +} + +/*=for api string_utf8 string_utf8_chopn + remove the last n characters from s +*/ +static STRING* +string_utf8_chopn(STRING* s, INTVAL n) { + utf8_t *start = s->bufstart; + utf8_t *end = start + s->bufused; + + s->strlen -= n; + + while (end >= start && n--) { + end--; + while (end >= start && UTF8_IS_CONTINUATION(*end)) end--; + } + + if (n > 0) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, "Malformed UTF-8 string\n"); + } + + s->bufused = end - start; + + return s; +} + +/*=for api string_utf8 string_utf8_substr + substring out length characters from src starting from offset + and store in dest. Grow dest if needed. 
Return dest +*/ +static STRING* +string_utf8_substr(STRING* src, INTVAL offset, INTVAL length, STRING* dest) +{ + utf8_t *start; + utf8_t *end; + + if (dest->encoding->which != enc_utf8) { + /* It is now, matey. */ + dest->encoding = &(Parrot_string_vtable[enc_utf8]); + } + + /* Offset and length have already been "normalized" */ + string_grow(dest, length); + dest->strlen = length; + for (start = src->bufstart; offset > 0; offset--) start += UTF8SKIP(start); + for (end = start; length > 0; length--) end += UTF8SKIP(end); + mem_sys_memcopy(dest->bufstart, start, end - start); + dest->bufused = end - start; + + return dest; +} + +/*=for api string_utf8 string_utf8_vtable + return the vtable for the native string +*/ +STRING_VTABLE +string_utf8_vtable (void) { + STRING_VTABLE sv = { + enc_utf8, + string_utf8_compute_strlen, + string_utf8_max_bytes, + string_utf8_concat, + string_utf8_chopn, + string_utf8_substr, + }; + return sv; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/ Binary files parrot/trans-test and parrot-transcode/trans-test differ diff -urNw --exclude CVS parrot/trans-test.c parrot-transcode/trans-test.c --- parrot/trans-test.c Thu Jan 1 01:00:00 1970 +++ parrot-transcode/trans-test.c Sun Oct 7 16:04:51 2001 @@ -0,0 +1,149 @@ +#include "parrot/parrot.h" + +#define MAX_STRINGS 9 + +static unsigned char *utf8_strings[MAX_STRINGS] = { + "Catal\xc3\xa0", + "Fran\xc3\xa7" "ais", + "G\xc3\xa1idhlig", + "F\xc3\xb8royskt", + "\xc3\x8dslensku", + "S\xc3\xa1mi", + "Portugu\xc3\xaas", + "Espan\xc3\xb5l", + "\x00\x7f\xc2\x80\xdf\xbf\xe0\xa0\x80\xe0\xbf\xbf\xe1\x80\x80\xef\xbf\xbf" + "\xf0\x90\x80\x80\xf0\xbf\xbf\xbf\xf1\x80\x80\x80\xf3\xbf\xbf\xbf" + "\xf4\x80\x80\x80\xf4\x8f\xbf\xbf" +}; + +static unsigned char *utf16_strings[MAX_STRINGS] = { + "C\0a\0t\0a\0l\0\xe0\0", + "F\0r\0a\0n\0\xe7\0a\0i\0s\0", + "G\0\xe1\0i\0d\0h\0l\0i\0g\0", + 
"F\0\xf8\0r\0o\0y\0s\0k\0t\0", + "\xcd\0s\0l\0e\0n\0s\0k\0u\0", + "S\0\xe1\0m\0i\0", + "P\0o\0r\0t\0u\0g\0u\0\xea\0s\0", + "E\0s\0p\0a\0n\0\xf5\0l\0", + "\x00\x00\x7f\x00\x80\x00\xff\x07\x00\x08\xff\x0f\x00\x10\xff\xff" + "\x00\xd8\x00\xdc\xbf\xd8\xff\xdf\xc0\xd8\x00\xdc\xbf\xdb\xff\xdf" + "\xc0\xdb\x00\xdc\xff\xdb\xff\xdf" +}; + +static unsigned char *utf32_strings[MAX_STRINGS] = { + "C\0\0\0a\0\0\0t\0\0\0a\0\0\0l\0\0\0\xe0\0\0\0", + "F\0\0\0r\0\0\0a\0\0\0n\0\0\0\xe7\0\0\0a\0\0\0i\0\0\0s\0\0\0", + "G\0\0\0\xe1\0\0\0i\0\0\0d\0\0\0h\0\0\0l\0\0\0i\0\0\0g\0\0\0", + "F\0\0\0\xf8\0\0\0r\0\0\0o\0\0\0y\0\0\0s\0\0\0k\0\0\0t\0\0\0", + "\xcd\0\0\0s\0\0\0l\0\0\0e\0\0\0n\0\0\0s\0\0\0k\0\0\0u\0\0\0", + "S\0\0\0\xe1\0\0\0m\0\0\0i\0\0\0", + "P\0\0\0o\0\0\0r\0\0\0t\0\0\0u\0\0\0g\0\0\0u\0\0\0\xea\0\0\0s\0\0\0", + "E\0\0\0s\0\0\0p\0\0\0a\0\0\0n\0\0\0\xf5\0\0\0l\0\0\0", + "\x00\x00\x00\x00\x7f\x00\x00\x00\x80\x00\x00\x00\xff\x07\x00\x00" + "\x00\x08\x00\x00\xff\x0f\x00\x00\x00\x10\x00\x00\xff\xff\x00\x00" + "\x00\x00\x01\x00\xff\xff\x03\x00\x00\x00\x04\x00\xff\xff\x0f\x00" + "\x00\x00\x10\x00\xff\xff\x10\x00" +}; + +static int utf8_string_lengths[MAX_STRINGS] = { + 7, 9, 9, 9, 9, 5, 10, 8, 42 +}; + +static int utf16_string_lengths[MAX_STRINGS] = { + 12, 16, 16, 16, 16, 8, 18, 14, 40 +}; + +static int utf32_string_lengths[MAX_STRINGS] = { + 24, 32, 32, 32, 32, 16, 36, 28, 56 +}; + +int main(int argc, char **argv) +{ + int i; + + init_world(); + + for (i = 0; i < MAX_STRINGS; i++) { + STRING *s8; + STRING *s16; + STRING *s32; + + s8 = string_make( utf8_strings[i], utf8_string_lengths[i], enc_utf8, 0, 0 ); + + s16 = Parrot_transcode_table[enc_utf8][enc_utf16]( s8, NULL ); + + if ( utf16_string_lengths[i] == s16->bufused && + memcmp( utf16_strings[i], s16->bufstart, s16->bufused ) == 0 ) { + printf( "utf8_to_utf16: string %d passed\n", i ); + } + else { + printf( "utf8_to_utf16: string %d failed\n", i ); + } + + s32 = Parrot_transcode_table[enc_utf8][enc_utf32]( s8, NULL ); + + if ( 
utf32_string_lengths[i] == s32->bufused && + memcmp( utf32_strings[i], s32->bufstart, s32->bufused ) == 0 ) { + printf( "utf8_to_utf32: string %d passed\n", i ); + } + else { + printf( "utf8_to_utf32: string %d failed\n", i ); + } + + s16 = string_make( utf16_strings[i], utf16_string_lengths[i], enc_utf16, 0, 0 +); + + s8 = Parrot_transcode_table[enc_utf16][enc_utf8]( s16, NULL ); + + if ( utf8_string_lengths[i] == s8->bufused && + memcmp( utf8_strings[i], s8->bufstart, s8->bufused ) == 0 ) { + printf( "utf16_to_utf8: string %d passed\n", i ); + } + else { + printf( "utf16_to_utf8: string %d failed\n", i ); + } + + s32 = Parrot_transcode_table[enc_utf16][enc_utf32]( s16, NULL ); + + if ( utf32_string_lengths[i] == s32->bufused && + memcmp( utf32_strings[i], s32->bufstart, s32->bufused ) == 0 ) { + printf( "utf16_to_utf32: string %d passed\n", i ); + } + else { + printf( "utf16_to_utf32: string %d failed\n", i ); + } + + s32 = string_make( utf32_strings[i], utf32_string_lengths[i], enc_utf32, 0, 0 +); + + s8 = Parrot_transcode_table[enc_utf32][enc_utf8]( s32, NULL ); + + if ( utf8_string_lengths[i] == s8->bufused && + memcmp( utf8_strings[i], s8->bufstart, s8->bufused ) == 0 ) { + printf( "utf32_to_utf8: string %d passed\n", i ); + } + else { + printf( "utf32_to_utf8: string %d failed\n", i ); + } + + s16 = Parrot_transcode_table[enc_utf32][enc_utf16]( s32, NULL ); + + if ( utf16_string_lengths[i] == s16->bufused && + memcmp( utf16_strings[i], s16->bufstart, s16->bufused ) == 0 ) { + printf( "utf32_to_utf16: string %d passed\n", i ); + } + else { + printf( "utf32_to_utf16: string %d failed\n", i ); + } + } + + exit(0); +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: + */ diff -urNw --exclude CVS parrot/transcode.c parrot-transcode/transcode.c --- parrot/transcode.c Thu Jan 1 01:00:00 1970 +++ parrot-transcode/transcode.c Sun Oct 7 16:03:44 2001 @@ -0,0 +1,368 @@ 
+/* transcode.c + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * This is the api definitions for the transcoding subsystem + * Data Structure and Algorithms: + * History: + * Notes: + * References: + */ + +#include "parrot/parrot.h" +#include "parrot/unicode.h" + +/*=for api transcode transcode_utf8_to_utf16 + transcode UTF-8 to UTF-16 + */ +static STRING* +transcode_utf8_to_utf16(STRING* from, STRING* to) +{ + utf8_t *fp; + utf16_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf16]; + } + else { + to = string_make(NULL, 0, enc_utf16, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < from->strlen) { + utf32_t c = *fp++; + + if (UNICODE_IS_INVARIANT(c)) { + *tp++ = c; + } + else if (UTF8_IS_START(c)) { + INTVAL len = UTF8SKIP(fp - 1); + INTVAL count; + + c &= UTF8_START_MASK(len); + for (count = 1; count < len; count++) { + if (!UTF8_IS_CONTINUATION(*fp)) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + c = UTF8_ACCUMULATE(c, *fp); + fp++; + } + + if (c < 0x10000u) { + if (UNICODE_IS_SURROGATE(c)) { + INTERNAL_EXCEPTION(MALFORMED_UTF32, + "Surrogate in UTF-8 string\n"); + } + *tp++ = c; + } + else { + *tp++ = UNICODE_HIGH_SURROGATE(c); + *tp++ = UNICODE_LOW_SURROGATE(c); + } + } + else { + INTERNAL_EXCEPTION(MALFORMED_UTF8, "Malformed UTF-8 string\n"); + } + + to->strlen++; + } + + to->bufused = (tp - (utf16_t *)to->bufstart) * sizeof(utf16_t); + + return to; +} + +/*=for api transcode transcode_utf8_to_utf32 + transcode UTF-8 to UTF-32 + */ +static STRING* +transcode_utf8_to_utf32(STRING* from, STRING* to) +{ + utf8_t *fp; + utf32_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf32]; + } + else { + to = string_make(NULL, 0, enc_utf32, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < 
from->strlen) { + utf32_t c = *fp++; + + if (UNICODE_IS_INVARIANT(c)) { + *tp++ = c; + } + else if (UTF8_IS_START(c)) { + INTVAL len = UTF8SKIP(fp - 1); + INTVAL count; + c &= UTF8_START_MASK(len); + for (count = 1; count < len; count++) { + if (!UTF8_IS_CONTINUATION(*fp)) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + c = UTF8_ACCUMULATE(c, *fp); + fp++; + } + *tp++ = c; + } + else { + INTERNAL_EXCEPTION(MALFORMED_UTF8, "Malformed UTF-8 string\n"); + } + + to->strlen++; + } + + to->bufused = (tp - (utf32_t *)to->bufstart) * sizeof(utf32_t); + + return to; +} + +/*=for api transcode transcode_utf16_to_utf8 + transcode UTF-16 to UTF-8 + */ +static STRING* +transcode_utf16_to_utf8(STRING* from, STRING* to) +{ + utf16_t *fp; + utf8_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf8]; + } + else { + to = string_make(NULL, 0, enc_utf8, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < from->strlen) { + utf32_t c = *fp++; + + if (UNICODE_IS_INVARIANT(c)) { + *tp++ = c; + } + else { + INTVAL len; + utf8_t *p; + + if (UNICODE_IS_HIGH_SURROGATE(c)) { + utf16_t low = *fp++; + + if (!UNICODE_IS_LOW_SURROGATE(low)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); + } + + c = UNICODE_DECODE_SURROGATE(c, low); + } + else if (UNICODE_IS_LOW_SURROGATE(c)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); + } + + len = UNISKIP(c); + p = tp + len - 1; + while (p > tp) { + *p-- = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK; + c >>= UTF8_ACCUMULATION_SHIFT; + } + *p = (c & UTF8_START_MASK(len)) | UTF8_START_MARK(len); + tp += len; + } + + to->strlen++; + } + + to->bufused = (tp - (utf8_t *)to->bufstart) * sizeof(utf8_t); + + return to; +} + +/*=for api transcode transcode_utf16_to_utf32 + transcode UTF-16 to UTF-32 + */ +static STRING* +transcode_utf16_to_utf32(STRING* from, STRING* to) +{ + 
utf16_t *fp; + utf32_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf32]; + } + else { + to = string_make(NULL, 0, enc_utf32, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < from->strlen) { + utf16_t c = *fp++; + + if (UNICODE_IS_INVARIANT(c)) { + *tp++ = c; + } + else { + if (UNICODE_IS_HIGH_SURROGATE(c)) { + utf16_t low = *fp++; + + if (!UNICODE_IS_LOW_SURROGATE(low)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); + } + + *tp++ = UNICODE_DECODE_SURROGATE(c, low); + } + else if (UNICODE_IS_LOW_SURROGATE(c)) { + INTERNAL_EXCEPTION(MALFORMED_UTF16, + "Malformed UTF-16 surrogate\n"); + } + else { + *tp++ = c; + } + } + + to->strlen++; + } + + to->bufused = (tp - (utf32_t *)to->bufstart) * sizeof(utf32_t); + + return to; +} + +/*=for api transcode transcode_utf32_to_utf8 + transcode UTF-32 to UTF-8 + */ +static STRING* +transcode_utf32_to_utf8(STRING* from, STRING* to) +{ + utf32_t *fp; + utf8_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf8]; + } + else { + to = string_make(NULL, 0, enc_utf8, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < from->strlen) { + utf32_t c = *fp++; + + if (UNICODE_IS_INVARIANT(c)) { + *tp++ = c; + } + else { + INTVAL len = UNISKIP(c); + utf8_t *p = tp + len - 1; + while (p > tp) { + *p-- = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK; + c >>= UTF8_ACCUMULATION_SHIFT; + } + *p = (c & UTF8_START_MASK(len)) | UTF8_START_MARK(len); + tp += len; + } + + to->strlen++; + } + + to->bufused = (tp - (utf8_t *)to->bufstart) * sizeof(utf8_t); + + return to; +} + +/*=for api transcode transcode_utf32_to_utf16 + transcode UTF-32 to UTF-16 + */ +static STRING* +transcode_utf32_to_utf16(STRING* from, STRING* to) +{ + utf32_t *fp; + utf16_t *tp; + + if (to) { + to->encoding = &Parrot_string_vtable[enc_utf16]; + } 
+ else { + to = string_make(NULL, 0, enc_utf16, 0, 0); + } + + string_grow(to, from->strlen); + to->strlen = 0; + + fp = from->bufstart; + tp = to->bufstart; + + while (to->strlen < from->strlen) { + utf32_t c = *fp++; + + if (c < 0x10000u) { + if (UNICODE_IS_SURROGATE(c)) { + INTERNAL_EXCEPTION(MALFORMED_UTF32, + "Surrogate in UTF-32 string\n"); + } + *tp++ = c; + } + else { + *tp++ = UNICODE_HIGH_SURROGATE(c); + *tp++ = UNICODE_LOW_SURROGATE(c); + } + + to->strlen++; + } + + to->bufused = (tp - (utf16_t *)to->bufstart) * sizeof(utf16_t); + + return to; +} + +void +transcode_init(void) +{ + Parrot_transcode_table[enc_utf8][enc_utf16] = transcode_utf8_to_utf16; + Parrot_transcode_table[enc_utf8][enc_utf32] = transcode_utf8_to_utf32; + Parrot_transcode_table[enc_utf16][enc_utf8] = transcode_utf16_to_utf8; + Parrot_transcode_table[enc_utf16][enc_utf32] = transcode_utf16_to_utf32; + Parrot_transcode_table[enc_utf32][enc_utf8] = transcode_utf32_to_utf8; + Parrot_transcode_table[enc_utf32][enc_utf16] = transcode_utf32_to_utf16; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * vim: expandtab shiftwidth=4: +*/