Author: leo
Date: Thu Nov 10 09:18:21 2005
New Revision: 9885
Modified:
trunk/encodings/ucs2.c
trunk/encodings/utf16.c
trunk/t/op/string_cs.t
Log:
Unicode improvements - string_index
* string_index should now handle all charsets and encodings
* also convert from Latin-1 to UCS-2
* 2 new tests
More tests are welcome.
Modified: trunk/encodings/ucs2.c
==============================================================================
--- trunk/encodings/ucs2.c (original)
+++ trunk/encodings/ucs2.c Thu Nov 10 09:18:21 2005
@@ -203,8 +203,8 @@ bytes(Interp *interpreter, STRING *src)
static UINTVAL
ucs2_decode_and_advance(Interp *interpreter, String_iter *i)
{
- UChar *s = (UChar*) i->str->strstart;
- UINTVAL c, pos;
+ UChar *s = (UChar*) i->str->strstart, c;
+ size_t pos;
pos = i->bytepos / sizeof(UChar);
/* TODO either make sure that we don't go past end or use SAFE
* iter versions
@@ -221,7 +221,7 @@ ucs2_encode_and_advance(Interp *interpre
UChar *s = (UChar*) i->str->strstart;
UINTVAL pos;
pos = i->bytepos / sizeof(UChar);
- s[pos++] = c;
+ s[pos++] = (UChar)c;
i->charpos++;
i->bytepos = pos * sizeof(UChar);
}
Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c (original)
+++ trunk/encodings/utf16.c Thu Nov 10 09:18:21 2005
@@ -64,19 +64,26 @@ to_encoding(Interp *interpreter, STRING
UErrorCode *pErrorCode);
*/
#if PARROT_HAS_ICU
- err = U_ZERO_ERROR;
- /* XXX these inplace operations are all shit (sorry) */
+ /* need intermediate memory */
p = mem_sys_allocate(src_len * sizeof(UChar));
- u_strFromUTF8(p, src_len,
- &dest_len, src->strstart, src->bufused, &err);
- if (!U_SUCCESS(err)) {
- /*
- * have to resize - required len in UChars is in dest_len
- */
- p = mem_sys_realloc(p, dest_len * sizeof(UChar));
- u_strFromUTF8(p, dest_len,
+ if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+ for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len) {
+ p[dest_len] = (UChar)((unsigned char*)src->strstart)[dest_len];
+ }
+ }
+ else {
+ err = U_ZERO_ERROR;
+ u_strFromUTF8(p, src_len,
&dest_len, src->strstart, src->bufused, &err);
- assert(U_SUCCESS(err));
+ if (!U_SUCCESS(err)) {
+ /*
+ * have to resize - required len in UChars is in dest_len
+ */
+ p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+ u_strFromUTF8(p, dest_len,
+ &dest_len, src->strstart, src->bufused, &err);
+ assert(U_SUCCESS(err));
+ }
}
src->bufused = dest_len * sizeof(UChar);
Parrot_reallocate_string(interpreter, src, src->bufused);
@@ -103,36 +110,42 @@ copy_to_encoding(Interp *interpreter, ST
int src_len;
if (src->encoding == Parrot_utf16_encoding_ptr ||
- src->encoding == Parrot_ucs2_encoding_ptr)
+ src->encoding == Parrot_ucs2_encoding_ptr)
return string_copy(interpreter, src);
- /*
- * TODO adapt string creation functions
- */
- dest = new_string_header(interpreter, 0);
- src_len = src->strlen;
- dest->strlen = src_len;
- dest->charset = Parrot_unicode_charset_ptr;
- dest->encoding = Parrot_utf16_encoding_ptr;
+ src_len = src->strlen;
if (!src_len) {
- Parrot_allocate_string(interpreter, dest, 0);
- return dest;
+ return string_make_direct(interpreter, NULL, 0,
+ Parrot_utf16_encoding_ptr,
+ Parrot_unicode_charset_ptr, 0);
+
}
#if PARROT_HAS_ICU
- Parrot_allocate_string(interpreter, dest, sizeof(UChar) * src_len);
- err = U_ZERO_ERROR;
- u_strFromUTF8(dest->strstart, src_len,
- &dest_len, src->strstart, src->bufused, &err);
- if (!U_SUCCESS(err)) {
- Parrot_allocate_string(interpreter, dest, sizeof(UChar) * dest_len);
- u_strFromUTF8(dest->strstart, dest_len,
+ dest = string_make_direct(interpreter, NULL, sizeof(UChar) * src_len,
+ Parrot_utf16_encoding_ptr,
+ Parrot_unicode_charset_ptr, 0);
+ dest->strlen = src_len;
+ if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+ UChar *p = (UChar*) dest->strstart;
+ for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len, ++p) {
+ *p = (UChar)((unsigned char*)src->strstart)[dest_len];
+ }
+ }
+ else {
+ err = U_ZERO_ERROR;
+ u_strFromUTF8(dest->strstart, src_len,
&dest_len, src->strstart, src->bufused, &err);
- assert(U_SUCCESS(err));
+ if (!U_SUCCESS(err)) {
+ Parrot_allocate_string(interpreter, dest, sizeof(UChar) *
dest_len);
+ u_strFromUTF8(dest->strstart, dest_len,
+ &dest_len, src->strstart, src->bufused, &err);
+ assert(U_SUCCESS(err));
+ }
}
dest->bufused = dest_len * sizeof(UChar);
/* downgrade if possible */
if (dest_len == (int)src->strlen)
- src->encoding = Parrot_ucs2_encoding_ptr;
+ dest->encoding = Parrot_ucs2_encoding_ptr;
#else
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Thu Nov 10 09:18:21 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 39;
+use Parrot::Test tests => 41;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -634,6 +634,7 @@ output_is( <<'CODE', <<"OUTPUT", "utf16
CODE
\xc3\xb6t
OUTPUT
+
output_is( <<'CODE', <<"OUTPUT", "utf16 replace");
set S0, iso-8859-1:"Tötsch"
find_charset I0, "unicode"
@@ -653,3 +654,36 @@ CODE
\xc3\xb6
Toetsch
OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search");
+ set S0, iso-8859-1:"TÖTSCH"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ downcase S1
+ set S2, iso-8859-1:"öt"
+ index I0, S1, S2
+ print I0
+ print "\n"
+ end
+CODE
+1
+OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search");
+ set S0, iso-8859-1:"TÖTSCH"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ downcase S1
+ set S2, iso-8859-1:"öt"
+ index I0, S1, S2
+ print I0
+ print "\n"
+ concat S1, S2
+ index I0, S1, S2, 2
+ print I0
+ print "\n"
+ end
+CODE
+1
+6
+OUTPUT