Author: leo
Date: Thu Nov 10 09:18:21 2005
New Revision: 9885

Modified:
   trunk/encodings/ucs2.c
   trunk/encodings/utf16.c
   trunk/t/op/string_cs.t
Log:
unicode improvements - string_index

* string_index should now handle all charsets and encodings
* convert from latin1 to ucs2 too
* 2 tests

More tests welcome


Modified: trunk/encodings/ucs2.c
==============================================================================
--- trunk/encodings/ucs2.c      (original)
+++ trunk/encodings/ucs2.c      Thu Nov 10 09:18:21 2005
@@ -203,8 +203,8 @@ bytes(Interp *interpreter, STRING *src)
 static UINTVAL
 ucs2_decode_and_advance(Interp *interpreter, String_iter *i)
 {
-    UChar *s = (UChar*) i->str->strstart;
-    UINTVAL c, pos;
+    UChar *s = (UChar*) i->str->strstart, c;
+    size_t pos;
     pos = i->bytepos / sizeof(UChar);
     /* TODO either make sure that we don't go past end or use SAFE
      *      iter versions
@@ -221,7 +221,7 @@ ucs2_encode_and_advance(Interp *interpre
     UChar *s = (UChar*) i->str->strstart;
     UINTVAL pos;
     pos = i->bytepos / sizeof(UChar);
-    s[pos++] = c;
+    s[pos++] = (UChar)c;
     i->charpos++;
     i->bytepos = pos * sizeof(UChar);
 }

Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c     (original)
+++ trunk/encodings/utf16.c     Thu Nov 10 09:18:21 2005
@@ -64,19 +64,26 @@ to_encoding(Interp *interpreter, STRING 
        UErrorCode *pErrorCode);
      */
 #if PARROT_HAS_ICU
-    err = U_ZERO_ERROR;
-    /* XXX these inplace operations are all shit (sorry) */
+    /* need intermediate memory */
     p = mem_sys_allocate(src_len * sizeof(UChar));
-    u_strFromUTF8(p, src_len,
-            &dest_len, src->strstart, src->bufused, &err);
-    if (!U_SUCCESS(err)) {
-        /*
-         * have to resize - required len in UChars is in dest_len
-         */
-        p = mem_sys_realloc(p, dest_len * sizeof(UChar));
-        u_strFromUTF8(p, dest_len,
+    if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+        for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len) {
+            p[dest_len] = (UChar)((unsigned char*)src->strstart)[dest_len];
+        }
+    }
+    else {
+        err = U_ZERO_ERROR;
+        u_strFromUTF8(p, src_len,
                 &dest_len, src->strstart, src->bufused, &err);
-        assert(U_SUCCESS(err));
+        if (!U_SUCCESS(err)) {
+            /*
+             * have to resize - required len in UChars is in dest_len
+             */
+            p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+            u_strFromUTF8(p, dest_len,
+                    &dest_len, src->strstart, src->bufused, &err);
+            assert(U_SUCCESS(err));
+        }
     }
     src->bufused = dest_len * sizeof(UChar);
     Parrot_reallocate_string(interpreter, src, src->bufused);
@@ -103,36 +110,42 @@ copy_to_encoding(Interp *interpreter, ST
     int src_len;
 
     if (src->encoding == Parrot_utf16_encoding_ptr ||
-        src->encoding == Parrot_ucs2_encoding_ptr)
+            src->encoding == Parrot_ucs2_encoding_ptr)
         return string_copy(interpreter, src);
 
-    /*
-     * TODO adapt string creation functions
-     */
-    dest = new_string_header(interpreter, 0);
-    src_len = src->strlen;
-    dest->strlen   = src_len;
-    dest->charset  = Parrot_unicode_charset_ptr;
-    dest->encoding = Parrot_utf16_encoding_ptr;
+    src_len  = src->strlen;
     if (!src_len) {
-        Parrot_allocate_string(interpreter, dest, 0);
-        return dest;
+        return string_make_direct(interpreter, NULL, 0,
+                Parrot_utf16_encoding_ptr,
+                Parrot_unicode_charset_ptr, 0);
+
     }
 #if PARROT_HAS_ICU
-    Parrot_allocate_string(interpreter, dest, sizeof(UChar) * src_len);
-    err = U_ZERO_ERROR;
-    u_strFromUTF8(dest->strstart, src_len,
-            &dest_len, src->strstart, src->bufused, &err);
-    if (!U_SUCCESS(err)) {
-        Parrot_allocate_string(interpreter, dest, sizeof(UChar) * dest_len);
-        u_strFromUTF8(dest->strstart, dest_len,
+    dest = string_make_direct(interpreter, NULL, sizeof(UChar) * src_len,
+                Parrot_utf16_encoding_ptr,
+                Parrot_unicode_charset_ptr, 0);
+    dest->strlen = src_len;
+    if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+        UChar *p = (UChar*) dest->strstart;
+        for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len, ++p) {
+            *p = (UChar)((unsigned char*)src->strstart)[dest_len];
+        }
+    }
+    else {
+        err = U_ZERO_ERROR;
+        u_strFromUTF8(dest->strstart, src_len,
                 &dest_len, src->strstart, src->bufused, &err);
-        assert(U_SUCCESS(err));
+        if (!U_SUCCESS(err)) {
+            Parrot_allocate_string(interpreter, dest, sizeof(UChar) * dest_len);
+            u_strFromUTF8(dest->strstart, dest_len,
+                    &dest_len, src->strstart, src->bufused, &err);
+            assert(U_SUCCESS(err));
+        }
     }
     dest->bufused = dest_len * sizeof(UChar);
     /* downgrade if possible */
     if (dest_len == (int)src->strlen)
-        src->encoding = Parrot_ucs2_encoding_ptr;
+        dest->encoding = Parrot_ucs2_encoding_ptr;
 
 #else
     real_exception(interpreter, NULL, E_LibraryNotLoadedError,

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Thu Nov 10 09:18:21 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 39;
+use Parrot::Test tests => 41;
 use Test::More;
 
 output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -634,6 +634,7 @@ output_is( <<'CODE', <<"OUTPUT", "utf16 
 CODE
 \xc3\xb6t
 OUTPUT
+
 output_is( <<'CODE', <<"OUTPUT", "utf16 replace");
     set S0, iso-8859-1:"Tötsch"
     find_charset I0, "unicode"
@@ -653,3 +654,36 @@ CODE
 \xc3\xb6
 Toetsch
 OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search");
+    set S0, iso-8859-1:"TÖTSCH"
+    find_charset I0, "unicode"
+    trans_charset S1, S0, I0
+    downcase S1
+    set S2, iso-8859-1:"öt"
+    index I0, S1, S2
+    print I0
+    print "\n"
+    end
+CODE
+1
+OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search");
+    set S0, iso-8859-1:"TÖTSCH"
+    find_charset I0, "unicode"
+    trans_charset S1, S0, I0
+    downcase S1
+    set S2, iso-8859-1:"öt"
+    index I0, S1, S2
+    print I0
+    print "\n"
+    concat S1, S2
+    index I0, S1, S2, 2
+    print I0
+    print "\n"
+    end
+CODE
+1
+6
+OUTPUT

Reply via email to