Author: leo
Date: Thu Nov 10 06:06:25 2005
New Revision: 9878

Modified:
   trunk/encodings/ucs2.c
   trunk/src/string.c
   trunk/t/op/string_cs.t
Log:
unicode improvements - concat, append

* string_concat and string_append should now handle all charsets
  and encodings
* use bytes in string_capacity instead of chars

A few tests are in t/op/string_cs.t


Modified: trunk/encodings/ucs2.c
==============================================================================
--- trunk/encodings/ucs2.c      (original)
+++ trunk/encodings/ucs2.c      Thu Nov 10 06:06:25 2005
@@ -168,8 +168,13 @@ become_encoding(Interp *interpreter, STR
 static UINTVAL
 codepoints(Interp *interpreter, STRING *src)
 {
-    UNIMPL;
+#if PARROT_HAS_ICU
+    return src->bufused / sizeof(UChar);
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
     return 0;
+#endif
 }
 
 static UINTVAL

Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Thu Nov 10 06:06:25 2005
@@ -356,7 +356,8 @@ string_deinit(Parrot_Interp interpreter)
 =item C<UINTVAL
 string_capacity(Interp *interpreter, STRING *s)>
 
-Returns the capacity of the specified Parrot string.
+Returns the capacity of the specified Parrot string in bytes, that
+is how many bytes can be appended onto strstart.
 
 =cut
 
@@ -365,10 +366,8 @@ Returns the capacity of the specified Pa
 UINTVAL
 string_capacity(Interp *interpreter, STRING *s)
 {
-    saneify_string(s);
     return ((ptrcast_t)PObj_bufstart(s) + PObj_buflen(s) -
-            (ptrcast_t)s->strstart) /
-        ENCODING_MAX_BYTES_PER_CODEPOINT(interpreter, s);
+            (ptrcast_t)s->strstart);
 }
 
 /*
@@ -475,47 +474,45 @@ string_append(Interp *interpreter,
     saneify_string(a);
     saneify_string(b);
 
-    /* If the destination's constant, then just fall back to
+    /* If the destination's constant, or external then just fall back to
        string_concat */
-    if (PObj_constant_TEST(a)) {
+    if (PObj_is_cowed_TESTALL(a)) {
         return string_concat(interpreter, a, b, Uflags);
     }
 
+    if ( (cs = string_rep_compatible(interpreter, a, b, NULL)))
+        a->charset = cs;
+    else {
+        /* upgrade to utf16 */
+        Parrot_utf16_encoding_ptr->to_encoding(interpreter, a);
+        b = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, b);
+        /*
+         * result could be mixed ucs2 / utf16
+         */
+        if (b->encoding == Parrot_utf16_encoding_ptr)
+            a->encoding = Parrot_utf16_encoding_ptr;
+    }
+    /*
+     * calc usable and total bytes
+     */
     a_capacity = string_capacity(interpreter, a);
-    total_length = string_length(interpreter, a) + b_len;
+    total_length = a->bufused + b->bufused;
 
-    /* make sure A's big enough for both */
-    if (a_capacity < total_length)
-    {
-        a = string_grow(interpreter, a,
-                (total_length - a_capacity) + EXTRA_SIZE);
-    }
-    else {
-        Parrot_unmake_COW(interpreter, a);
+    /* make sure A's big enough for both  */
+    if (a_capacity < total_length) {
+        Parrot_reallocate_string(interpreter, a,
+                total_length + EXTRA_SIZE);
     }
 
     /* A is now ready to receive the contents of B */
 
-    /* if compatible rep, can memcopy */
-    if ( (cs = string_rep_compatible(interpreter, a, b, NULL))) {
-        a->charset = cs;
-        /* Tack B on the end of A */
-        mem_sys_memcopy((void *)((ptrcast_t)a->strstart + a->bufused),
-                b->strstart, b->bufused);
-
-        a->bufused += b->bufused;
-        a->strlen += b_len;
-        return a;
-    }
-    else {
-        internal_exception(UNIMPLEMENTED,
-                "Cross-type string appending (%s/%s) (%s/%s) unsupported",
-                ((ENCODING *)(a->encoding))->name,
-                ((CHARSET *)(a->charset))->name,
-                ((ENCODING *)(b->encoding))->name,
-                ((CHARSET *)(b->charset))->name);
-    }
-
+    /* Tack B on the end of A */
+    mem_sys_memcopy((void *)((ptrcast_t)a->strstart + a->bufused),
+            b->strstart, b->bufused);
+
+    a->bufused += b->bufused;
+    a->strlen += b_len;
+    a->hashval = 0;
     return a;
 }
 
@@ -675,36 +672,29 @@ string_make_direct(Interp *interpreter, 
         void * __ptr;
     } __ptr_u;
 
-    /*    PIO_eprintf(NULL, "string_make(): length = %ld, encoding name = %s, 
buffer = %s\n",
-          len, charset, (const char *)buffer); */
-
-    if (len && !buffer) {
-        internal_exception(BAD_BUFFER_SIZE,
-                "string_make: buffer pointer NULL, but length nonzero");
-    }
-
     s = new_string_header(interpreter, flags);
     s->encoding = encoding;
     s->charset = charset;
 
-    if (encoding == Parrot_fixed_8_encoding_ptr &&
-            charset == Parrot_ascii_charset_ptr) {
+    if (flags & PObj_external_FLAG) {
         /*
          * fast path for external (constant) strings - don't allocate
          * and copy data
          */
-        if (flags & PObj_external_FLAG) {
-            /* The following cast discards the 'const'.  That raises
-               a warning with gcc, but is ok since the caller indicated
-               it was safe by setting PObj_external_FLAG.
-               (The cast is necessary to pacify TenDRA's tcc.)
-               */
-            PObj_bufstart(s) = s->strstart = const_cast(buffer);
-            PObj_buflen(s)   = s->strlen = s->bufused = len;
-            PObj_bufstart_external_SET(s);
+        /* The following cast discards the 'const'.  That raises
+           a warning with gcc, but is ok since the caller indicated
+           it was safe by setting PObj_external_FLAG.
+           (The cast is necessary to pacify TenDRA's tcc.)
+           */
+        PObj_bufstart(s) = s->strstart = const_cast(buffer);
+        PObj_buflen(s)   = s->bufused = len;
+        PObj_bufstart_external_SET(s);
+        if (encoding == Parrot_fixed_8_encoding_ptr)
+            s->strlen = len;
+        else
+            string_compute_strlen(interpreter, s);
 
-            return s;
-        }
+        return s;
     }
 
     Parrot_allocate_string(interpreter, s, len);
@@ -712,7 +702,10 @@ string_make_direct(Interp *interpreter, 
     if (buffer) {
         mem_sys_memcopy(s->strstart, buffer, len);
         s->bufused = len;
-        string_compute_strlen(interpreter, s);
+        if (encoding == Parrot_fixed_8_encoding_ptr)
+            s->strlen = len;
+        else
+            string_compute_strlen(interpreter, s);
     }
     else {
         s->strlen = s->bufused = 0;
@@ -991,8 +984,9 @@ string_concat(Interp *interpreter,
     if (a != NULL && a->strlen != 0) {
         if (b != NULL && b->strlen != 0) {
             STRING *result =
-                string_make_empty(interpreter, enum_stringrep_one,
-                        a->strlen + b->strlen);
+                string_make_direct(interpreter, NULL,
+                        a->bufused + b->bufused,
+                        a->encoding, a->charset, 0);
 
             string_append(interpreter, result, a, Uflags);
             string_append(interpreter, result, b, Uflags);

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Thu Nov 10 06:06:25 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 35;
+use Parrot::Test tests => 37;
 use Test::More;
 
 output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -572,3 +572,49 @@ output_is( <<'CODE', <<"OUTPUT", "chopn 
 CODE
 TT 2 2
 OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 append");
+    set S0, iso-8859-1:"T�tsch"
+    find_charset I0, "unicode"
+    trans_charset S1, S0, I0
+    find_encoding I0, "utf16"
+    trans_encoding S1, S1, I0
+    concat S1, " Leo"
+    length I0, S1
+    print_item I0
+    .include "stringinfo.pasm"
+    stringinfo I0, S1, .STRINGINFO_BUFUSED
+    print_item I0
+    print_newline
+    find_encoding I0, "utf8"
+    trans_encoding S2, S1, I0
+    print S2
+    print "\n"
+    end
+CODE
+10 20
+T\xc3\xb6tsch Leo
+OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 concat");
+    set S0, iso-8859-1:"T�tsch"
+    find_charset I0, "unicode"
+    trans_charset S1, S0, I0
+    find_encoding I0, "utf16"
+    trans_encoding S1, S1, I0
+    concat S2, S1, " Leo"
+    length I0, S2
+    print_item I0
+    .include "stringinfo.pasm"
+    stringinfo I0, S2, .STRINGINFO_BUFUSED
+    print_item I0
+    print_newline
+    find_encoding I0, "utf8"
+    trans_encoding S2, S2, I0
+    print S2
+    print "\n"
+    end
+CODE
+10 20
+T\xc3\xb6tsch Leo
+OUTPUT

Reply via email to