[svn:parrot] r10018 - in trunk: charset t/op

leo Wed, 16 Nov 2005 03:01:42 -0800

Author: leo
Date: Wed Nov 16 03:01:26 2005
New Revision: 10018

Modified:
   trunk/charset/unicode.c
   trunk/t/op/string_cs.t
Log:
icu (3.2) bug or not


* upcase with buffer overflow truncates the src string
* see comment in charset/unicode.c:upcase() and
* new test t/op/string_cs_45.pasm



Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c     (original)
+++ trunk/charset/unicode.c     Wed Nov 16 03:01:26 2005
@@ -99,24 +99,45 @@ u_strToUpper(UChar *dest, int32_t destCa
              UErrorCode *pErrorCode);
      */
     err = U_ZERO_ERROR;
+    /* use all available space - see below XXX */
+    /* TODO downcase, titlecase too */
+    dest_len = PObj_buflen(src) / sizeof(UChar);
     src_len = src->bufused / sizeof(UChar);
-    dest_len = u_strToUpper(src->strstart, src_len,
+    dest_len = u_strToUpper(src->strstart, dest_len,
             src->strstart, src_len,
             NULL,       /* locale = default */
             &err);
     src->bufused = dest_len * sizeof(UChar);
     if (!U_SUCCESS(err)) {
+        /*
+         * XXX troubles:
+         *   t/op/string_cs_44  upcase unicode:"\u01f0"
+         *   this creates \u004a \u030c J+NON-SPACING HACEK
+         *   the string needs resizing, *if* the src buffer is
+         *   too short. *But* (at least) with icu 3.2 the src string is
+         *   overwritten with \0 even though the icu docs say:
+         *
+         *      The source string and the destination buffer
+         *      are allowed to overlap.
+         */
         err = U_ZERO_ERROR;
-        Parrot_reallocate_string(interpreter, src, src->bufused);
+        Parrot_reallocate_string(interpreter, src, dest_len);
         dest_len = u_strToUpper(src->strstart, dest_len,
                 src->strstart, src_len,
                 NULL,       /* locale = default */
                 &err);
         assert(U_SUCCESS(err));
+        src->bufused = dest_len * sizeof(UChar);
     }
     /* downgrade if possible */
     if (dest_len == (int)src->strlen)
         src->encoding = Parrot_ucs2_encoding_ptr;
+    else {
+        /* string is likely still ucs2 if it was earlier
+         * but strlen changed due to combining char
+         */
+        src->strlen = dest_len;
+    }
 #else
     real_exception(interpreter, NULL, E_LibraryNotLoadedError,
             "no ICU lib loaded");

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Wed Nov 16 03:01:26 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 49;
+use Parrot::Test tests => 50;
 use Parrot::Config;
 use Test::More;
 
@@ -730,6 +730,43 @@ CODE
 HACEK J J\xcc\x8c
 OUTPUT
 
+output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char 3.2 bug?");
+    getstdout P0          # need to convert back to utf8
+    push P0, "utf8"       # push utf8 output layer
+    set S1, unicode:"\u01f0"
+    set I0, 5
+loop:
+    repeat S2, "_", I0
+    concat S2, S1
+    upcase S2
+    concat S2, '_'
+    print S2
+    print "\n"
+    inc I0
+    lt I0, 24, loop
+    end
+CODE
+_____J\xcc\x8c_
+______J\xcc\x8c_
+_______J\xcc\x8c_
+________J\xcc\x8c_
+_________J\xcc\x8c_
+__________J\xcc\x8c_
+___________J\xcc\x8c_
+____________J\xcc\x8c_
+_____________J\xcc\x8c_
+______________J\xcc\x8c_
+_______________J\xcc\x8c_
+________________J\xcc\x8c_
+_________________J\xcc\x8c_
+__________________J\xcc\x8c_
+___________________J\xcc\x8c_
+____________________J\xcc\x8c_
+_____________________J\xcc\x8c_
+______________________J\xcc\x8c_
+_______________________J\xcc\x8c_
+OUTPUT
+
 output_is( <<'CODE', <<"OUTPUT", "unicode titlecase");
     set S0, iso-8859-1:"t�tsch leo"
     find_charset I0, "unicode"

[svn:parrot] r10018 - in trunk: charset t/op

Reply via email to