Author: leo
Date: Wed Nov 16 03:01:26 2005
New Revision: 10018
Modified:
trunk/charset/unicode.c
trunk/t/op/string_cs.t
Log:
Possible ICU (3.2) bug, or not:
* upcase with buffer overflow truncates the src string
* see comment in charset/unicode.c:upcase() and
* new test t/op/string_cs_45.pasm
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Wed Nov 16 03:01:26 2005
@@ -99,24 +99,45 @@ u_strToUpper(UChar *dest, int32_t destCa
UErrorCode *pErrorCode);
*/
err = U_ZERO_ERROR;
+ /* use all available space - see below XXX */
+ /* TODO downcase, titlecase too */
+ dest_len = PObj_buflen(src) / sizeof(UChar);
src_len = src->bufused / sizeof(UChar);
- dest_len = u_strToUpper(src->strstart, src_len,
+ dest_len = u_strToUpper(src->strstart, dest_len,
src->strstart, src_len,
NULL, /* locale = default */
&err);
src->bufused = dest_len * sizeof(UChar);
if (!U_SUCCESS(err)) {
+ /*
+ * XXX troubles:
+ * t/op/string_cs_44 upcase unicode:"\u01f0"
+ * this creates \u004a \u030c J+NON-SPACING HACEK
+ * the string needs resizing, *if* the src buffer is
+ * too short. *But* (at least) with icu 3.2 the src string is
+ * overwritten with \0 even though the ICU docs say:
+ *
+ * The source string and the destination buffer
+ * are allowed to overlap.
+ */
err = U_ZERO_ERROR;
- Parrot_reallocate_string(interpreter, src, src->bufused);
+ Parrot_reallocate_string(interpreter, src, dest_len);
dest_len = u_strToUpper(src->strstart, dest_len,
src->strstart, src_len,
NULL, /* locale = default */
&err);
assert(U_SUCCESS(err));
+ src->bufused = dest_len * sizeof(UChar);
}
/* downgrade if possible */
if (dest_len == (int)src->strlen)
src->encoding = Parrot_ucs2_encoding_ptr;
+ else {
+ /* string is likely still ucs2 if it was earlier
+ * but strlen changed due to combining char
+ */
+ src->strlen = dest_len;
+ }
#else
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
"no ICU lib loaded");
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Wed Nov 16 03:01:26 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 49;
+use Parrot::Test tests => 50;
use Parrot::Config;
use Test::More;
@@ -730,6 +730,43 @@ CODE
HACEK J J\xcc\x8c
OUTPUT
+output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char 3.2 bug?");
+ getstdout P0 # need to convert back to utf8
+ push P0, "utf8" # push utf8 output layer
+ set S1, unicode:"\u01f0"
+ set I0, 5
+loop:
+ repeat S2, "_", I0
+ concat S2, S1
+ upcase S2
+ concat S2, '_'
+ print S2
+ print "\n"
+ inc I0
+ lt I0, 24, loop
+ end
+CODE
+_____J\xcc\x8c_
+______J\xcc\x8c_
+_______J\xcc\x8c_
+________J\xcc\x8c_
+_________J\xcc\x8c_
+__________J\xcc\x8c_
+___________J\xcc\x8c_
+____________J\xcc\x8c_
+_____________J\xcc\x8c_
+______________J\xcc\x8c_
+_______________J\xcc\x8c_
+________________J\xcc\x8c_
+_________________J\xcc\x8c_
+__________________J\xcc\x8c_
+___________________J\xcc\x8c_
+____________________J\xcc\x8c_
+_____________________J\xcc\x8c_
+______________________J\xcc\x8c_
+_______________________J\xcc\x8c_
+OUTPUT
+
output_is( <<'CODE', <<"OUTPUT", "unicode titlecase");
set S0, iso-8859-1:"t�tsch leo"
find_charset I0, "unicode"