Author: leo
Date: Thu Nov 10 08:12:19 2005
New Revision: 9884
Modified:
trunk/src/string.c
trunk/t/op/string_cs.t
Log:
unicode improvements - string_replace
* string_replace should now handle all charsets and encodings
* 2 tests
More tests welcome
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Thu Nov 10 08:12:19 2005
@@ -1152,17 +1152,6 @@ string_replace(Interp *interpreter, STRI
true_offset = (UINTVAL)offset;
true_length = (UINTVAL)length;
-
- /* may have different reps..... */
- if ( !(cs = string_rep_compatible(interpreter, src, rep, &enc))) {
- internal_exception(UNIMPLEMENTED,
- "Cross-type string replace (%s/%s) (%s/%s) unsupported",
- ((ENCODING *)(src->encoding))->name,
- ((CHARSET *)(src->charset))->name,
- ((ENCODING *)(rep->encoding))->name,
- ((CHARSET *)(rep->charset))->name);
- }
-
/* abs(-offset) may not be > strlen-1 */
if (offset < 0) {
true_offset = (UINTVAL)(src->strlen + offset);
@@ -1180,6 +1169,23 @@ string_replace(Interp *interpreter, STRI
true_length = (UINTVAL)(src->strlen - true_offset);
}
+ /* Save the substring that is replaced for the return value */
+ if (d != NULL) {
+ dest = CHARSET_GET_CODEPOINTS(interpreter, src,
+ true_offset, true_length);
+ *d = dest;
+ }
+
+ /* may have different reps..... */
+ if ( !(cs = string_rep_compatible(interpreter, src, rep, &enc))) {
+ Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+ rep = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, rep);
+ }
+ else {
+ src->charset = cs;
+ src->encoding = enc;
+ }
+
/* get byte position of the part that will be replaced */
ENCODING_ITER_INIT(interpreter, src, &iter);
iter.set_position(interpreter, &iter, true_offset);
@@ -1192,26 +1198,7 @@ string_replace(Interp *interpreter, STRI
internal_exception(SUBSTR_OUT_OF_STRING,
"replace: subend somehow is less than substart");
}
- /* Save the substring that is replaced for the return value */
- if (d != NULL) {
- UINTVAL length_bytes = string_max_bytes(interpreter, src, true_length);
-
- dest = string_make_empty(interpreter, enum_stringrep_one, length_bytes);
- dest->charset = src->charset;
- dest->encoding = src->encoding;
-
- mem_sys_memcopy(dest->strstart,
- (char *)src->strstart + start_byte,
- end_byte - start_byte);
-
- dest->bufused = end_byte - start_byte;
- dest->strlen = true_length;
-
- *d = dest;
- }
- src->charset = cs;
- src->encoding = enc;
/* Now do the replacement */
/*
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Thu Nov 10 08:12:19 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 37;
+use Parrot::Test tests => 39;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -618,3 +618,38 @@ CODE
10 20
T\xc3\xb6tsch Leo
OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "utf16 substr");
+ set S0, iso-8859-1:"Tötsch"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ find_encoding I0, "utf16"
+ trans_encoding S1, S1, I0
+ substr S2, S1, 1, 2
+ find_encoding I0, "utf8"
+ trans_encoding S2, S2, I0
+ print S2
+ print "\n"
+ end
+CODE
+\xc3\xb6t
+OUTPUT
+output_is( <<'CODE', <<"OUTPUT", "utf16 replace");
+ set S0, iso-8859-1:"Tötsch"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ find_encoding I0, "utf16"
+ trans_encoding S1, S1, I0
+ substr S2, S1, 1, 1, "oe"
+ find_encoding I0, "utf8"
+ trans_encoding S2, S2, I0
+ trans_encoding S1, S1, I0
+ print S2
+ print "\n"
+ print S1
+ print "\n"
+ end
+CODE
+\xc3\xb6
+Toetsch
+OUTPUT