Author: leo
Date: Tue Nov 15 03:12:43 2005
New Revision: 9980
Modified:
trunk/src/string.c
trunk/t/op/stringu.t
Log:
charsets and encodings - fix utf8:literals
* utf8:unicode:"foo" takes a string constant that is already UTF-8 encoded
* the string can now also contain escape sequences, which are unescaped
* the resulting string is then verified to be valid in its encoding
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Tue Nov 15 03:12:43 2005
@@ -2439,50 +2439,46 @@ string_unescape_cstring(Interp * interpr
internal_exception(UNIMPLEMENTED,
"Can't make '%s' charset strings", p + 1);
}
- /*
- * XXX this is just wrong
- * we still need to unescape the string, then verify
- * that it is valid in the passed in encoding
- * then append the bytes w/o further processing to
- * the string buffer
- *
- * that is currently just fixed_8 encodings are correct
- */
result = string_make_direct(interpreter, cstring, clength,
encoding, charset, flags);
- string_compute_strlen(interpreter, result);
+ encoding = Parrot_fixed_8_encoding_ptr;
}
else {
result = string_make(interpreter, cstring, clength, enc_char, flags);
- ENCODING_ITER_INIT(interpreter, result, &iter);
- for (offs = d = 0; offs < clength; ++offs) {
- r = (Parrot_UInt4)((unsigned char*)result->strstart)[offs];
- /* There cannot be any NULs within this string. */
- assert(r != '\0');
- if (r == '\\') {
- ++offs;
- r = string_unescape_one(interpreter, &offs, result);
- --offs;
- }
- if (d == offs) {
- /* we did it in place - no action */
- ++d;
- iter.bytepos++;
- iter.charpos++;
- continue;
- }
- assert(d < offs);
- iter.set_and_advance(interpreter, &iter, r);
+ encoding = result->encoding;
+ }
+ encoding->iter_init(interpreter, result, &iter);
+ for (offs = d = 0; offs < clength; ++offs) {
+ r = (Parrot_UInt4)((unsigned char*)result->strstart)[offs];
+ /* There cannot be any NULs within this string. */
+ assert(r != '\0');
+ if (r == '\\') {
+ ++offs;
+ r = string_unescape_one(interpreter, &offs, result);
+ --offs;
+ }
+ if (d == offs) {
+ /* we did it in place - no action */
++d;
+ iter.bytepos++;
+ iter.charpos++;
+ continue;
}
- result->strlen = d;
- result->bufused = iter.bytepos;
- encoding = result->encoding;
+ assert(d < offs);
+ iter.set_and_advance(interpreter, &iter, r);
+ ++d;
+ }
+ result->strlen = d;
+ result->bufused = iter.bytepos;
+ if (encoding != result->encoding) {
+ /* this also validates the string */
+ string_compute_strlen(interpreter, result);
}
- if (!CHARSET_VALIDATE(interpreter, result, 0)) {
- internal_exception(INVALID_STRING_REPRESENTATION, "Malformed string");
+ else if (!CHARSET_VALIDATE(interpreter, result, 0)) {
+ internal_exception(INVALID_STRING_REPRESENTATION,
+ "Malformed string");
}
- if (encoding == Parrot_utf8_encoding_ptr) {
+ if (result->encoding == Parrot_utf8_encoding_ptr) {
/* Pythonic unicode flag - get rid of that, Python will
* probably need a second string class anyway
*/
Modified: trunk/t/op/stringu.t
==============================================================================
--- trunk/t/op/stringu.t (original)
+++ trunk/t/op/stringu.t Tue Nov 15 03:12:43 2005
@@ -18,7 +18,7 @@ Tests Parrot's unicode string system.
#'
-use Parrot::Test tests => 17;
+use Parrot::Test tests => 19;
use Test::More;
#use vars qw($TODO);
@@ -188,9 +188,37 @@ output_is( <<'CODE', <<OUTPUT, "UTF8 lit
length I0, S0
print I0
print "\n"
+ print S0
+ print "\n"
+ end
+CODE
+1
+\xc2\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
+ set S0, utf8:unicode:"\xc2\xab"
+ length I0, S0
+ print I0
+ print "\n"
+ print S0
+ print "\n"
end
CODE
1
+\xc2\xab
+OUTPUT
+
+output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
+ set S0, utf8:unicode:"\xf2\xab"
+ length I0, S0
+ print I0
+ print "\n"
+ print S0
+ print "\n"
+ end
+CODE
+/Malformed UTF-8 string/
OUTPUT
output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );