Author: leo
Date: Tue Nov 15 03:12:43 2005
New Revision: 9980

Modified:
   trunk/src/string.c
   trunk/t/op/stringu.t
Log:
charsets and encodings - fix utf8:literals

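* utf8:unicode:"foo" takes a string constant that is already UTF-8 encoded
* such a string constant can now also contain escape sequences
* after unescaping, the result is validated; malformed input (e.g. an
  invalid UTF-8 sequence) raises an exception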


Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Tue Nov 15 03:12:43 2005
@@ -2439,50 +2439,46 @@ string_unescape_cstring(Interp * interpr
             internal_exception(UNIMPLEMENTED,
                     "Can't make '%s' charset strings", p + 1);
         }
-        /*
-         * XXX this is just wrong
-         *     we still need to unescape the string, then verify
-         *     that it is valid in the passed in encoding
-         *     then append the bytes w/o further processing to
-         *     the string buffer
-         *
-         * that is currently just fixed_8 encodings are correct
-         */
         result = string_make_direct(interpreter, cstring, clength,
                 encoding, charset, flags);
-        string_compute_strlen(interpreter, result);
+        encoding = Parrot_fixed_8_encoding_ptr;
     }
     else {
         result = string_make(interpreter, cstring, clength, enc_char, flags);
-        ENCODING_ITER_INIT(interpreter, result, &iter);
-        for (offs = d = 0; offs < clength; ++offs) {
-            r = (Parrot_UInt4)((unsigned char*)result->strstart)[offs];
-            /* There cannot be any NULs within this string.  */
-            assert(r != '\0');
-            if (r == '\\') {
-                ++offs;
-                r = string_unescape_one(interpreter, &offs, result);
-                --offs;
-            }
-            if (d == offs) {
-                /* we did it in place - no action */
-                ++d;
-                iter.bytepos++;
-                iter.charpos++;
-                continue;
-            }
-            assert(d < offs);
-            iter.set_and_advance(interpreter, &iter, r);
+        encoding = result->encoding;
+    }
+    encoding->iter_init(interpreter, result, &iter);
+    for (offs = d = 0; offs < clength; ++offs) {
+        r = (Parrot_UInt4)((unsigned char*)result->strstart)[offs];
+        /* There cannot be any NULs within this string.  */
+        assert(r != '\0');
+        if (r == '\\') {
+            ++offs;
+            r = string_unescape_one(interpreter, &offs, result);
+            --offs;
+        }
+        if (d == offs) {
+            /* we did it in place - no action */
             ++d;
+            iter.bytepos++;
+            iter.charpos++;
+            continue;
         }
-        result->strlen = d;
-        result->bufused = iter.bytepos;
-        encoding = result->encoding;
+        assert(d < offs);
+        iter.set_and_advance(interpreter, &iter, r);
+        ++d;
+    }
+    result->strlen = d;
+    result->bufused = iter.bytepos;
+    if (encoding != result->encoding) {
+        /* this also validates the string */
+        string_compute_strlen(interpreter, result);
     }
-    if (!CHARSET_VALIDATE(interpreter, result, 0)) {
-        internal_exception(INVALID_STRING_REPRESENTATION, "Malformed string");
+    else if (!CHARSET_VALIDATE(interpreter, result, 0)) {
+        internal_exception(INVALID_STRING_REPRESENTATION,
+                "Malformed string");
     }
-    if (encoding == Parrot_utf8_encoding_ptr) {
+    if (result->encoding == Parrot_utf8_encoding_ptr) {
         /* Pythonic unicode flag - get rid of that, Python will
          * probably need a second string class anyway
          */

Modified: trunk/t/op/stringu.t
==============================================================================
--- trunk/t/op/stringu.t        (original)
+++ trunk/t/op/stringu.t        Tue Nov 15 03:12:43 2005
@@ -18,7 +18,7 @@ Tests Parrot's unicode string system.
 
 #'
 
-use Parrot::Test tests => 17;
+use Parrot::Test tests => 19;
 use Test::More;
 #use vars qw($TODO);
 
@@ -188,9 +188,37 @@ output_is( <<'CODE', <<OUTPUT, "UTF8 lit
     length I0, S0
     print I0
     print "\n"
+    print S0
+    print "\n"
+    end
+CODE
+1
+\xc2\xab
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
+    set S0, utf8:unicode:"\xc2\xab"
+    length I0, S0
+    print I0
+    print "\n"
+    print S0
+    print "\n"
     end
 CODE
 1
+\xc2\xab
+OUTPUT
+
+output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
+    set S0, utf8:unicode:"\xf2\xab"
+    length I0, S0
+    print I0
+    print "\n"
+    print S0
+    print "\n"
+    end
+CODE
+/Malformed UTF-8 string/
 OUTPUT
 
 output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );

Reply via email to