Author: leo
Date: Tue Nov 15 08:54:33 2005
New Revision: 9989

Modified:
   trunk/include/parrot/string_funcs.h
   trunk/ops/experimental.ops
   trunk/src/string.c
   trunk/t/op/string_cs.t
Log:
new string op escape

* op escape(out STR, invar STR)
* should soon properly escape non-ascii chars
* some tests in t/op/string_cs.t

Please note that the opcode takes no string constants.



Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Tue Nov 15 08:54:33 2005
@@ -94,6 +94,8 @@ UINTVAL string_decode_and_advance(struct
 size_t string_hash(Interp *interpreter, STRING *s, size_t seed);
 STRING * string_unescape_cstring(Interp *,
         const char *cstring, char delimiter, const char *enc_or_charset);
+STRING * string_escape_string(Interp *, STRING *);
+STRING * string_escape_string_delimited(Interp *, STRING *, UINTVAL len);
 
 STRING *string_upcase(Interp *, const STRING *);
 STRING *string_downcase(Interp *, const STRING *);

Modified: trunk/ops/experimental.ops
==============================================================================
--- trunk/ops/experimental.ops  (original)
+++ trunk/ops/experimental.ops  Tue Nov 15 08:54:33 2005
@@ -234,7 +234,19 @@ inline op newclosure(out PMC, in PMC) {
   $1 = parrot_new_closure(interpreter, $2);
   goto NEXT();
 }
-###############################################################################
+#######################################################################
+
+=item B<escape>(out STR, invar STR)
+
+Escape non-ascii chars, some control chars and the double quote to
+backslash sequences. A new string with charset I<ascii> is the result.
+
+=cut
+
+op escape(out STR, invar STR) {
+  $1 = string_escape_string(interpreter, $2);  /* $1 receives the string returned for $2 */
+  goto NEXT();
+}
 
 =back
 

Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Tue Nov 15 08:54:33 2005
@@ -2383,7 +2383,113 @@ string_hash(Interp * interpreter, STRING
     return h;
 }
 
+/*
+
+=item C<STRING *
+string_escape_string(Interp * interpreter, STRING *src)>
+
+Escape non-ascii chars, some control chars and '"' to backslash sequences.
+
+=item C<STRING *
+string_escape_string_delimited(Interp * interpreter, STRING *src, UINTVAL limit)>
+
+Like above but escape only the first limit chars of src (used for trace output of strings).
 
+=cut
+
+*/
+
+STRING *
+string_escape_string(Interp * interpreter, STRING *src)
+{
+    const UINTVAL no_limit = ~(UINTVAL) 0;  /* all bits set: never truncates */
+    return string_escape_string_delimited(interpreter, src, no_limit);
+}
+
+/*
+ * Return a new fixed_8/ascii STRING holding an escaped copy of at most the
+ * first `limit` characters of `src`, or NULL if `src` is NULL.
+ *
+ * Chars >= 0x100 are written as \uhhhh and chars 0x80..0xff as \xhh.
+ * Of the 7-bit chars, \a \b \t \n \v \f ESC and '"' get a backslash form;
+ * all others, including the backslash itself, are copied unchanged.
+ *
+ * `limit` counts characters read from `src`; the result can be longer.
+ */
+STRING *
+string_escape_string_delimited(Interp * interpreter,
+        STRING *src, UINTVAL limit)
+{
+    /* longest sequence: backslash, letter, one hex digit per nibble of c */
+    const UINTVAL max_seq = 2 + 2 * sizeof (UINTVAL);
+    STRING *result, *hex;
+    UINTVAL c, i, len, charlen;
+    String_iter iter;
+    unsigned char *dp;
+
+    if (!src)
+        return NULL;
+    /* read at most `limit` characters of src */
+    len = src->strlen;
+    if (len > limit)
+        len = limit;
+    /* expect around 2x the chars */
+    charlen = 2 * len;
+    /* create ascii result */
+    result = string_make_direct(interpreter, NULL, charlen,
+            Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
+    /* more work TODO */
+    ENCODING_ITER_INIT(interpreter, src, &iter);
+    dp = result->strstart;
+    /* i is the write index into dp, len the number of chars still to read */
+    for (i = 0; len; --len) {
+        c = iter.get_and_advance(interpreter, &iter);
+        /* additive test: charlen - max_seq would wrap for short strings */
+        if (i + max_seq > charlen) {
+            /* grow the output string (result, never src) */
+            charlen = i + len * 2 + max_seq;
+            Parrot_reallocate_string(interpreter, result, charlen);
+            /* the buffer may have moved, so re-read its start */
+            dp = result->strstart;
+            assert(i + max_seq <= charlen);
+        }
+        if (c >= 0x80) {
+            /* non-ascii: \xhh below 0x100, \uhhhh from there on */
+            const char *fmt = c >= 0x100 ? "\\u%04x" : "\\x%02x";
+
+            /* publish the length written so far, then append the digits */
+            result->bufused = result->strlen = i;
+            hex = Parrot_sprintf_c(interpreter, fmt, c);
+            result = string_append(interpreter, result, hex, 0);
+            i += hex->strlen;
+            /* string_append may return another string or move the buffer.
+             * NOTE(review): assumes the capacity is still >= charlen
+             * afterwards, as room for max_seq was reserved above - confirm */
+            dp = result->strstart;
+        }
+        else  {
+            /* 7-bit: backslash-escape a few controls and the double quote */
+            switch (c) {
+                case '\a': dp[i++] = '\\'; c = 'a'; break;
+                case '\b': dp[i++] = '\\'; c = 'b'; break;
+                case '\t': dp[i++] = '\\'; c = 't'; break;
+                case '\n': dp[i++] = '\\'; c = 'n'; break;
+                case '\v': dp[i++] = '\\'; c = 'v'; break;
+                case '\f': dp[i++] = '\\'; c = 'f'; break;
+                case 27:   dp[i++] = '\\'; c = 'e'; break;  /* ESC */
+                case '"':  dp[i++] = '\\';          break;
+                default:
+                    break;      /* everything else is copied as is */
+            }
+            dp[i++] = c;
+        }
+        /* a completely full buffer is fine; only an overrun is an error */
+        assert(i <= charlen);
+    }
+    /* record how much of the buffer was actually written */
+    result->bufused = result->strlen = i;
+    return result;
+}
 /*
 
 =item C<STRING *

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Tue Nov 15 08:54:33 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 44;
+use Parrot::Test tests => 48;
 use Parrot::Config;
 use Test::More;
 
@@ -733,3 +733,43 @@ T\xc3\xb6tsch Leo
 OUTPUT
 
 }  # SKIP
+
+output_is( <<'CODE', <<'OUTPUT', "escape ascii" );  # expects the letters unchanged and the newline printed as a literal \n
+    set S0, "abcdefghi\n"
+    escape S1, S0
+    print S1
+    print "\n"
+    end
+CODE
+abcdefghi\n
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "escape ctrl" );  # expects each of \a \b \t \n \v printed as its two-character escape
+    set S0, "\a\b\t\n\v"
+    escape S1, S0
+    print S1
+    print "\n"
+    end
+CODE
+\a\b\t\n\v
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "escape latin1");  # o-umlaut is spelled \xf6 so the test does not depend on this file's encoding
+    set S0, iso-8859-1:"t\xf6tsch leo"
+    escape S1, S0
+    print S1
+    print "\n"
+    end
+CODE
+t\xf6tsch leo
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "escape unicode" );  # expects chars above 0xff printed as \u plus four hex digits
+    set S0, unicode:"\u2001\u2002\u2003\u2004"
+    escape S1, S0
+    print S1
+    print "\n"
+    end
+CODE
+\u2001\u2002\u2003\u2004
+OUTPUT

Reply via email to