Author: leo
Date: Tue Nov 15 08:54:33 2005
New Revision: 9989
Modified:
trunk/include/parrot/string_funcs.h
trunk/ops/experimental.ops
trunk/src/string.c
trunk/t/op/string_cs.t
Log:
new string op escape
* op escape(out STR, invar STR)
* should soon properly escape non-ascii chars
* some tests in t/op/string_cs.t
Please note that the opcode takes no string constants.
Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Tue Nov 15 08:54:33 2005
@@ -94,6 +94,8 @@ UINTVAL string_decode_and_advance(struct
size_t string_hash(Interp *interpreter, STRING *s, size_t seed);
STRING * string_unescape_cstring(Interp *,
const char *cstring, char delimiter, const char *enc_or_charset);
+/* escape a string; the _delimited form processes at most limit chars of it */
+STRING * string_escape_string(Interp *, STRING *);
+STRING * string_escape_string_delimited(Interp *, STRING *, UINTVAL limit);
STRING *string_upcase(Interp *, const STRING *);
STRING *string_downcase(Interp *, const STRING *);
Modified: trunk/ops/experimental.ops
==============================================================================
--- trunk/ops/experimental.ops (original)
+++ trunk/ops/experimental.ops Tue Nov 15 08:54:33 2005
@@ -234,7 +234,19 @@ inline op newclosure(out PMC, in PMC) {
$1 = parrot_new_closure(interpreter, $2);
goto NEXT();
}
-###############################################################################
+#######################################################################
+
+=item B<escape>(out STR, invar STR)
+
+Escape all non-ascii chars to backslashed escape sequences. A
+string with charset I<ascii> is created as result.
+
+=cut
+
+op escape(out STR, invar STR) {
+ /* $1 is set to whatever string_escape_string() returns for $2 */
+ $1 = string_escape_string(interpreter, $2);
+ goto NEXT();
+}
=back
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Tue Nov 15 08:54:33 2005
@@ -2383,7 +2383,113 @@ string_hash(Interp * interpreter, STRING
return h;
}
+/*
+
+=item C<STRING *
+string_escape_string(Interp * interpreter, STRING *src)>
+
+Escape all non-ascii chars to backslash sequences.
+
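+=item C<STRING *
+string_escape_string_delimited(Interp * interpreter, STRING *src, UINTVAL limit)>
+
+Like above, but escape at most the first C<limit> chars of C<src> (used for
+trace output of strings). The result can be longer than C<limit>, because one
+source char may expand to several output chars.
+
+=cut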
+
+*/
+
+STRING *
+string_escape_string(Interp * interpreter, STRING *src)
+{
+    /* all bits set: the largest UINTVAL, so the cap never cuts src short */
+    const UINTVAL no_limit = ~(UINTVAL) 0;
+
+    return string_escape_string_delimited(interpreter, src, no_limit);
+}
+
+/*
+ * Write c as lower-case hex digits to dst, zero-padded on the left to at
+ * least min_digits digits. Returns the number of bytes written; that is at
+ * most 2 * sizeof (UINTVAL) as long as min_digits does not exceed it.
+ */
+static UINTVAL
+escape_put_hex(unsigned char *dst, UINTVAL c, UINTVAL min_digits)
+{
+    static const char hex_digits[] = "0123456789abcdef";
+    UINTVAL ndigits = 1;
+    UINTVAL rest, k;
+
+    for (rest = c >> 4; rest; rest >>= 4)
+        ++ndigits;
+    if (ndigits < min_digits)
+        ndigits = min_digits;
+    /* fill from the least significant digit backwards */
+    for (k = ndigits; k; --k) {
+        dst[k - 1] = (unsigned char) hex_digits[c & 0xf];
+        c >>= 4;
+    }
+    return ndigits;
+}
+
+/*
+ * Build a new fixed_8/ascii string holding the escaped form of at most the
+ * first limit chars of src. Returns NULL when src is NULL.
+ *
+ *  - code points >= 0x100 become \u followed by at least 4 hex digits
+ *  - code points 0x80..0xff become \x followed by 2 hex digits
+ *  - \a \b \t \n \v \f, ESC (27) and '"' become two-char backslash escapes
+ *  - every other code point below 0x80 is copied unchanged
+ *
+ * NOTE(review): backslash and \r fall into the "copied unchanged" group, so
+ * the output cannot always be unescaped unambiguously -- confirm intended.
+ */
+STRING *
+string_escape_string_delimited(Interp * interpreter,
+        STRING *src, UINTVAL limit)
+{
+    /* longest sequence: backslash, type letter, 2 hex digits per byte of c */
+    const UINTVAL max_seq = 2 + 2 * sizeof (UINTVAL);
+    STRING *result;
+    UINTVAL c, i, len, charlen;
+    String_iter iter;
+    unsigned char *dp;
+    unsigned char esc;
+
+    if (!src)
+        return NULL;
+    len = src->strlen;
+    if (len > limit)
+        len = limit;
+    /* expect around 2x the chars, plus room for one longest sequence */
+    charlen = 2 * len + max_seq;
+    /* create ascii result */
+    result = string_make_direct(interpreter, NULL, charlen,
+            Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
+    ENCODING_ITER_INIT(interpreter, src, &iter);
+    dp = result->strstart;
+    for (i = 0; len; --len) {
+        c = iter.get_and_advance(interpreter, &iter);
+        /* i <= charlen always holds here, so the subtraction cannot wrap */
+        if (charlen - i < max_seq) {
+            /* record what has been written so far, in case only the used
+             * part of the buffer survives the reallocation */
+            result->bufused = result->strlen = i;
+            /* len still counts the current char */
+            charlen = i + len * 2 + max_seq;
+            /* grow the output string, not the input */
+            Parrot_reallocate_string(interpreter, result, charlen);
+            /* the buffer may have moved */
+            dp = result->strstart;
+        }
+        if (c >= 0x100) {
+            dp[i++] = '\\';
+            dp[i++] = 'u';
+            i += escape_put_hex(dp + i, c, 4);
+        }
+        else if (c >= 0x80) {
+            dp[i++] = '\\';
+            dp[i++] = 'x';
+            i += escape_put_hex(dp + i, c, 2);
+        }
+        else {
+            /* esc is the letter that follows the backslash, 0 for none */
+            switch (c) {
+                case '\a': esc = 'a'; break;
+                case '\b': esc = 'b'; break;
+                case '\t': esc = 't'; break;
+                case '\n': esc = 'n'; break;
+                case '\v': esc = 'v'; break;
+                case '\f': esc = 'f'; break;
+                case 27:   esc = 'e'; break;
+                case '"':  esc = '"'; break;
+                default:   esc = 0;   break;
+            }
+            if (esc)
+                dp[i++] = '\\';
+            dp[i++] = esc ? esc : (unsigned char) c;
+        }
+        assert(i <= charlen);
+    }
+    result->bufused = result->strlen = i;
+    return result;
+}
/*
=item C<STRING *
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Tue Nov 15 08:54:33 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 44;
+use Parrot::Test tests => 48;
use Parrot::Config;
use Test::More;
@@ -733,3 +733,43 @@ T\xc3\xb6tsch Leo
OUTPUT
} # SKIP
+
+# Printable ASCII is expected to pass through untouched; the trailing newline
+# is expected back as the two characters backslash and n.
+output_is( <<'CODE', <<'OUTPUT', "escape ascii" );
+ set S0, "abcdefghi\n"
+ escape S1, S0
+ print S1
+ print "\n"
+ end
+CODE
+abcdefghi\n
+OUTPUT
+
+# Each of the control characters \a \b \t \n \v is expected back as its
+# two-character backslash escape.
+output_is( <<'CODE', <<'OUTPUT', "escape ctrl" );
+ set S0, "\a\b\t\n\v"
+ escape S1, S0
+ print S1
+ print "\n"
+ end
+CODE
+\a\b\t\n\v
+OUTPUT
+
+# Latin-1 o-umlaut (0xf6) lies in 0x80..0xff, so it is expected back as \xf6.
+# The input is spelled with a \x escape so that this file stays pure ASCII and
+# the test does not depend on the encoding the file is stored or mailed in.
+output_is( <<'CODE', <<'OUTPUT', "escape latin1");
+    set S0, iso-8859-1:"t\xf6tsch leo"
+    escape S1, S0
+    print S1
+    print "\n"
+    end
+CODE
+t\xf6tsch leo
+OUTPUT
+
+# Code points at or above 0x100 (here U+2001..U+2004) are expected back as
+# \u followed by four hex digits.
+output_is( <<'CODE', <<'OUTPUT', "escape unicode" );
+ set S0, unicode:"\u2001\u2002\u2003\u2004"
+ escape S1, S0
+ print S1
+ print "\n"
+ end
+CODE
+\u2001\u2002\u2003\u2004
+OUTPUT