Author: leo
Date: Wed Nov 9 14:26:49 2005
New Revision: 9873
Modified:
trunk/include/parrot/charset.h
trunk/include/parrot/string_funcs.h
trunk/ops/ops.num
trunk/ops/string.ops
trunk/src/charset.c
trunk/src/encoding.c
trunk/src/string.c
trunk/t/op/string_cs.t
Log:
encodings - opcodes
* create needed encoding opcodes
* fix a bug in encoding bootstrapping:
registering charsets needs encodings (and charsets) for charset names
registering encodings needs charsets (and encodings) for encoding names
* add a test that uses trans_encoding instead of utf8 filter
More tests are very welcome. Thanks.
Modified: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h (original)
+++ trunk/include/parrot/charset.h Wed Nov 9 14:26:49 2005
@@ -150,7 +150,7 @@ struct _charset {
#define CHARSET_COMPUTE_HASH(interp, source, seed) ((CHARSET
*)source->charset)->compute_hash(interpreter, source, seed)
#define CHARSET_GET_PREFERRED_ENCODING(interp, source) ((CHARSET
*)source->charset)->preferred_encoding
-#define CHARSET_TO_ENCODING(interp, source, offset, count) ((ENCODING
*)source->encoding)->to_encoding(interp, source, offset, count)
+#define CHARSET_TO_ENCODING(interp, source) ((ENCODING
*)source->encoding)->to_encoding(interp, source)
#define CHARSET_COPY_TO_ENCODING(interp, source) ((ENCODING
*)source->encoding)->copy_to_encoding(interp, source)
#define CHARSET_GET_CODEPOINT(interp, source, offset) ((ENCODING
*)source->encoding)->get_codepoint(interp, source, offset)
#define CHARSET_SET_CODEPOINT(interp, source, offset, codepoint) ((ENCODING
*)source->encoding)->set_codepoint(interp, source, offset, codepoint)
Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Wed Nov 9 14:26:49 2005
@@ -120,6 +120,8 @@ INTVAL Parrot_string_find_word_boundary(
STRING* Parrot_string_trans_charset(Interp *, STRING *src,
INTVAL charset_nr, STRING *dest);
+STRING* Parrot_string_trans_encoding(Interp *, STRING *src,
+ INTVAL encoding_nr, STRING *dest);
CHARSET* string_rep_compatible (Interp *, STRING *a, const STRING *b,
ENCODING**);
Modified: trunk/ops/ops.num
==============================================================================
--- trunk/ops/ops.num (original)
+++ trunk/ops/ops.num Wed Nov 9 14:26:49 2005
@@ -1227,3 +1227,15 @@ if_null_p_ic 1196
if_null_s_ic 1197
unless_null_p_ic 1198
unless_null_s_ic 1199
+encoding_i_s 1200
+encoding_i_sc 1201
+encodingname_s_i 1202
+encodingname_s_ic 1203
+find_encoding_i_s 1204
+find_encoding_i_sc 1205
+trans_encoding_s_i 1206
+trans_encoding_s_ic 1207
+trans_encoding_s_s_i 1208
+trans_encoding_s_s_ic 1209
+trans_encoding_s_sc_i 1210
+trans_encoding_s_sc_ic 1211
Modified: trunk/ops/string.ops
==============================================================================
--- trunk/ops/string.ops (original)
+++ trunk/ops/string.ops Wed Nov 9 14:26:49 2005
@@ -583,6 +583,61 @@ op trans_charset(out STR, in STR, in INT
goto NEXT();
}
+=item B<encoding>(out INT, in STR)
+
+Return the encoding number of string $2.
+
+=item B<encodingname>(out STR, in INT)
+
+Return the name of encoding numbered $2.
+
+=item B<find_encoding>(out INT, in STR)
+
+Return the encoding number of the encoding named $2. If the encoding doesn't
+exist, throw an exception.
+
+=item B<trans_encoding>(inout STR, in INT)
+
+Change the string to have the specified encoding.
+
+=item B<trans_encoding>(out STR, in STR, in INT)
+
+Create a string $1 from $2 with the specified encoding.
+
+Both functions may throw an exception on information loss.
+
+=cut
+
+op encoding(out INT, in STR) :base_core {
+ $1 = Parrot_encoding_number_of_str(interpreter, $2); /* $1 receives the encoding number reported for string $2 */
+ goto NEXT();
+}
+
+op encodingname(out STR, in INT) :base_core {
+ $1 = string_copy(interpreter, Parrot_encoding_name(interpreter, $2)); /* $1 is a string_copy() of the name, not the string Parrot_encoding_name() returned; presumably to keep the registered name unshared - TODO confirm */
+ goto NEXT();
+}
+
+op find_encoding(out INT, in STR) :base_core {
+ INTVAL n = Parrot_encoding_number(interpreter, $2); /* look up the encoding named $2 */
+ if (n < 0) /* a negative number is treated as "no such encoding" */
+ real_exception(interpreter, NULL, 1,
+ "encoding '%Ss' not found", $2);
+ $1 = n; /* store the number that was found */
+ goto NEXT();
+}
+
+op trans_encoding(inout STR, in INT) {
+ $1 = Parrot_string_trans_encoding(interpreter, $1, $2, NULL); /* NULL dest selects the in-place variant; $1 is both source and result */
+ goto NEXT();
+}
+
+op trans_encoding(out STR, in STR, in INT) {
+ STRING *dest = new_string_header(interpreter, 0); /* new string header passed to the converter as dest */
+ $1 = Parrot_string_trans_encoding(interpreter, $2, $3, dest); /* $1 takes the converter's return value, which is not necessarily dest */
+ goto NEXT();
+}
+
=item B<is_cclass>(out INT, in INT, in STR, in INT)
Set $1 to 1 if the codepoint of $3 at position $4 is in
Modified: trunk/src/charset.c
==============================================================================
--- trunk/src/charset.c (original)
+++ trunk/src/charset.c Wed Nov 9 14:26:49 2005
@@ -238,6 +238,8 @@ Parrot_register_charset(Interp *interpre
return 0;
}
+void parrot_init_encodings_2(Interp *interpreter);
+
void
Parrot_charsets_encodings_init(Interp *interpreter)
{
@@ -253,6 +255,11 @@ Parrot_charsets_encodings_init(Interp *i
Parrot_charset_iso_8859_1_init(interpreter);
Parrot_charset_binary_init(interpreter);
Parrot_charset_unicode_init(interpreter);
+
+ /*
+ * the encoding name strings don't have a charset yet - set the default now
+ */
+ parrot_init_encodings_2(interpreter);
/*
* now install charset converters
*/
Modified: trunk/src/encoding.c
==============================================================================
--- trunk/src/encoding.c (original)
+++ trunk/src/encoding.c Wed Nov 9 14:26:49 2005
@@ -32,6 +32,18 @@ typedef struct {
static All_encodings *all_encodings;
+void parrot_init_encodings_2(Interp *interpreter);
+void
+parrot_init_encodings_2(Interp *interpreter)
+{ /*
+ * Second step of encoding setup: walk every entry of all_encodings and
+ * point the charset field of its name string at Parrot_default_charset_ptr.
+ * The interpreter argument is not used in this body.
+ */
+ int i, n;
+
+ n = all_encodings->n_encodings; /* number of entries to visit */
+ for (i = 0; i < n; ++i) {
+ all_encodings->enc[i].name->charset = Parrot_default_charset_ptr;
+ }
+}
+
void
parrot_deinit_encodings(Interp *interpreter)
{
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Wed Nov 9 14:26:49 2005
@@ -2679,6 +2679,39 @@ Parrot_string_trans_charset(Interp *inte
return CHARSET_TO_CHARSET(interpreter, src, new_charset, dest);
}
+STRING*
+Parrot_string_trans_encoding(Interp *interpreter, STRING *src,
+ INTVAL encoding_nr, STRING *dest)
+{ /*
+ * Convert src to the encoding registered under number encoding_nr.
+ *
+ * - src == NULL: returns NULL.
+ * - no encoding for encoding_nr: real_exception() is called with
+ *   INVALID_CHARTYPE.
+ * - dest != NULL, src already in that encoding:
+ *   Parrot_reuse_COW_reference() is called on src and dest, dest's
+ *   encoding field is set, and dest is returned.
+ * - dest != NULL, different encoding: the result of the new encoding's
+ *   copy_to_encoding() on src is returned; dest is not used on this path.
+ * - dest == NULL: src is returned, after the new encoding's to_encoding()
+ *   has been called on it unless it already has that encoding.
+ */
+ ENCODING *new_encoding;
+
+ if (!src)
+ return NULL;
+ new_encoding = Parrot_get_encoding(interpreter, encoding_nr);
+ if (!new_encoding) /* NOTE(review): code below uses new_encoding unguarded, so this relies on real_exception() not returning - confirm */
+ real_exception(interpreter, NULL, INVALID_CHARTYPE,
+ "encoding #%d not found", (int) encoding_nr);
+ /*
+ * dest is an empty string header or NULL, if an inplace
+ * operation is desired
+ */
+ if (dest) {
+ if (new_encoding == src->encoding) { /* nothing to convert */
+ Parrot_reuse_COW_reference(interpreter, src, dest);
+ dest->encoding = new_encoding;
+ return dest;
+ }
+ return new_encoding->copy_to_encoding(interpreter, src); /* dest is left untouched here */
+ }
+ else {
+ if (new_encoding == src->encoding) { /* in-place request, already in the target encoding */
+ return src;
+ }
+ }
+ new_encoding->to_encoding(interpreter, src); /* only reached with dest == NULL; return value is ignored */
+ return src;
+}
+
/*
=back
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Wed Nov 9 14:26:49 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 32;
+use Parrot::Test tests => 33;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -516,5 +516,19 @@ output_is( <<'CODE', <<"OUTPUT", "unicod
print "\n"
end
CODE
+t\xc3\xb6tsch
+OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transcharset");
+ set S0, iso-8859-1:"TÖTSCH"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ downcase S1
+ find_encoding I0, "utf8"
+ trans_encoding S2, S1, I0
+ print S2
+ print "\n"
+ end
+CODE
t\xc3\xb6tsch
OUTPUT