Attached is a patch to add string comparison ops, along with the necessary infrastructure in the string code.
The current behaviour is that if the two strings do not have the same encoding then both are promoted to UTF-32 before comparison as that should generally preserve information. Tom -- Tom Hughes ([EMAIL PROTECTED]) http://www.compton.nu/
? t/tom.pasm Index: basic_opcodes.ops =================================================================== RCS file: /home/perlcvs/parrot/basic_opcodes.ops,v retrieving revision 1.36 diff -u -w -r1.36 basic_opcodes.ops --- basic_opcodes.ops 2001/10/08 14:04:20 1.36 +++ basic_opcodes.ops 2001/10/09 23:46:56 @@ -604,6 +604,90 @@ AUTO_OP concat_s { STRING *s = string_concat(STR_REG(P1), STR_REG(P2), 1); STR_REG(P1) = s; +} + +/* EQ Sx, Sy, EQ_BRANCH */ +MANUAL_OP eq_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) == 0) { + RETURN(INT_CONST(P3)); + } +} + +/* EQ Sx, CONSTANT, EQ_BRANCH */ +MANUAL_OP eq_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) == 0) { + RETURN(INT_CONST(P3)); + } +} + +/* NE Sx, Sy, NE_BRANCH */ +MANUAL_OP ne_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) != 0) { + RETURN(INT_CONST(P3)); + } +} + +/* NE Sx, CONSTANT, NE_BRANCH */ +MANUAL_OP ne_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) != 0) { + RETURN(INT_CONST(P3)); + } +} + +/* LT Sx, Sy, LT_BRANCH */ +MANUAL_OP lt_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) < 0) { + RETURN(INT_CONST(P3)); + } +} + +/* LT Sx, CONSTANT, LT_BRANCH */ +MANUAL_OP lt_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) < 0) { + RETURN(INT_CONST(P3)); + } +} + +/* LE Sx, Sy, LE_BRANCH */ +MANUAL_OP le_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) <= 0) { + RETURN(INT_CONST(P3)); + } +} + +/* LE Sx, CONSTANT, LE_BRANCH */ +MANUAL_OP le_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) <= 0) { + RETURN(INT_CONST(P3)); + } +} + +/* GT Sx, Sy, GT_BRANCH */ +MANUAL_OP gt_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) > 0) { + RETURN(INT_CONST(P3)); + } +} + +/* GT Sx, CONSTANT, GT_BRANCH */ +MANUAL_OP gt_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) > 0) { + RETURN(INT_CONST(P3)); + } +} + +/* GE Sx, Sy, GE_BRANCH */ +MANUAL_OP ge_s_ic { + if (string_compare(STR_REG(P1), STR_REG(P2)) >= 0) { + RETURN(INT_CONST(P3)); + } +} + +/* GE Sx, CONSTANT, GE_BRANCH */ +MANUAL_OP ge_sc_ic { + if (string_compare(STR_REG(P1), STR_CONST(P2)) >= 0) { + RETURN(INT_CONST(P3)); + } } /* NOOP */ Index: opcode_table =================================================================== RCS file: /home/perlcvs/parrot/opcode_table,v retrieving revision 1.24 diff -u -w -r1.24 opcode_table --- opcode_table 2001/10/08 13:45:21 1.24 +++ opcode_table 2001/10/09 23:46:57 @@ -67,7 +67,7 @@ substr_s_s_i 4 S S I I concat_s 2 S S -# Comparators (TODO: String comparators) +# Comparators eq_i_ic 3 I I D eq_ic_ic 3 I i D @@ -94,6 +94,19 @@ gt_nc_ic 3 N n D ge_n_ic 3 N N D ge_nc_ic 3 N n D + +eq_s_ic 3 S S D +eq_sc_ic 3 S s D +ne_s_ic 3 S S D +ne_sc_ic 3 S s D +lt_s_ic 3 S S D +lt_sc_ic 3 S s D +le_s_ic 3 S S D +le_sc_ic 3 S s D +gt_s_ic 3 S S D +gt_sc_ic 3 S s D +ge_s_ic 3 S S D +ge_sc_ic 3 S s D # Flow control Index: string.c =================================================================== RCS file: /home/perlcvs/parrot/string.c,v retrieving revision 1.12 diff -u -w -r1.12 string.c --- string.c 2001/10/08 07:49:10 1.12 +++ string.c 2001/10/09 23:46:57 @@ -152,6 +152,23 @@ return (ENC_VTABLE(s)->chopn)(s, n); } +/*=for api string string_compare + * compare two strings. + */ +INTVAL +string_compare(STRING* s1, STRING* s2) { + if (s1->encoding != s2->encoding) { + if (s1->encoding->which != enc_utf32) { + s1 = Parrot_transcode_table[s1->encoding->which][enc_utf32](s1, NULL); + } + if (s2->encoding->which != enc_utf32) { + s2 = Parrot_transcode_table[s2->encoding->which][enc_utf32](s2, NULL); + } + } + + return (ENC_VTABLE(s1)->compare)(s1, s2); +} + /* * Local variables: * c-indentation-style: bsd Index: strnative.c =================================================================== RCS file: /home/perlcvs/parrot/strnative.c,v retrieving revision 1.15 diff -u -w -r1.15 strnative.c --- strnative.c 2001/10/08 07:49:10 1.15 +++ strnative.c 2001/10/09 23:46:58 @@ -82,6 +82,25 @@ return dest; } +/*=for api string_native string_native_compare + compare two strings +*/ +static INTVAL +string_native_compare(STRING* s1, STRING* s2) { + INTVAL cmp; + + if (s1->bufused < s2->bufused) { + cmp = memcmp(s1->bufstart, s2->bufstart, s1->bufused); + if (cmp == 0) cmp == -1; + } + else { + cmp = memcmp(s1->bufstart, s2->bufstart, s2->bufused); + if (cmp ==0 && s1->bufused > s2->bufused) cmp = 1; + } + + return cmp; +} + /*=for api string_native string_native_vtable return the vtable for the native string */ @@ -94,6 +113,7 @@ string_native_concat, string_native_chopn, string_native_substr, + string_native_compare, }; return sv; } Index: strutf16.c =================================================================== RCS file: /home/perlcvs/parrot/strutf16.c,v retrieving revision 1.1 diff -u -w -r1.1 strutf16.c --- strutf16.c 2001/10/08 07:50:52 1.1 +++ strutf16.c 2001/10/09 23:46:59 @@ -123,6 +123,37 @@ return dest; } +/*=for api string_utf16 string_utf16_compare + compare two strings +*/ +static INTVAL +string_utf16_compare(STRING* s1, STRING* s2) { + utf16_t *s1start = s1->bufstart; + utf16_t *s1end = s1start + s1->bufused / sizeof(utf16_t); + utf16_t *s2start = s2->bufstart; + utf16_t *s2end = s2start + s2->bufused / sizeof(utf16_t); + INTVAL cmp = 0; + + while (cmp == 0 && s1start < s1end && s2start < s2end) { + utf32_t c1 = *s1start++; + utf32_t c2 = *s2start++; + + if (UNICODE_IS_HIGH_SURROGATE(c1)) { + c1 = UNICODE_DECODE_SURROGATE(c1, *s1start++); + } + if (UNICODE_IS_HIGH_SURROGATE(c2)) { + c2 = UNICODE_DECODE_SURROGATE(c2, *s2start++); + } + + cmp = c1 - c2; + } + + if (cmp == 0 && s1start < s1end) cmp = 1; + if (cmp == 0 && s2start < s2end) cmp = -1; + + return cmp; +} + /*=for api string_utf16 string_utf16_vtable return the vtable for the native string */ @@ -135,6 +166,7 @@ string_utf16_concat, string_utf16_chopn, string_utf16_substr, + string_utf16_compare, }; return sv; } Index: strutf32.c =================================================================== RCS file: /home/perlcvs/parrot/strutf32.c,v retrieving revision 1.1 diff -u -w -r1.1 strutf32.c --- strutf32.c 2001/10/08 07:50:52 1.1 +++ strutf32.c 2001/10/09 23:46:59 @@ -11,6 +11,7 @@ */ #include "parrot/parrot.h" +#include "parrot/unicode.h" /* Functions for handling strings in UTF-32 format */ @@ -80,6 +81,27 @@ return dest; } +/*=for api string_utf32 string_utf32_compare + compare two strings +*/ +static INTVAL +string_utf32_compare(STRING* s1, STRING* s2) { + utf32_t *s1start = s1->bufstart; + utf32_t *s1end = s1start + s1->strlen; + utf32_t *s2start = s2->bufstart; + utf32_t *s2end = s2start + s2->strlen; + INTVAL cmp = 0; + + while (cmp == 0 && s1start < s1end && s2start < s2end) { + cmp = *s1start++ - *s2start++; + } + + if (cmp == 0 && s1start < s1end) cmp = 1; + if (cmp == 0 && s2start < s2end) cmp = -1; + + return cmp; +} + /*=for api string_utf32 string_utf32_vtable return the vtable for the native string */ @@ -92,6 +114,7 @@ string_utf32_concat, string_utf32_chopn, string_utf32_substr, + string_utf32_compare, }; return sv; } Index: strutf8.c =================================================================== RCS file: /home/perlcvs/parrot/strutf8.c,v retrieving revision 1.1 diff -u -w -r1.1 strutf8.c --- strutf8.c 2001/10/08 07:50:52 1.1 +++ strutf8.c 2001/10/09 23:47:00 @@ -124,6 +124,58 @@ return dest; } +/*=for api string_utf8 string_utf8_compare + compare two strings +*/ +static INTVAL +string_utf8_compare(STRING* s1, STRING* s2) { + utf8_t *s1start = s1->bufstart; + utf8_t *s1end = s1start + s1->bufused; + utf8_t *s2start = s2->bufstart; + utf8_t *s2end = s2start + s2->bufused; + INTVAL cmp = 0; + + while (cmp == 0 && s1start < s1end && s2start < s2end) { + utf32_t c1 = *s1start++; + utf32_t c2 = *s2start++; + + if (UTF8_IS_START(c1)) { + INTVAL len = UTF8SKIP(s1start - 1); + INTVAL count; + + c1 &= UTF8_START_MASK(len); + for (count = 1; count < len; count++) { + if (!UTF8_IS_CONTINUATION(*s1start)) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + c1 = UTF8_ACCUMULATE(c1, *s1start++); + } + } + + if (UTF8_IS_START(c2)) { + INTVAL len = UTF8SKIP(s2start - 1); + INTVAL count; + + c2 &= UTF8_START_MASK(len); + for (count = 1; count < len; count++) { + if (!UTF8_IS_CONTINUATION(*s2start)) { + INTERNAL_EXCEPTION(MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + c2 = UTF8_ACCUMULATE(c1, *s2start++); + } + } + + cmp = c1 - c2; + } + + if (cmp == 0 && s1start < s1end) cmp = 1; + if (cmp == 0 && s2start < s2end) cmp = -1; + + return cmp; +} + /*=for api string_utf8 string_utf8_vtable return the vtable for the native string */ @@ -136,6 +188,7 @@ string_utf8_concat, string_utf8_chopn, string_utf8_substr, + string_utf8_compare, }; return sv; } Index: include/parrot/string.h =================================================================== RCS file: /home/perlcvs/parrot/include/parrot/string.h,v retrieving revision 1.4 diff -u -w -r1.4 string.h --- include/parrot/string.h 2001/10/08 07:49:03 1.4 +++ include/parrot/string.h 2001/10/09 23:47:04 @@ -33,6 +33,7 @@ typedef STRING* (*two_strings_iv_to_string_t)(STRING *, STRING *, INTVAL); typedef STRING* (*substr_t)(STRING*, INTVAL, INTVAL, STRING*); typedef INTVAL (*iv_to_iv_t)(INTVAL); +typedef INTVAL (*two_strings_to_iv_t)(STRING*, STRING*); struct string_vtable { encoding_t which; /* What sort of encoding is this? */ @@ -41,6 +42,7 @@ two_strings_iv_to_string_t concat; /* Append string b to the end of string a */ string_iv_to_string_t chopn; /* Remove n characters from the end of a string */ substr_t substr; /* Substring operation */ + two_strings_to_iv_t compare; /* Compare operation */ }; struct parrot_string { @@ -67,6 +69,8 @@ string_chopn(STRING*, INTVAL); STRING* string_substr(STRING*, INTVAL, INTVAL, STRING**); +INTVAL +string_compare(STRING*, STRING*); /* Declarations of other functions */ INTVAL