cvsuser 05/02/28 09:17:57
Modified: charset ascii.c ascii.h iso-8859-1.c iso-8859-1.h
encodings fixed_8.c
include/parrot string_funcs.h
ops ops.num string.ops
src string.c
t/op string_cs.t
Log:
Strings. Finally. 4 - character classification
* is_digit, _wordchar, _punctuation, _newline, _whitespace
opcodes and interface functions
* make a bitmask from typetables
Revision Changes Path
1.10 +106 -44 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- ascii.c 28 Feb 2005 15:10:55 -0000 1.9
+++ ascii.c 28 Feb 2005 17:17:51 -0000 1.10
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.9 2005/02/28 15:10:55 leo Exp $
+$Id: ascii.c,v 1.10 2005/02/28 17:17:51 leo Exp $
=head1 NAME
@@ -21,21 +21,26 @@
/* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
+#define WHITESPACE 1
+#define WORDCHAR 2
+#define PUNCTUATION 4
+#define DIGIT 8
+
static const unsigned char typetable[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
- 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
- 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
- 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
+ 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
+ 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4,
/*48.*/
+ 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
+ 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160-175 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176-191 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192-207 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 207-223 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208-223 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224-239 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240-255 */
};
@@ -44,19 +49,13 @@
ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
unsigned char type, const unsigned char *table)
{
- INTVAL retval = -1;
- INTVAL found = 0;
for (; start < string->strlen; start++) {
if (table[ENCODING_GET_CODEPOINT(interpreter, string, start)] ==
type) {
- found = 1;
- break;
+ return start;
}
}
- if (found) {
- retval = start;
- }
- return retval;
+ return -1;
}
INTVAL
@@ -251,112 +250,175 @@
return -1;
}
-/* Binary's always valid */
static UINTVAL
-validate(Interp *interpreter, STRING *source_string)
+validate(Interp *interpreter, STRING *src)
{
+ UINTVAL codepoint, offset;
+
+ for (offset = 0; offset < string_length(interpreter, src); ++offset) {
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, src, offset);
+ if (codepoint >= 0x80)
+ return 0;
+ }
return 1;
}
-/* No word chars in binary data */
+
static INTVAL
is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
}
static INTVAL
find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return -1;
+ return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
+ typetable);
}
static INTVAL
find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return offset;
+ return ascii_find_not_thing(interpreter, source_string, offset, WORDCHAR,
+ typetable);
}
static INTVAL
is_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == WHITESPACE);
}
static INTVAL
find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return -1;
+ return ascii_find_thing(interpreter, source_string, offset, WHITESPACE,
+ typetable);
}
static INTVAL
find_not_whitespace(Interp *interpreter, STRING *source_string,
UINTVAL offset)
{
- return offset;
+ return ascii_find_not_thing(interpreter, source_string, offset,
+ WHITESPACE, typetable);
}
static INTVAL
is_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] & DIGIT) ? 1 : 0;
}
static INTVAL
find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return -1;
+ return ascii_find_thing(interpreter, source_string, offset, DIGIT,
+ typetable);
}
static INTVAL
find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return offset;
+ return ascii_find_not_thing(interpreter, source_string, offset, DIGIT,
+ typetable);
}
static INTVAL
is_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return (typetable[codepoint] == PUNCTUATION);
}
static INTVAL
find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return -1;
+ return ascii_find_thing(interpreter, source_string, offset, PUNCTUATION,
+ typetable);
}
static INTVAL
find_not_punctuation(Interp *interpreter, STRING *source_string,
UINTVAL offset)
{
- return offset;
+ return ascii_find_not_thing(interpreter, source_string, offset,
+ PUNCTUATION, typetable);
}
-static INTVAL
-is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
+INTVAL
+ascii_is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return 0;
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+ return codepoint == 10;
}
-static INTVAL
-find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
+
+INTVAL
+ascii_find_newline(Interp *interpreter, STRING *string, UINTVAL start)
{
+ for (; start < string->strlen; start++) {
+ if (ENCODING_GET_CODEPOINT(interpreter, string, start) == 10) {
+ return start;
+ }
+ }
return -1;
}
-static INTVAL
-find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
+INTVAL
+ascii_find_not_newline(Interp *interpreter, STRING *string, UINTVAL start)
+{
+ for (; start < string->strlen; start++) {
+ if (ENCODING_GET_CODEPOINT(interpreter, string, start) != 10) {
+ return start;
+ }
+ }
+ return -1;
+}
+
+INTVAL
+ascii_find_word_boundary(Interp *interpreter, STRING *string,
+ UINTVAL offset, const unsigned char *table)
{
- return offset;
+ UINTVAL c, len;
+ int is_wc1, is_wc2;
+
+ len = string->strlen;
+ if (!len)
+ return -1;
+ c = ENCODING_GET_CODEPOINT(interpreter, string, offset);
+ is_wc1 = (table[c] & WORDCHAR) ? 1 : 0;
+ /* begin of string */
+ if (!offset && is_wc1)
+ return 0;
+ for (++offset; offset < len; offset++) {
+ c = ENCODING_GET_CODEPOINT(interpreter, string, offset);
+ is_wc2 = (table[c] & WORDCHAR) ? 1 : 0;
+ if (is_wc1 ^ is_wc2)
+ return offset - 1;
+ is_wc1 = is_wc2;
+ }
+ /* end of string */
+ if (is_wc1 && offset == len)
+ return offset - 1;
+ return -1;
}
static INTVAL
find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL
offset)
{
- return -1;
+ return ascii_find_word_boundary(interpreter, source_string,
+ offset, typetable);
}
-
static STRING *
string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
{
@@ -419,9 +481,9 @@
is_punctuation,
find_punctuation,
find_not_punctuation,
- is_newline,
- find_newline,
- find_not_newline,
+ ascii_is_newline,
+ ascii_find_newline,
+ ascii_find_not_newline,
find_word_boundary,
string_from_codepoint,
compute_hash,
1.8 +6 -5 parrot/charset/ascii.h
Index: ascii.h
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.h,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- ascii.h 28 Feb 2005 15:10:55 -0000 1.7
+++ ascii.h 28 Feb 2005 17:17:51 -0000 1.8
@@ -1,7 +1,7 @@
/* ascii.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: ascii.h,v 1.7 2005/02/28 15:10:55 leo Exp $
+ * $Id: ascii.h,v 1.8 2005/02/28 17:17:51 leo Exp $
* Overview:
* This is the header for the ascii charset functions
* Data Structure and Algorithms:
@@ -27,6 +27,11 @@
UINTVAL offset, UINTVAL count);
STRING *ascii_get_graphemes_inplace(Interp *, STRING *source_string,
STRING *dest_string, UINTVAL offset, UINTVAL count);
+INTVAL ascii_is_newline(Interp *, STRING *source_string, UINTVAL offset);
+INTVAL ascii_find_newline(Interp *, STRING *source_string, UINTVAL offset);
+INTVAL ascii_find_not_newline(Interp *, STRING *source_string, UINTVAL
offset);
+INTVAL ascii_find_word_boundary(Interp *, STRING *source_string,
+ UINTVAL offset, const unsigned char *typetable);
static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset,
UINTVAL replace_count, STRING *insert_string);
static void to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
@@ -56,10 +61,6 @@
static INTVAL is_punctuation(Interp *, STRING *source_string, UINTVAL
offset);
static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL
offset);
static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL
offset);
-static INTVAL is_newline(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_newline(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_newline(Interp *, STRING *source_string, UINTVAL
offset);
-static INTVAL find_word_boundary(Interp *, STRING *source_string, UINTVAL
offset);
static size_t compute_hash(Interp *, STRING *source_string);
CHARSET *Parrot_charset_ascii_init(Interp *);
1.8 +29 -35 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- iso-8859-1.c 28 Feb 2005 15:10:55 -0000 1.7
+++ iso-8859-1.c 28 Feb 2005 17:17:51 -0000 1.8
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.7 2005/02/28 15:10:55 leo Exp $
+$Id: iso-8859-1.c,v 1.8 2005/02/28 17:17:51 leo Exp $
=head1 NAME
@@ -23,22 +23,22 @@
#define WHITESPACE 1
#define WORDCHAR 2
-#define PUNCTUATION 3
-#define DIGIT 4
+#define PUNCTUATION 4
+#define DIGIT 8
static const unsigned char typetable[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
- 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
- 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
- 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
+ 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
+ 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4, /*
48 */
+ 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
+ 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
- 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 160-175 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, /* 176-191 */
+ 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 160-175 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, /* 176-191 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
@@ -274,26 +274,33 @@
return retval;
}
-/* Binary's always valid */
+
static UINTVAL
-validate(Interp *interpreter, STRING *source_string)
+validate(Interp *interpreter, STRING *src)
{
+ UINTVAL codepoint, offset;
+
+ for (offset = 0; offset < string_length(interpreter, src); ++offset) {
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, src, offset);
+ if (codepoint >= 0x100)
+ return 0;
+ }
return 1;
}
-/* No word chars in binary data */
static INTVAL
is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == WORDCHAR);
+ return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
}
static INTVAL
find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
typetable);
+ return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
+ typetable);
}
static INTVAL
@@ -331,7 +338,7 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == DIGIT);
+ return (typetable[codepoint] & DIGIT) ? 1 : 0;
}
static INTVAL
@@ -372,20 +379,6 @@
}
static INTVAL
-is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
-{
- UINTVAL codepoint;
- codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return codepoint == 13;
-}
-
-static INTVAL
-find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
-{
- return -1;
-}
-
-static INTVAL
find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return offset;
@@ -394,7 +387,8 @@
static INTVAL
find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL
offset)
{
- return -1;
+ return ascii_find_word_boundary(interpreter, source_string,
+ offset, typetable);
}
static STRING *
@@ -460,9 +454,9 @@
is_punctuation,
find_punctuation,
find_not_punctuation,
- is_newline,
- find_newline,
- find_not_newline,
+ ascii_is_newline,
+ ascii_find_newline,
+ ascii_find_not_newline,
find_word_boundary,
string_from_codepoint,
compute_hash,
1.6 +1 -5 parrot/charset/iso-8859-1.h
Index: iso-8859-1.h
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- iso-8859-1.h 28 Feb 2005 15:10:55 -0000 1.5
+++ iso-8859-1.h 28 Feb 2005 17:17:51 -0000 1.6
@@ -1,7 +1,7 @@
/* iso_8859_1.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: iso-8859-1.h,v 1.5 2005/02/28 15:10:55 leo Exp $
+ * $Id: iso-8859-1.h,v 1.6 2005/02/28 17:17:51 leo Exp $
* Overview:
* This is the header for the iso_8859-1 charset functions
* Data Structure and Algorithms:
@@ -41,10 +41,6 @@
static INTVAL is_punctuation(Interp *interpreter, STRING *source_string,
UINTVAL offset);
static INTVAL find_punctuation(Interp *interpreter, STRING *source_string,
UINTVAL offset);
static INTVAL find_not_punctuation(Interp *interpreter, STRING
*source_string, UINTVAL offset);
-static INTVAL is_newline(Interp *interpreter, STRING *source_string, UINTVAL
offset);
-static INTVAL find_newline(Interp *interpreter, STRING *source_string,
UINTVAL offset);
-static INTVAL find_not_newline(Interp *interpreter, STRING *source_string,
UINTVAL offset);
-static INTVAL find_word_boundary(Interp *interpreter, STRING *source_string,
UINTVAL offset);
static size_t compute_hash(Interp *interpreter, STRING *source_string);
CHARSET *Parrot_charset_iso_8859_1_init(Interp *interpreter);
1.8 +3 -3 parrot/encodings/fixed_8.c
Index: fixed_8.c
===================================================================
RCS file: /cvs/public/parrot/encodings/fixed_8.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- fixed_8.c 27 Feb 2005 12:12:25 -0000 1.7
+++ fixed_8.c 28 Feb 2005 17:17:53 -0000 1.8
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: fixed_8.c,v 1.7 2005/02/27 12:12:25 leo Exp $
+$Id: fixed_8.c,v 1.8 2005/02/28 17:17:53 leo Exp $
=head1 NAME
@@ -51,7 +51,7 @@
static UINTVAL
get_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset)
{
- char *contents = source_string->strstart;
+ unsigned char *contents = source_string->strstart;
if (offset >= source_string->bufused) {
internal_exception(0,
"get_byte past the end of the buffer (%i of %i)",
@@ -64,7 +64,7 @@
set_byte(Interp *interpreter, const STRING *source_string,
UINTVAL offset, UINTVAL byte)
{
- char *contents;
+ unsigned char *contents;
if (offset >= source_string->bufused) {
internal_exception(0, "set_byte past the end of the buffer");
}
1.48 +7 -1 parrot/include/parrot/string_funcs.h
Index: string_funcs.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -r1.47 -r1.48
--- string_funcs.h 27 Feb 2005 09:58:43 -0000 1.47
+++ string_funcs.h 28 Feb 2005 17:17:54 -0000 1.48
@@ -1,7 +1,7 @@
/* string_funcs.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string_funcs.h,v 1.47 2005/02/27 09:58:43 leo Exp $
+ * $Id: string_funcs.h,v 1.48 2005/02/28 17:17:54 leo Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -102,6 +102,12 @@
void string_downcase_inplace(Interp *, STRING *);
void string_titlecase_inplace(Interp *, STRING *);
+INTVAL Parrot_string_is_whitespace(Interp *, STRING *, INTVAL offset);
+INTVAL Parrot_string_is_digit(Interp *, STRING *, INTVAL offset);
+INTVAL Parrot_string_is_wordchar(Interp *, STRING *, INTVAL offset);
+INTVAL Parrot_string_is_punctuation(Interp *, STRING *, INTVAL offset);
+INTVAL Parrot_string_is_newline(Interp *, STRING *, INTVAL offset);
+
#endif /* PARROT_IN_CORE */
#endif /* PARROT_STRING_FUNCS_H_GUARD */
1.58 +20 -0 parrot/ops/ops.num
Index: ops.num
===================================================================
RCS file: /cvs/public/parrot/ops/ops.num,v
retrieving revision 1.57
retrieving revision 1.58
diff -u -r1.57 -r1.58
--- ops.num 28 Feb 2005 13:35:45 -0000 1.57
+++ ops.num 28 Feb 2005 17:17:55 -0000 1.58
@@ -1378,3 +1378,23 @@
charsetname_s_ic 1348
find_charset_i_s 1349
find_charset_i_sc 1350
+is_whitespace_i_s_i 1351
+is_whitespace_i_s_ic 1352
+is_whitespace_i_sc_i 1353
+is_whitespace_i_sc_ic 1354
+is_digit_i_s_i 1355
+is_digit_i_s_ic 1356
+is_digit_i_sc_i 1357
+is_digit_i_sc_ic 1358
+is_wordchar_i_s_i 1359
+is_wordchar_i_s_ic 1360
+is_wordchar_i_sc_i 1361
+is_wordchar_i_sc_ic 1362
+is_punctuation_i_s_i 1363
+is_punctuation_i_s_ic 1364
+is_punctuation_i_sc_i 1365
+is_punctuation_i_sc_ic 1366
+is_newline_i_s_i 1367
+is_newline_i_s_ic 1368
+is_newline_i_sc_i 1369
+is_newline_i_sc_ic 1370
1.33 +48 -0 parrot/ops/string.ops
Index: string.ops
===================================================================
RCS file: /cvs/public/parrot/ops/string.ops,v
retrieving revision 1.32
retrieving revision 1.33
diff -u -r1.32 -r1.33
--- string.ops 28 Feb 2005 13:35:45 -0000 1.32
+++ string.ops 28 Feb 2005 17:17:55 -0000 1.33
@@ -655,6 +655,54 @@
goto NEXT();
}
+=item B<is_whitespace>(out INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of string $2 at offset $3 is whitespace.
+
+=item B<is_wordchar>(out INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of string $2 at offset $3 is a wordchar.
+
+=item B<is_digit>(out INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of string $2 at offset $3 is a digit.
+
+=item B<is_punctuation>(out INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of string $2 at offset $3 is a punctuation char.
+
+=item B<is_newline>(out INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of string $2 at offset $3 is a newline char.
+
+=cut
+
+op is_whitespace(out INT, in STR, in INT) {
+ $1 = Parrot_string_is_whitespace(interpreter, $2, $3);
+ goto NEXT();
+}
+
+op is_wordchar(out INT, in STR, in INT) {
+ $1 = Parrot_string_is_wordchar(interpreter, $2, $3);
+ goto NEXT();
+}
+
+op is_digit(out INT, in STR, in INT) {
+ $1 = Parrot_string_is_digit(interpreter, $2, $3);
+ goto NEXT();
+}
+
+op is_punctuation(out INT, in STR, in INT) {
+ $1 = Parrot_string_is_punctuation(interpreter, $2, $3);
+ goto NEXT();
+}
+
+op is_newline(out INT, in STR, in INT) {
+ $1 = Parrot_string_is_newline(interpreter, $2, $3);
+ goto NEXT();
+}
+
+
=back
=head1 COPYRIGHT
1.236 +41 -1 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.235
retrieving revision 1.236
diff -u -r1.235 -r1.236
--- string.c 28 Feb 2005 13:35:46 -0000 1.235
+++ string.c 28 Feb 2005 17:17:56 -0000 1.236
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.235 2005/02/28 13:35:46 leo Exp $
+$Id: string.c,v 1.236 2005/02/28 17:17:56 leo Exp $
=head1 NAME
@@ -2503,6 +2503,46 @@
return NULL;
}
+INTVAL
+Parrot_string_is_whitespace(Interp *interpreter, STRING *s, INTVAL offset)
+{
+ if (!s)
+ return 0;
+ return CHARSET_IS_WHITESPACE(interpreter, s, offset);
+}
+
+INTVAL
+Parrot_string_is_digit(Interp *interpreter, STRING *s, INTVAL offset)
+{
+ if (!s)
+ return 0;
+ return CHARSET_IS_DIGIT(interpreter, s, offset);
+}
+
+INTVAL
+Parrot_string_is_wordchar(Interp *interpreter, STRING *s, INTVAL offset)
+{
+ if (!s)
+ return 0;
+ return CHARSET_IS_WORDCHAR(interpreter, s, offset);
+}
+
+INTVAL
+Parrot_string_is_punctuation(Interp *interpreter, STRING *s, INTVAL offset)
+{
+ if (!s)
+ return 0;
+ return CHARSET_IS_PUNCTUATION(interpreter, s, offset);
+}
+
+INTVAL
+Parrot_string_is_newline(Interp *interpreter, STRING *s, INTVAL offset)
+{
+ if (!s)
+ return 0;
+ return CHARSET_IS_NEWLINE(interpreter, s, offset);
+}
+
/*
=back
1.4 +90 -2 parrot/t/op/string_cs.t
Index: string_cs.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string_cs.t,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- string_cs.t 28 Feb 2005 15:10:58 -0000 1.3
+++ string_cs.t 28 Feb 2005 17:17:57 -0000 1.4
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: string_cs.t,v 1.3 2005/02/28 15:10:58 leo Exp $
+# $Id: string_cs.t,v 1.4 2005/02/28 17:17:57 leo Exp $
=head1 NAME
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 7;
+use Parrot::Test tests => 12;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -92,3 +92,91 @@
Zaeiou_���
OUTPUT
+output_is( <<'CODE', <<OUTPUT, "is_whitespace");
+ set S0, "a\t\n \xa0"
+ is_whitespace I0, S0, 0
+ is_whitespace I1, S0, 1
+ is_whitespace I2, S0, 2
+ is_whitespace I3, S0, 3
+ set I4, 4
+ is_whitespace I4, S0, I4
+ print I0
+ print I1
+ print I2
+ print I3
+ print I4
+ print "\n"
+ set S0, ascii:"a\t\n "
+ is_whitespace I0, S0, 0
+ is_whitespace I1, S0, 1
+ is_whitespace I2, S0, 2
+ is_whitespace I3, S0, 3
+ print I0
+ print I1
+ print I2
+ print I3
+ print "\n"
+ end
+CODE
+01111
+0111
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "is_wordchar");
+ set S0, "az019-,._"
+ length I1, S0
+ set I2, 0
+lp:
+ is_wordchar I0, S0, I2
+ print I0
+ inc I2
+ lt I2, I1, lp
+ print "\n"
+ end
+CODE
+111110001
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "is_digit");
+ set S0, "az019-,._"
+ length I1, S0
+ set I2, 0
+lp:
+ is_digit I0, S0, I2
+ print I0
+ inc I2
+ lt I2, I1, lp
+ print "\n"
+ end
+CODE
+001110000
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "is_punctuation");
+ set S0, "az019-,._"
+ length I1, S0
+ set I2, 0
+lp:
+ is_punctuation I0, S0, I2
+ print I0
+ inc I2
+ lt I2, I1, lp
+ print "\n"
+ end
+CODE
+000001110
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "is_newline");
+ set S0, "a\n"
+ is_newline I0, S0, 0
+ print I0
+ is_newline I0, S0, 1
+ print I0
+ print "\n"
+ end
+CODE
+01
+OUTPUT
+
+