Author: jrieks
Date: Sun May 8 14:24:53 2005
New Revision: 8014
Added:
trunk/charset/tables.c
trunk/charset/tables.h
trunk/include/parrot/cclass.h
trunk/t/op/string_cclass.t
Modified:
trunk/MANIFEST
trunk/charset/ascii.c
trunk/charset/ascii.h
trunk/charset/binary.c
trunk/charset/iso-8859-1.c
trunk/charset/unicode.c
trunk/config/gen/parrot_include.pl
trunk/include/parrot/charset.h
trunk/include/parrot/string_funcs.h
trunk/ops/experimental.ops
trunk/src/string.c
trunk/t/op/string_cs.t
Log:
character classification, part 1:
- added include/parrot/cclass.h
- added PARROT_CCLASS_FLAGS enum
- added is_cclass (working), find_cclass (noop), find_not_cclass (noop) to
experimental.ops
- removed old charset tables
- added new, automatically generated character classification tables
- is_punctuation now treats '_' as a punctuation character (modified a test)
- find_wordchar also treats '_' as a word character (modified a test)
- modified a test to also test access beyond string end
Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST (original)
+++ trunk/MANIFEST Sun May 8 14:24:53 2005
@@ -44,8 +44,11 @@
charset/ascii.h []
charset/binary.c []
charset/binary.h []
+charset/gen_tables.pl [devel]
charset/iso-8859-1.c []
charset/iso-8859-1.h []
+charset/tables.c []
+charset/tables.h []
charset/unicode.c []
charset/unicode.h []
classes/array.pmc []
@@ -708,6 +711,7 @@
include/parrot/autoprefix.h [devel]include
include/parrot/builtin.h [devel]include
include/parrot/caches.h [devel]include
+include/parrot/cclass.h [devel]include
include/parrot/charset.h [devel]include
include/parrot/datatypes.h [devel]include
include/parrot/debug.h [devel]include
@@ -1716,6 +1720,7 @@
t/op/spawnw.t []
t/op/stacks.t []
t/op/string.t []
+t/op/string_cclass.t []
t/op/string_cs.t []
t/op/stringu.t []
t/op/time.t []
Modified: trunk/charset/ascii.c
==============================================================================
--- trunk/charset/ascii.c (original)
+++ trunk/charset/ascii.c Sun May 8 14:24:53 2005
@@ -30,33 +30,11 @@
#define EXCEPTION(err, str) \
real_exception(interpreter, NULL, err, str)
-#define WHITESPACE 1
-#define WORDCHAR 2
-#define PUNCTUATION 4
-#define DIGIT 8
-
-static const unsigned char typetable[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
- 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
- 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4, /*48.*/
- 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
- 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160-175 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176-191 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192-207 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208-223 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224-239 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240-255 */
-};
+#include "tables.h"
INTVAL
ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
- unsigned char type, const unsigned char *table)
+ PARROT_CCLASS_FLAGS type, const PARROT_CCLASS_FLAGS *table)
{
for (; start < string->strlen; start++) {
if (table[ENCODING_GET_BYTE(interpreter, string, start)] & type) {
@@ -68,7 +46,7 @@
INTVAL
ascii_find_not_thing(Interp *interpreter, STRING *string, UINTVAL start,
- unsigned char type, const unsigned char *table)
+ PARROT_CCLASS_FLAGS type, const PARROT_CCLASS_FLAGS *table)
{
for (; start < string->strlen; start++) {
if (!(table[ENCODING_GET_BYTE(interpreter, string, start)] & type)) {
@@ -413,21 +391,21 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
+ return (Parrot_ascii_typetable[codepoint] & WORDCHAR) ? 1 : 0;
}
static INTVAL
find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset, WORDCHAR,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
@@ -435,14 +413,14 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == WHITESPACE);
+ return (Parrot_ascii_typetable[codepoint] & WHITESPACE) == WHITESPACE;
}
static INTVAL
find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, WHITESPACE,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
@@ -450,7 +428,7 @@
UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset,
- WHITESPACE, typetable);
+ WHITESPACE, Parrot_ascii_typetable);
}
static INTVAL
@@ -458,21 +436,21 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] & DIGIT) ? 1 : 0;
+ return (Parrot_ascii_typetable[codepoint] & DIGIT) == DIGIT;
}
static INTVAL
find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, DIGIT,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset, DIGIT,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
@@ -480,14 +458,14 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == PUNCTUATION);
+ return (Parrot_ascii_typetable[codepoint] & PUNCTUATION) == PUNCTUATION;
}
static INTVAL
find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, PUNCTUATION,
- typetable);
+ Parrot_ascii_typetable);
}
static INTVAL
@@ -495,7 +473,7 @@
UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset,
- PUNCTUATION, typetable);
+ PUNCTUATION, Parrot_ascii_typetable);
}
INTVAL
@@ -531,7 +509,7 @@
INTVAL
ascii_find_word_boundary(Interp *interpreter, STRING *string,
- UINTVAL offset, const unsigned char *table)
+ UINTVAL offset, const PARROT_CCLASS_FLAGS *table)
{
UINTVAL c, len;
int is_wc1, is_wc2;
@@ -561,7 +539,7 @@
find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_word_boundary(interpreter, source_string,
- offset, typetable);
+ offset, Parrot_ascii_typetable);
}
static STRING *
@@ -573,6 +551,32 @@
return return_string;
}
+static INTVAL
+is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset)
+{
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+
+ if (codepoint >= sizeof(Parrot_ascii_typetable) /
sizeof(Parrot_ascii_typetable[0])) {
+ return 0;
+ }
+ return (Parrot_ascii_typetable[codepoint] & flags) ? 1 : 0;
+}
+
+static INTVAL
+find_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ real_exception(interpreter, NULL, UNIMPLEMENTED, "unimplemented
ascii:find_cclass");
+ return -1;
+}
+
+static INTVAL
+find_not_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ real_exception(interpreter, NULL, UNIMPLEMENTED, "unimplemented
ascii:find_not_cclass");
+ return -1;
+}
+
/*
* TODO pass in the Hash's seed value as initial hashval
*/
@@ -617,6 +621,9 @@
ascii_cs_index,
ascii_cs_rindex,
validate,
+ is_cclass,
+ find_cclass,
+ find_not_cclass,
is_wordchar,
find_wordchar,
find_not_wordchar,
Modified: trunk/charset/ascii.h
==============================================================================
--- trunk/charset/ascii.h (original)
+++ trunk/charset/ascii.h Sun May 8 14:24:53 2005
@@ -19,10 +19,10 @@
INTVAL
ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
- unsigned char type, const unsigned char *table);
+ PARROT_CCLASS_FLAGS type, const PARROT_CCLASS_FLAGS *table);
INTVAL
ascii_find_not_thing(Interp *interpreter, STRING *string, UINTVAL start,
- unsigned char type, const unsigned char *table);
+ PARROT_CCLASS_FLAGS type, const PARROT_CCLASS_FLAGS *table);
STRING *ascii_get_graphemes(Interp *, STRING *source_string,
UINTVAL offset, UINTVAL count);
STRING *ascii_get_graphemes_inplace(Interp *, STRING *source_string,
@@ -31,7 +31,7 @@
INTVAL ascii_find_newline(Interp *, STRING *source_string, UINTVAL offset);
INTVAL ascii_find_not_newline(Interp *, STRING *source_string, UINTVAL offset);
INTVAL ascii_find_word_boundary(Interp *, STRING *source_string,
- UINTVAL offset, const unsigned char *typetable);
+ UINTVAL offset, const PARROT_CCLASS_FLAGS *typetable);
INTVAL ascii_compare(Interp *, STRING *lhs, STRING *rhs);
INTVAL ascii_compare(Interp *, STRING *lhs, STRING *rhs);
INTVAL ascii_cs_index(Interp *, STRING *source_string,
Modified: trunk/charset/binary.c
==============================================================================
--- trunk/charset/binary.c (original)
+++ trunk/charset/binary.c Sun May 8 14:24:53 2005
@@ -233,6 +233,24 @@
return -1;
}
+static INTVAL
+is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset)
+{
+ return 0;
+}
+
+static INTVAL
+find_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ return -1;
+}
+
+static INTVAL
+find_not_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ return -1;
+}
+
static STRING *
string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
{
@@ -268,6 +286,9 @@
cs_index,
cs_rindex,
validate,
+ is_cclass,
+ find_cclass,
+ find_not_cclass,
is_wordchar,
find_wordchar,
find_not_wordchar,
Modified: trunk/charset/iso-8859-1.c
==============================================================================
--- trunk/charset/iso-8859-1.c (original)
+++ trunk/charset/iso-8859-1.c Sun May 8 14:24:53 2005
@@ -29,31 +29,7 @@
#define EXCEPTION(err, str) \
real_exception(interpreter, NULL, err, str)
-#define WHITESPACE 1
-#define WORDCHAR 2
-#define PUNCTUATION 4
-#define DIGIT 8
-
-static const unsigned char typetable[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
- 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
- 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4, /* 48
*/
- 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
- 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
- 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 160-175 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, /* 176-191 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
- 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
- 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 240-255 */
-};
-
-
+#include "tables.h"
static void
set_graphemes(Interp *interpreter, STRING *source_string,
@@ -244,21 +220,21 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
+ return (Parrot_iso_8859_1_typetable[codepoint] & WORDCHAR) == WORDCHAR;
}
static INTVAL
find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset, WORDCHAR,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -266,14 +242,14 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == WHITESPACE);
+ return (Parrot_iso_8859_1_typetable[codepoint] & WHITESPACE) == WHITESPACE;
}
static INTVAL
find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, WHITESPACE,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -281,7 +257,7 @@
UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset,
- WHITESPACE, typetable);
+ WHITESPACE, Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -289,21 +265,21 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] & DIGIT) ? 1 : 0;
+ return (Parrot_iso_8859_1_typetable[codepoint] & DIGIT) == DIGIT;
}
static INTVAL
find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, DIGIT,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset, DIGIT,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -311,14 +287,14 @@
{
UINTVAL codepoint;
codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
- return (typetable[codepoint] == PUNCTUATION);
+ return (Parrot_iso_8859_1_typetable[codepoint] & PUNCTUATION) ==
PUNCTUATION;
}
static INTVAL
find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_thing(interpreter, source_string, offset, PUNCTUATION,
- typetable);
+ Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -326,7 +302,7 @@
UINTVAL offset)
{
return ascii_find_not_thing(interpreter, source_string, offset,
- PUNCTUATION, typetable);
+ PUNCTUATION, Parrot_iso_8859_1_typetable);
}
static INTVAL
@@ -339,9 +315,34 @@
find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
return ascii_find_word_boundary(interpreter, source_string,
- offset, typetable);
+ offset, Parrot_iso_8859_1_typetable);
+}
+
+static INTVAL
+is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset)
+{
+ UINTVAL codepoint;
+ codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
+
+ if (codepoint >= sizeof(Parrot_ascii_typetable) /
sizeof(Parrot_ascii_typetable[0])) {
+ return 0;
+ }
+ return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
}
+static INTVAL
+find_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ return -1;
+}
+
+static INTVAL
+find_not_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ return -1;
+}
+
+
static STRING *
string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
{
@@ -377,6 +378,9 @@
ascii_cs_index,
ascii_cs_rindex,
validate,
+ is_cclass,
+ find_cclass,
+ find_not_cclass,
is_wordchar,
find_wordchar,
find_not_wordchar,
Added: trunk/charset/tables.c
==============================================================================
--- (empty file)
+++ trunk/charset/tables.c Sun May 8 14:24:53 2005
@@ -0,0 +1,84 @@
+/* $id $
+ * Copyright: 2005 The Perl Foundation. All Rights Reserved.
+ *
+ * DO NOT EDIT THIS FILE DIRECTLY!
+ * please update the charset/gen_tables.pl script instead.
+ *
+ * Created by gen_tables.pl jrieks
+ * Overview:
+ * This file contains various charset tables.
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#include "tables.h"
+const PARROT_CCLASS_FLAGS Parrot_ascii_typetable[256] = {
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 0-7 */
+0x0200, 0x0320, 0x1220, 0x0220, 0x0220, 0x1220, 0x0200, 0x0200, /* 8-15 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 16-23 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 24-31 */
+0x0160, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 32-39 */
+0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 40-47 */
+0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, /* 48-55 */
+0x28d8, 0x28d8, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 56-63 */
+0x04c0, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28c5, /* 64-71 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 72-79 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 80-87 */
+0x28c5, 0x28c5, 0x28c5, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x24c0, /* 88-95 */
+0x04c0, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28c6, /* 96-103 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
+0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 128-135 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 136-143 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 144-151 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 152-159 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 168-175 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 176-183 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 184-191 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 192-199 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 200-207 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 208-215 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 216-223 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 224-231 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 232-239 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 240-247 */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 248-255 */
+};
+const PARROT_CCLASS_FLAGS Parrot_iso_8859_1_typetable[256] = {
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 0-7 */
+0x0200, 0x0320, 0x1220, 0x0220, 0x0220, 0x1220, 0x0200, 0x0200, /* 8-15 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 16-23 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 24-31 */
+0x0160, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 32-39 */
+0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 40-47 */
+0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, /* 48-55 */
+0x28d8, 0x28d8, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 56-63 */
+0x04c0, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28c5, /* 64-71 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 72-79 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 80-87 */
+0x28c5, 0x28c5, 0x28c5, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x24c0, /* 88-95 */
+0x04c0, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28c6, /* 96-103 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
+0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 128-135 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 136-143 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 144-151 */
+0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 152-159 */
+0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
+0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 168-175 */
+0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x28c6, 0x04c0, 0x04c0, /* 176-183 */
+0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 184-191 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 192-199 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 200-207 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x04c0, /* 208-215 */
+0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c6, /* 216-223 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 224-231 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 232-239 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x04c0, /* 240-247 */
+0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 248-255 */
+};
Added: trunk/charset/tables.h
==============================================================================
--- (empty file)
+++ trunk/charset/tables.h Sun May 8 14:24:53 2005
@@ -0,0 +1,25 @@
+/* $id $
+ * Copyright: 2005 The Perl Foundation. All Rights Reserved.
+ *
+ * DO NOT EDIT THIS FILE DIRECTLY!
+ * please update the charset/gen_tables.pl script instead.
+ *
+ * Created by gen_tables.pl jrieks
+ * Overview:
+ * This file contains various charset tables.
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#if !defined(PARROT_CHARSET_TABLES_H_GUARD)
+#define PARROT_CHARSET_TABLES_H_GUARD
+#include "parrot/cclass.h"
+#define WHITESPACE enum_cclass_whitespace
+#define WORDCHAR enum_cclass_word
+#define PUNCTUATION enum_cclass_punctuation
+#define DIGIT enum_cclass_numeric
+extern const PARROT_CCLASS_FLAGS Parrot_ascii_typetable[256];
+extern const PARROT_CCLASS_FLAGS Parrot_iso_8859_1_typetable[256];
+#endif /* PARROT_CHARSET_TABLES_H_GUARD */
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Sun May 8 14:24:53 2005
@@ -277,6 +277,27 @@
return 0;
}
+static INTVAL
+is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset)
+{
+ UNIMPL;
+ return 0;
+}
+
+static INTVAL
+find_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ UNIMPL;
+ return -1;
+}
+
+static INTVAL
+find_not_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*source_string, UINTVAL offset, UINTVAL count)
+{
+ UNIMPL;
+ return -1;
+}
+
static STRING *
string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
{
@@ -333,6 +354,9 @@
mixed_cs_index,
cs_rindex,
validate,
+ is_cclass,
+ find_cclass,
+ find_not_cclass,
is_wordchar,
find_wordchar,
find_not_wordchar,
Modified: trunk/config/gen/parrot_include.pl
==============================================================================
--- trunk/config/gen/parrot_include.pl (original)
+++ trunk/config/gen/parrot_include.pl Sun May 8 14:24:53 2005
@@ -20,6 +20,7 @@
$description="Generating runtime/parrot/include...";
my @files = qw(
+ include/parrot/cclass.h
include/parrot/core_pmcs.h
include/parrot/datatypes.h
include/parrot/enums.h
Added: trunk/include/parrot/cclass.h
==============================================================================
--- (empty file)
+++ trunk/include/parrot/cclass.h Sun May 8 14:24:53 2005
@@ -0,0 +1,32 @@
+/* cclass.h
+*
+* $Id: cclass.h jrieks $
+*
+* Parrot character classes
+*/
+
+#if !defined(PARROT_CCLASS_H_GUARD)
+#define PARROT_CCLASS_H_GUARD
+
+/* &gen_from_enum(cclass.pasm) subst(s/enum_cclass_(\w+)/uc("CCLASS_$1")/e) */
+typedef enum { /* ASCII characters matching this
class: */
+enum_cclass_any = 0x0000, /* all */
+enum_cclass_none = 0xffff, /* none */
+enum_cclass_uppercase = 0x0001, /* A-Z */
+enum_cclass_lowercase = 0x0002, /* a-z */
+enum_cclass_alphabetic = 0x0004, /* a-z, A-Z */
+enum_cclass_numeric = 0x0008, /* 0-9 */
+enum_cclass_hexadecimal = 0x0010, /* 0-9, a-f, A-F */
+enum_cclass_whitespace = 0x0020, /* ' ', '\f', '\n', '\r', '\t', '\v' */
+enum_cclass_printing = 0x0040, /* any printable character including
space */
+enum_cclass_graphical = 0x0080, /* any printable character
except space */
+enum_cclass_blank = 0x0100, /* ' ', '\t' */
+enum_cclass_control = 0x0200, /* control characters */
+enum_cclass_punctuation = 0x0400, /* all except ' ', a-z, A-Z, 0-9 */
+enum_cclass_alphanumeric = 0x0800, /* a-z, A-Z, 0-9 */
+enum_cclass_newline = 0x1000, /* '\n', '\r' */
+enum_cclass_word = 0x2000, /* a-z, A-Z, 0-9, '_'*/
+} PARROT_CCLASS_FLAGS;
+/* &end_gen */
+
+#endif /* PARROT_CCLASS_H_GUARD */
Modified: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h (original)
+++ trunk/include/parrot/charset.h Sun May 8 14:24:53 2005
@@ -15,6 +15,7 @@
#include "parrot/encoding.h"
+#include "parrot/cclass.h"
struct _charset;
typedef struct _charset CHARSET;
@@ -55,6 +56,9 @@
typedef INTVAL (*charset_index_t)(Interp *, STRING *source_string, STRING
*search_string, UINTVAL offset);
typedef INTVAL (*charset_rindex_t)(Interp *, STRING *source_string, STRING
*search_string, UINTVAL offset);
typedef UINTVAL (*charset_validate_t)(Interp *, STRING *source_string);
+typedef INTVAL (*charset_is_cclass_t)(Interp *, PARROT_CCLASS_FLAGS, STRING
*source_string, UINTVAL offset);
+typedef INTVAL (*charset_find_cclass_t)(Interp *, PARROT_CCLASS_FLAGS, STRING
*source_string, UINTVAL offset, UINTVAL count);
+typedef INTVAL (*charset_find_not_cclass_t)(Interp *, PARROT_CCLASS_FLAGS,
STRING *source_string, UINTVAL offset, UINTVAL count);
typedef INTVAL (*charset_is_wordchar_t)(Interp *, STRING *source_string,
UINTVAL offset);
typedef INTVAL (*charset_find_wordchar_t)(Interp *, STRING *source_string,
UINTVAL offset);
typedef INTVAL (*charset_find_not_wordchar_t)(Interp *, STRING *source_string,
UINTVAL offset);
@@ -115,6 +119,9 @@
charset_index_t index;
charset_rindex_t rindex;
charset_validate_t validate;
+ charset_is_cclass_t is_cclass;
+ charset_find_cclass_t find_cclass;
+ charset_find_not_cclass_t find_not_cclass;
charset_is_wordchar_t is_wordchar;
charset_find_wordchar_t find_wordchar;
charset_find_not_wordchar_t find_not_wordchar;
@@ -153,6 +160,9 @@
#define CHARSET_INDEX(interp, source, search, offset) ((CHARSET
*)source->charset)->index(interpreter, source, search, offset)
#define CHARSET_RINDEX(interp, source, search, offset) ((CHARSET
*)source->charset)->rindex(interpreter, source, search, offset)
#define CHARSET_VALIDATE(interp, source, offset) ((CHARSET
*)source->charset)->validate(interpreter, source)
+#define CHARSET_IS_CCLASS(interp, flags, source, offset) ((CHARSET
*)source->charset)->is_cclass(interpreter, flags, source, offset)
+#define CHARSET_FIND_CCLASS(interp, flags, source, offset, count) ((CHARSET
*)source->charset)->find_cclass(interpreter, flags, source, offset, count)
+#define CHARSET_FIND_NOT_CCLASS(interp, flags, source, offset, count)
((CHARSET *)source->charset)->find_not_cclass(interpreter, flags, source,
offset, count)
#define CHARSET_IS_WORDCHAR(interp, source, offset) ((CHARSET
*)source->charset)->is_wordchar(interpreter, source, offset)
#define CHARSET_FIND_WORDCHAR(interp, source, offset) ((CHARSET
*)source->charset)->find_wordchar(interpreter, source, offset)
#define CHARSET_FIND_NOT_WORDCHAR(interp, source, offset) ((CHARSET
*)source->charset)->find_not_wordchar(interpreter, source, offset)
Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Sun May 8 14:24:53 2005
@@ -102,6 +102,9 @@
void string_downcase_inplace(Interp *, STRING *);
void string_titlecase_inplace(Interp *, STRING *);
+INTVAL Parrot_string_is_cclass(Interp *, PARROT_CCLASS_FLAGS, STRING *,
UINTVAL offset);
+INTVAL Parrot_string_find_cclass(Interp *, PARROT_CCLASS_FLAGS, STRING *,
UINTVAL offset, UINTVAL count);
+INTVAL Parrot_string_find_not_cclass(Interp *, PARROT_CCLASS_FLAGS, STRING *,
UINTVAL offset, UINTVAL count);
INTVAL Parrot_string_is_whitespace(Interp *, STRING *, INTVAL offset);
INTVAL Parrot_string_is_digit(Interp *, STRING *, INTVAL offset);
INTVAL Parrot_string_is_wordchar(Interp *, STRING *, INTVAL offset);
Modified: trunk/ops/experimental.ops
==============================================================================
--- trunk/ops/experimental.ops (original)
+++ trunk/ops/experimental.ops Sun May 8 14:24:53 2005
@@ -246,6 +246,45 @@
goto NEXT();
}
+=item B<is_cclass>(out INT, in INT, in STR, in INT)
+
+Set $1 to 1 if the codepoint of $3 at position $4 is in
+the character class(es) given by $2.
+
+=cut
+
+inline op is_cclass(out INT, in INT, in STR, in INT) {
+ $1 = Parrot_string_is_cclass(interpreter, $2, $3, $4);
+ goto NEXT();
+}
+
+=item B<find_cclass>(out INT, in INT, in STR, in INT, in INT)
+
+Set $1 to the offset of the first codepoint matching
+the character class(es) given by $2 in string $3, starting
+at offset $4 for up to $5 codepoints. If no matching
+character is found, set $1 to -1.
+
+=cut
+
+inline op find_cclass(out INT, in INT, in STR, in INT, in INT) {
+ $1 = Parrot_string_find_cclass(interpreter, $2, $3, $4, $5);
+ goto NEXT();
+}
+
+=item B<find_not_cclass>(out INT, in INT, in STR, in INT, in INT)
+
+Set $1 to the offset of the first codepoint not matching
+the character class(es) given by $2 in string $3, starting
+at offset $4 for up to $5 codepoints. If the substring
+consists entirely of matching characters, set $1 to -1.
+
+=cut
+
+inline op find_not_cclass(out INT, in INT, in STR, in INT, in INT) {
+ $1 = Parrot_string_find_not_cclass(interpreter, $2, $3, $4, $5);
+ goto NEXT();
+}
=back
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Sun May 8 14:24:53 2005
@@ -2645,6 +2645,30 @@
return CHARSET_FIND_WORD_BOUNDARY(interpreter, s, offset);
}
+INTVAL
+Parrot_string_is_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags, STRING
*s, UINTVAL offset)
+{
+ if (!s)
+ return -1;
+ return CHARSET_IS_CCLASS(interpreter, flags, s, offset);
+}
+
+INTVAL
+Parrot_string_find_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags,
STRING *s, UINTVAL offset, UINTVAL count)
+{
+ if (!s)
+ return -1;
+ return CHARSET_FIND_CCLASS(interpreter, flags, s, offset, count);
+}
+
+INTVAL
+Parrot_string_find_not_cclass(Interp *interpreter, PARROT_CCLASS_FLAGS flags,
STRING *s, UINTVAL offset, UINTVAL count)
+{
+ if (!s)
+ return -1;
+ return CHARSET_FIND_NOT_CCLASS(interpreter, flags, s, offset, count);
+}
+
STRING*
Parrot_string_trans_charset(Interp *interpreter, STRING *src,
INTVAL charset_nr, STRING *dest)
Added: trunk/t/op/string_cclass.t
==============================================================================
--- (empty file)
+++ trunk/t/op/string_cclass.t Sun May 8 14:24:53 2005
@@ -0,0 +1,100 @@
+#! perl -w
+# Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
+# $Id: cclass.t jrieks $
+
+=head1 NAME
+
+t/op/string_cclass.t - character class tests
+
+=head1 SYNOPSIS
+
+ % perl -Ilib t/op/string_cclass.t
+
+=head1 DESCRIPTION
+
+Tests find_cclass, find_not_cclass, is_cclass.
+
+=cut
+
+use strict;
+
+use Parrot::Test tests => 1;
+
+pir_output_is(<<'CODE', <<'OUT', "is_cclass");
+.include "cclass.pasm"
+.sub main @MAIN
+ $S1 = ascii:"ab\nCX34.\0 \t!"
+ test1( $S1 )
+ $S1 = iso-8859-1:"ab\nCX34.\0 \t!"
+ test1( $S1 )
+.end
+.sub test1
+ .param string str
+ test2( str, .CCLASS_UPPERCASE)
+ test2( str, .CCLASS_LOWERCASE)
+ test2( str, .CCLASS_ALPHABETIC)
+ test2( str, .CCLASS_NUMERIC)
+ test2( str, .CCLASS_HEXADECIMAL)
+ test2( str, .CCLASS_WHITESPACE)
+ test2( str, .CCLASS_PRINTING)
+ test2( str, .CCLASS_GRAPHICAL)
+ test2( str, .CCLASS_BLANK)
+ test2( str, .CCLASS_CONTROL)
+ test2( str, .CCLASS_PUNCTUATION)
+ test2( str, .CCLASS_ALPHANUMERIC)
+ test2( str, .CCLASS_NEWLINE)
+
+ $I0 = .CCLASS_NEWLINE|.CCLASS_WHITESPACE
+ test2( str, $I0)
+ $I0 = .CCLASS_WHITESPACE|.CCLASS_LOWERCASE
+ test2( str, $I0)
+ $I0 = .CCLASS_UPPERCASE|.CCLASS_PUNCTUATION
+ test2( str, $I0)
+.end
+.sub test2
+ .param string str
+ .param int code
+
+ $I1 = length str
+ set $I0, 0
+loop:
+ $I2 = is_cclass code, str, $I0
+ print $I2
+ inc $I0
+ if $I0 < $I1 goto loop
+ print "\n"
+.end
+CODE
+000110000000
+110000000000
+110110000000
+000001100000
+110101100000
+001000000110
+110111110101
+110111110001
+000000000110
+001000001010
+000000010001
+110111100000
+001000000000
+001000000110
+111000000110
+000110010001
+000110000000
+110000000000
+110110000000
+000001100000
+110101100000
+001000000110
+110111110101
+110111110001
+000000000110
+001000001010
+000000010001
+110111100000
+001000000000
+001000000110
+111000000110
+000110010001
+OUT
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Sun May 8 14:24:53 2005
@@ -93,7 +93,7 @@
OUTPUT
output_is( <<'CODE', <<OUTPUT, "is_whitespace");
- set S0, iso-8859-1:"a\t\n \xa0"
+ set S0, iso-8859-1:"a\t\n \xa0" # is 0xa0 a whitespace in iso-8859-1??
is_whitespace I0, S0, 0
is_whitespace I1, S0, 1
is_whitespace I2, S0, 2
@@ -111,15 +111,17 @@
is_whitespace I1, S0, 1
is_whitespace I2, S0, 2
is_whitespace I3, S0, 3
+ is_whitespace I4, S0, 4 # access past string boundary: not a whitespace
print I0
print I1
print I2
print I3
+ print I4
print "\n"
end
CODE
-01111
-0111
+01110
+01110
OUTPUT
output_is( <<'CODE', <<OUTPUT, "is_wordchar");
@@ -164,7 +166,7 @@
print "\n"
end
CODE
-000001110
+000001111
OUTPUT
output_is( <<'CODE', <<OUTPUT, "is_newline");
@@ -226,7 +228,7 @@
print "ok\n"
end
CODE
-2 5 -1 ok
+0 2 5 -1 ok
OUTPUT
output_is( <<'CODE', <<OUTPUT, "find_word_boundary");