Pretty simple. Mucks with the internals of regexes to use bitmaps in
rx_oneof, rx_is_w, rx_is_s, rx_dot, and rx_zwa_boundary. Also adds
three new ops:
-rx_clearinfo: clears out an info structure (useful for speeding up
more than one regex call).
-rx_makebmp: prebuilds a bitmap.
-rx_oneof_bmp: uses a prebuilt bitmap.
This also recognizes that test 14 of rx.t no longer fails under
Win32--it revokes its TODO status.
This patch is NOT bytecode-compatible with the last version of regexes.
--Brent Dax
[EMAIL PROTECTED]
Parrot Configure pumpking and regex hacker
<obra> mmmm. hawt sysadmin chx0rs
<lathos> This is sad. I know of *a* hawt sysamin chx0r.
<obra> I know more than a few.
<lathos> obra: There are two? Are you sure it's not the same one?
diff -ur ..\..\parrot-cvs\parrot/include/parrot/rx.h ../include/parrot/rx.h
--- ..\..\parrot-cvs\parrot/include/parrot/rx.h Mon Jan 14 01:19:30 2002
+++ ./include/parrot/rx.h Mon Jan 14 01:23:22 2002
@@ -17,6 +17,8 @@
#include "parrot/parrot.h"
+typedef char* Bitmap;
+
typedef enum rxflags {
enum_rxflags_none=0,
enum_rxflags_case_insensitive=1,
@@ -31,6 +33,11 @@
} rxdirection;
extern const INTVAL RX_MARK;
+extern const char * RX_WORDCHARS;
+extern const char * RX_NUMCHARS;
+extern const char * RX_SPACECHARS;
+
+#define cstr2pstr(cstr) string_make(interpreter, cstr, strlen(cstr), 0, 0, 0)
typedef struct rxinfo {
STRING *string;
@@ -52,9 +59,12 @@
rxinfo * rx_allocate_info(struct Parrot_Interp *, STRING *);
-BOOLVAL rx_is_word_character(char ch);
-BOOLVAL rx_is_number_character(char ch);
-BOOLVAL rx_is_whitespace_character(char ch);
+BOOLVAL rx_is_word_character(struct Parrot_Interp *, INTVAL ch);
+BOOLVAL rx_is_number_character(struct Parrot_Interp *, INTVAL ch);
+BOOLVAL rx_is_whitespace_character(struct Parrot_Interp *, INTVAL ch);
+
+Bitmap make_bitmap(STRING* str);
+BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp);
STRING *rxP_get_substr(struct Parrot_Interp *, STRING *, INTVAL, INTVAL);
diff -ur ..\..\parrot-cvs\parrot/rx.c ./rx.c
--- ..\..\parrot-cvs\parrot/rx.c Thu Jan 10 16:04:54 2002
+++ ./rx.c Mon Jan 14 01:21:46 2002
@@ -15,6 +15,9 @@
#include "parrot/rx.h"
const INTVAL RX_MARK=-1;
+const char *
+RX_WORDCHARS="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
+const char * RX_NUMCHARS="0123456789";
+const char * RX_SPACECHARS="\r\n\t ";
/*************************************************************
* Parrot Regular Expression Engine, version 3.0 alpha (Rx3) *
@@ -39,21 +42,19 @@
return rx;
}
-BOOLVAL rx_is_word_character(char ch) {
- if(
- (ch >= 'a' && ch <= 'z') ||
- (ch >= 'A' && ch <= 'Z') ||
- (ch >= '0' && ch <= '9') ||
- ch == '_'
- ) {
- return 1;
- }
- else {
- return 0;
+BOOLVAL rx_is_word_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+ static Bitmap bmp=NULL;
+
+ if(!bmp) {
+ bmp=make_bitmap(cstr2pstr(RX_WORDCHARS));
}
+
+ return check_bitmap(ch, bmp);
}
-BOOLVAL rx_is_number_character(char ch) {
+BOOLVAL rx_is_number_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+ /* faster to do less-than/greater-than */
+
if(ch >= '0' && ch <= '9') {
return 1;
}
@@ -62,13 +63,14 @@
}
}
-BOOLVAL rx_is_whitespace_character(char ch) {
- if(ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ') {
- return 1;
- }
- else {
- return 0;
+BOOLVAL rx_is_whitespace_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+ static Bitmap bmp=NULL;
+
+ if(!bmp) {
+ bmp=make_bitmap(cstr2pstr(RX_SPACECHARS));
}
+
+ return check_bitmap(ch, bmp);
}
STRING *rxP_get_substr(struct Parrot_Interp *interpreter, STRING * source, INTVAL
startindex, INTVAL length) {
@@ -82,3 +84,33 @@
return ret;
}
+
+Bitmap make_bitmap(STRING* str) {
+ INTVAL i, ch;
+ Bitmap bmp=mem_sys_allocate(32);
+
+ /* XXX Bitmaps currently do not support chars > 255 --
+ the memory space needed grows _very_ fast */
+
+ for(i=0; i < 32; i++) {
+ bmp[i]=0;
+ }
+
+ for(i=0; i < string_length(str); i++) {
+ ch=string_ord(str, i);
+
+ if(ch > 255) return NULL; /* Can't do it--caller must figure out
+another way */
+
+ bmp[ch>>3] |= (1<<(ch & 7));
+ }
+
+ return bmp;
+}
+
+BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp) {
+ if(ch > 255) {
+ return 0;
+ }
+
+ return bmp[ch>>3] & (1<<(ch & 7));
+}
\ No newline at end of file
diff -ur ..\..\parrot-cvs\parrot/rx.ops ./rx.ops
--- ..\..\parrot-cvs\parrot/rx.ops Mon Jan 14 01:19:28 2002
+++ ./rx.ops Mon Jan 14 01:31:10 2002
@@ -207,6 +207,23 @@
goto NEXT();
}
+op rx_clearinfo(inout pmc, in str) {
+ RX_dUNPACK($1);
+ rx->index=rx->startindex=0;
+ rx->flags=enum_rxflags_none;
+ rx->success=0;
+ rx->minlength=0;
+ rx->whichway=enum_rxdirection_forwards;
+
+ while(stack_depth(interpreter, rx->stack)) {
+ stack_pop(interpreter, rx->stack, NULL, STACK_ENTRY_INT);
+ }
+
+ rx->string=$2;
+
+ goto NEXT();
+}
+
########################################
=item C<rx_freeinfo>(inout pmc)
@@ -230,13 +247,26 @@
structure in another register, the stack, or a symbol table entry before calling this
opcode.
-B<XXX> Currently this op has not been implemented.
+This opcode actually creates a new stack for the new regular expression structure, but
+all other fields (including the group-related ones) stay the same. It's primarily
+used
+for things like lookaheads and lookbehinds, where the regex's state should be
+completely
+restored to the original version if the match succeeds. (Well, almost
+completely--groups
+matched with a cloned structure live on in the original.)
=cut
op rx_cloneinfo(inout pmc) {
+ rxinfo *rx2;
RX_dUNPACK($1);
+
+ rx2=rx_allocate_info(interpreter, rx->string);
+ *rx2=*rx;
+
+ rx2->stack=new_stack(interpreter);
+ $1=pmc_new(interpreter, enum_class_ParrotPointer);
+ $1->data=rx2;
+
goto NEXT();
}
@@ -376,7 +406,6 @@
op rx_pushmark(in pmc) {
RX_dUNPACK($1);
- /* Don't worry about the const * to void * cast on the next line */
stack_push(interpreter, rx->stack, (void *)&RX_MARK, STACK_ENTRY_INT, NULL);
goto NEXT();
@@ -651,8 +680,8 @@
RX_dUNPACK($1);
RxAssertMore(rx, $2);
-
- if(rx_is_word_character(RxCurChar(rx))) {
+
+ if(rx_is_word_character(interpreter, RxCurChar(rx))) {
RxAdvance(rx);
goto NEXT();
}
@@ -661,6 +690,7 @@
}
}
+
########################################
=item C<rx_is_n>(in pmc, in int)
@@ -674,7 +704,7 @@
RxAssertMore(rx, $2);
- if(rx_is_number_character(RxCurChar(rx))) {
+ if(rx_is_number_character(interpreter, RxCurChar(rx))) {
RxAdvance(rx);
goto NEXT();
}
@@ -696,7 +726,7 @@
RxAssertMore(rx, $2);
- if(rx_is_whitespace_character(RxCurChar(rx))) {
+ if(rx_is_whitespace_character(interpreter, RxCurChar(rx))) {
RxAdvance(rx);
goto NEXT();
}
@@ -722,18 +752,19 @@
op rx_oneof(in pmc, in str, in int) {
RX_dUNPACK($1);
- STRING *ch1;
- STRING *ch2;
- INTVAL i;
-
- /* XXX In the future, this ought to use bitmaps. */
-
+ Bitmap bitmap;
+
RxAssertMore(rx, $3);
- ch1=RxCurCharS(rx);
+ bitmap=make_bitmap($2);
- if(string_length($2) < 8) { /* XXX run benchmarks to find a good value */
- /* modified linear search--slow, but zero overhead */
+ if(!bitmap) {
+ INTVAL i;
+ STRING *ch1;
+ STRING *ch2;
+
+ ch1=RxCurCharS(rx);
+
for(i=0; i < string_length($2); i++) {
ch2=rxP_get_substr(interpreter, $2, i, 1);
@@ -745,41 +776,63 @@
goto OFFSET($3);
}
}
+
+ goto OFFSET($3);
}
else {
- /* binary search--fast but complicated */
- INTVAL upper, lower=0, index=0, lastindex=-1, cmp;
-
- upper=string_length($2);
-
- while(upper > lower) {
- index=(upper+lower)/2;
-
- if(index==lastindex) {
- goto OFFSET($3);
- }
- else if(index==string_length($2)) {
- goto OFFSET($3);
- }
-
- cmp=string_compare(interpreter, RxCurCharS(rx),
rxP_get_substr(interpreter, $2, index, 1));
-
- if(0==cmp) {
- RxAdvance(rx);
- goto NEXT();
- }
- else if(0 > cmp) {
- upper=index;
- }
- else {
- lower=index;
- }
-
- lastindex=index;
+ if(check_bitmap(RxCurChar(rx), bitmap)) {
+ mem_sys_free(bitmap);
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($3);
}
}
+}
+
+
+########################################
+
+=item C<rx_oneof_bmp>(in pmc, in pmc, in int)
+
+This op has the exact same behavior as C<rx_oneof>, except that the second parameter
+is a ParrotPointer to a bitmap generated by C<rx_makebmp>.
+
+=cut
+
+op rx_oneof_bmp(in pmc, in pmc, in int) {
+ RX_dUNPACK($1);
+
+ RxAssertMore(rx, $3);
- goto OFFSET($3);
+ if(check_bitmap(RxCurChar(rx), $2->data)) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($3);
+ }
+}
+
+#######################################
+
+=item C<rx_makebmp>(out pmc, in str)
+
+This op pre-generates bitmaps to be used with C<rx_oneof_bmp>, increasing performance.
+The first parameter will be set to a ParrotPointer to the bitmap; the second parameter
+is the string to be bitmapped.
+
+Note that bitmaps are currently NOT compatible with characters above 255 (as defined
+by
+whatever character set you're using). This may change in the future.
+
+=cut
+
+op rx_makebmp(out pmc, in str) {
+ $1=pmc_new(interpreter, enum_class_ParrotPointer);
+ $1->data=(void*)make_bitmap($2);
+
+ goto NEXT();
}
########################################
@@ -792,6 +845,7 @@
=cut
op rx_dot(in pmc, in int) {
+ static Bitmap bmp;
RX_dUNPACK($1);
RxAssertMore(rx, $2);
@@ -801,10 +855,11 @@
goto NEXT();
}
else {
- STRING *ch=RxCurCharS(rx);
- STRING *nl=string_make(interpreter, "\n", 1, 0, 0, 0);
+ if(!bmp) {
+ bmp=make_bitmap(cstr2pstr("\r\n"));
+ }
- if(string_compare(interpreter, ch, nl)!=0) {
+ if(!check_bitmap(RxCurChar(rx), bmp)) {
RxAdvance(rx);
goto NEXT();
}
@@ -825,14 +880,14 @@
op rx_zwa_boundary(in pmc, in int) {
RX_dUNPACK($1);
- char ch1, ch2;
+ BOOLVAL one, two;
- ch1=RxCurChar(rx);
+ one=rx_is_word_character(interpreter, RxCurChar(rx));
RxAdvanceX(rx, -1);
- ch2=RxCurChar(rx);
+ two=rx_is_word_character(interpreter, RxCurChar(rx));
RxAdvance(rx);
- if(rx_is_word_character(ch1) == rx_is_word_character(ch2)) {
+ if(one && two || !one && !two) {
goto OFFSET($2);
}
--- ..\..\parrot-cvs\parrot/t/op/rx.t Thu Jan 10 16:04:56 2002
+++ ./t/op/rx.t Mon Jan 14 01:35:16 2002
@@ -139,8 +139,6 @@
no match
OUTPUT
-TODO: {
- local $TODO="pending key fixes" if $^O eq "MSWin32";
output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'groups');
rx_startgroup P0, 0
rx_literal P0, "a", $advance
@@ -156,7 +154,6 @@
(a)
<><a><>
OUTPUT
-}
output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (success)');
rx_zwa_atbeginning P0, $advance